// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};
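/*
 * Sizing note (an illustration, assuming the common 4K sectorsize): the
 * 24-bit pgoff can address offsets up to 16M, comfortably beyond any
 * supported page size, so the page offset and uptodate state pack
 * together into a single 32-bit word next to the page pointer.
 */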
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct work_struct *work);
static void read_rebuild_work(struct work_struct *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct work_struct *work);

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
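/*
 * Worked example for the table allocation below, derived from the
 * constants above: BTRFS_STRIPE_HASH_TABLE_BITS is 11, so num_entries is
 * 1 << 11 = 2048 and struct_size() requests the table header plus 2048
 * struct btrfs_stripe_hash entries in a single allocation.
 */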
/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * caching an rbio means copying everything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page)
			continue;

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
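/*
 * Worked example of the shift above: for a full stripe starting at
 * logical address 1GiB, num = 0x40000000 and num >> 16 = 0x4000;
 * hash_64() then spreads that across the 2048 buckets.  Without the
 * shift, the mostly-zero low bits would land nearly everything in a
 * handful of buckets.
 */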
static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}

/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripe_pages[] got modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}

static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}
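/*
 * Worked example for the sector indexing above, assuming 4K sectorsize:
 * with 4K pages, sectors_per_page == 1 and stealing page_nr N flips one
 * uptodate bit; with 64K pages, sectors_per_page == 16 and one stolen
 * page covers stripe_sectors[16 * N] through stripe_sectors[16 * N + 15].
 */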
/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !full_page_sectors_uptodate(src, i))
			continue;

		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}
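/*
 * Note on the eviction above: stripe_cache is kept in LRU order.  New and
 * re-referenced rbios go to the head via list_add()/list_move(), so once
 * cache_size exceeds RBIO_CACHE_SIZE (1024) the entry pruned from the
 * tail is the least recently used one, unless that happens to be the rbio
 * just inserted.
 */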
/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
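/*
 * Illustration of the batching above, assuming the usual MAX_XOR_BLOCKS
 * of 4: with six sources the loop runs twice, xor-ing sources 0-3 into
 * dest and then sources 4-5.  finish_rmw() seeds dest with the first data
 * block and passes the remaining nr_data - 1 blocks here, which yields
 * P = D0 ^ D1 ^ ... ^ D(nr_data - 1).
 */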
/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We need to read the full stripe from the drive,
	 * check and repair the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
		int fa = last->faila;
		int fb = last->failb;
		int cur_fa = cur->faila;
		int cur_fb = cur->failb;

		if (last->faila >= last->failb) {
			fa = last->failb;
			fb = last->faila;
		}

		if (cur->faila >= cur->failb) {
			cur_fa = cur->failb;
			cur_fb = cur->faila;
		}

		if (fa != cur_fa || fb != cur_fb)
			return 0;
	}
	return 1;
}

static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}
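/*
 * Index arithmetic example for the helpers above, assuming a 64K stripe
 * with 4K sectorsize (stripe_nsectors == 16): sector 3 of stripe 2 is
 * stripe_sectors[2 * 16 + 3] == stripe_sectors[35], and the P stripe is
 * simply stripe number nr_data.
 */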
/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}


		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}
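/*
 * Illustrative caller pattern (a simplified sketch, not a quote of the
 * actual submission paths further down this file):
 *
 *	if (lock_stripe_add(rbio) == 0) {
 *		// we own the stripe lock and must kick off the IO,
 *		// e.g. by scheduling rmw_work or calling finish_rmw()
 *	}
 *	// on 1 the rbio was merged or plugged; it no longer belongs to us
 */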
/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				start_async_work(next, read_rebuild_work);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				start_async_work(next, read_rebuild_work);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_work);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_parity_work);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	kfree(rbio);
}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}
/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this rbio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	__free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;
	int max_errors;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = BLK_STS_OK;

	/* OK, we have written all the stripes we need to. */
	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
		     0 : rbio->bioc->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/**
 * Get a sector pointer specified by its @stripe_nr and @sector_nr
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripes)
 * @sector_nr:          Sector number inside the stripe,
 *                      valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock_irq(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock_irq(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock_irq(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}
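/*
 * Behaviour summary for sector_in_rbio(): on a full stripe write every
 * data sector is covered by the bio list, so the bio_sectors[] entry has
 * a page and is returned directly.  On a partial write the uncovered
 * sectors fall back to stripe_sectors[], i.e. the copies read from disk
 * (or taken from the stripe cache).
 */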
/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc,
					 u32 stripe_len)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	void *p;

	ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_sectors) * num_sectors +
		       sizeof(*rbio->stripe_sectors) * num_sectors +
		       sizeof(*rbio->finish_pointers) * real_stripes,
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bioc = bioc;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
	 * we allocated past the end of the rbio.
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
	CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
#undef CONSUME_ALLOC

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}
/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      unsigned long bio_max_len,
			      unsigned int opf)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
			opf, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> 9;
	bio->bi_private = rbio;

	bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}
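/*
 * Example of the merge check above: if the last bio in the list ends at
 * physical byte X on device D and the new sector starts at X on the same
 * device, bio_add_page() extends that bio instead of allocating a new
 * one, so a sequential full-stripe read collapses into one large bio per
 * device.
 */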
This checks 108753b381b3SDavid Woodhouse * for errors and if we're not able to read the page it'll 108853b381b3SDavid Woodhouse * trigger parity reconstruction. The rmw will be finished 108953b381b3SDavid Woodhouse * after we've reconstructed the failed stripes 109053b381b3SDavid Woodhouse */ 109153b381b3SDavid Woodhouse static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 109253b381b3SDavid Woodhouse { 109353b381b3SDavid Woodhouse if (rbio->faila >= 0 || rbio->failb >= 0) { 10942c8cdd6eSMiao Xie BUG_ON(rbio->faila == rbio->real_stripes - 1); 109553b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 109653b381b3SDavid Woodhouse } else { 109753b381b3SDavid Woodhouse finish_rmw(rbio); 109853b381b3SDavid Woodhouse } 109953b381b3SDavid Woodhouse } 110053b381b3SDavid Woodhouse 110100425dd9SQu Wenruo static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) 110200425dd9SQu Wenruo { 110300425dd9SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 110400425dd9SQu Wenruo struct bio_vec bvec; 110500425dd9SQu Wenruo struct bvec_iter iter; 110600425dd9SQu Wenruo u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 110700425dd9SQu Wenruo rbio->bioc->raid_map[0]; 110800425dd9SQu Wenruo 110900425dd9SQu Wenruo bio_for_each_segment(bvec, bio, iter) { 111000425dd9SQu Wenruo u32 bvec_offset; 111100425dd9SQu Wenruo 111200425dd9SQu Wenruo for (bvec_offset = 0; bvec_offset < bvec.bv_len; 111300425dd9SQu Wenruo bvec_offset += sectorsize, offset += sectorsize) { 111400425dd9SQu Wenruo int index = offset / sectorsize; 111500425dd9SQu Wenruo struct sector_ptr *sector = &rbio->bio_sectors[index]; 111600425dd9SQu Wenruo 111700425dd9SQu Wenruo sector->page = bvec.bv_page; 111800425dd9SQu Wenruo sector->pgoff = bvec.bv_offset + bvec_offset; 111900425dd9SQu Wenruo ASSERT(sector->pgoff < PAGE_SIZE); 112000425dd9SQu Wenruo } 112100425dd9SQu Wenruo } 112200425dd9SQu Wenruo } 112300425dd9SQu Wenruo 112453b381b3SDavid Woodhouse /* 112553b381b3SDavid Woodhouse * helper function to walk our bio list and populate the bio_pages array with 112653b381b3SDavid Woodhouse * the result. This seems expensive, but it is faster than constantly 112753b381b3SDavid Woodhouse * searching through the bio list as we setup the IO in finish_rmw or stripe 112853b381b3SDavid Woodhouse * reconstruction. 112953b381b3SDavid Woodhouse * 113053b381b3SDavid Woodhouse * This must be called before you trust the answers from page_in_rbio 113153b381b3SDavid Woodhouse */ 113253b381b3SDavid Woodhouse static void index_rbio_pages(struct btrfs_raid_bio *rbio) 113353b381b3SDavid Woodhouse { 113453b381b3SDavid Woodhouse struct bio *bio; 113553b381b3SDavid Woodhouse 113653b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 113700425dd9SQu Wenruo bio_list_for_each(bio, &rbio->bio_list) 113800425dd9SQu Wenruo index_one_bio(rbio, bio); 113900425dd9SQu Wenruo 114053b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 114153b381b3SDavid Woodhouse } 114253b381b3SDavid Woodhouse 1143b8bea09aSQu Wenruo static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, 1144b8bea09aSQu Wenruo struct raid56_bio_trace_info *trace_info) 1145b8bea09aSQu Wenruo { 1146b8bea09aSQu Wenruo const struct btrfs_io_context *bioc = rbio->bioc; 1147b8bea09aSQu Wenruo int i; 1148b8bea09aSQu Wenruo 1149b8bea09aSQu Wenruo ASSERT(bioc); 1150b8bea09aSQu Wenruo 1151b8bea09aSQu Wenruo /* We rely on bio->bi_bdev to find the stripe number. 
*/ 1152b8bea09aSQu Wenruo if (!bio->bi_bdev) 1153b8bea09aSQu Wenruo goto not_found; 1154b8bea09aSQu Wenruo 1155b8bea09aSQu Wenruo for (i = 0; i < bioc->num_stripes; i++) { 1156b8bea09aSQu Wenruo if (bio->bi_bdev != bioc->stripes[i].dev->bdev) 1157b8bea09aSQu Wenruo continue; 1158b8bea09aSQu Wenruo trace_info->stripe_nr = i; 1159b8bea09aSQu Wenruo trace_info->devid = bioc->stripes[i].dev->devid; 1160b8bea09aSQu Wenruo trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1161b8bea09aSQu Wenruo bioc->stripes[i].physical; 1162b8bea09aSQu Wenruo return; 1163b8bea09aSQu Wenruo } 1164b8bea09aSQu Wenruo 1165b8bea09aSQu Wenruo not_found: 1166b8bea09aSQu Wenruo trace_info->devid = -1; 1167b8bea09aSQu Wenruo trace_info->offset = -1; 1168b8bea09aSQu Wenruo trace_info->stripe_nr = -1; 1169b8bea09aSQu Wenruo } 1170b8bea09aSQu Wenruo 117153b381b3SDavid Woodhouse /* 117253b381b3SDavid Woodhouse * this is called from one of two situations. We either 117353b381b3SDavid Woodhouse * have a full stripe from the higher layers, or we've read all 117453b381b3SDavid Woodhouse * the missing bits off disk. 117553b381b3SDavid Woodhouse * 117653b381b3SDavid Woodhouse * This will calculate the parity and then send down any 117753b381b3SDavid Woodhouse * changed blocks. 117853b381b3SDavid Woodhouse */ 117953b381b3SDavid Woodhouse static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 118053b381b3SDavid Woodhouse { 11814c664611SQu Wenruo struct btrfs_io_context *bioc = rbio->bioc; 11821145059aSQu Wenruo const u32 sectorsize = bioc->fs_info->sectorsize; 11831389053eSKees Cook void **pointers = rbio->finish_pointers; 118453b381b3SDavid Woodhouse int nr_data = rbio->nr_data; 118536920044SQu Wenruo /* The total sector number inside the full stripe. */ 118636920044SQu Wenruo int total_sector_nr; 118753b381b3SDavid Woodhouse int stripe; 118836920044SQu Wenruo /* Sector number inside a stripe. */ 11893e77605dSQu Wenruo int sectornr; 1190c17af965SDavid Sterba bool has_qstripe; 119153b381b3SDavid Woodhouse struct bio_list bio_list; 119253b381b3SDavid Woodhouse struct bio *bio; 119353b381b3SDavid Woodhouse int ret; 119453b381b3SDavid Woodhouse 119553b381b3SDavid Woodhouse bio_list_init(&bio_list); 119653b381b3SDavid Woodhouse 1197c17af965SDavid Sterba if (rbio->real_stripes - rbio->nr_data == 1) 1198c17af965SDavid Sterba has_qstripe = false; 1199c17af965SDavid Sterba else if (rbio->real_stripes - rbio->nr_data == 2) 1200c17af965SDavid Sterba has_qstripe = true; 1201c17af965SDavid Sterba else 120253b381b3SDavid Woodhouse BUG(); 120353b381b3SDavid Woodhouse 1204bd8f7e62SQu Wenruo /* We should have at least one data sector. */ 1205bd8f7e62SQu Wenruo ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); 1206bd8f7e62SQu Wenruo 120753b381b3SDavid Woodhouse /* at this point we either have a full stripe, 120853b381b3SDavid Woodhouse * or we've read the full stripe from the drive. 120953b381b3SDavid Woodhouse * recalculate the parity and write the new results. 121053b381b3SDavid Woodhouse * 121153b381b3SDavid Woodhouse * We're not allowed to add any new bios to the 121253b381b3SDavid Woodhouse * bio list here, anyone else that wants to 121353b381b3SDavid Woodhouse * change this stripe needs to do their own rmw. 
121453b381b3SDavid Woodhouse */ 121553b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 121653b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 121753b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 121853b381b3SDavid Woodhouse 1219b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 122053b381b3SDavid Woodhouse 122153b381b3SDavid Woodhouse /* 122253b381b3SDavid Woodhouse * now that we've set rmw_locked, run through the 122353b381b3SDavid Woodhouse * bio list one last time and map the page pointers 12244ae10b3aSChris Mason * 12254ae10b3aSChris Mason * We don't cache full rbios because we're assuming 12264ae10b3aSChris Mason * the higher layers are unlikely to use this area of 12274ae10b3aSChris Mason * the disk again soon. If they do use it again, 12284ae10b3aSChris Mason * hopefully they will send another full bio. 122953b381b3SDavid Woodhouse */ 123053b381b3SDavid Woodhouse index_rbio_pages(rbio); 12314ae10b3aSChris Mason if (!rbio_is_full(rbio)) 12324ae10b3aSChris Mason cache_rbio_pages(rbio); 12334ae10b3aSChris Mason else 12344ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 123553b381b3SDavid Woodhouse 12363e77605dSQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 12371145059aSQu Wenruo struct sector_ptr *sector; 12381145059aSQu Wenruo 12391145059aSQu Wenruo /* First collect one sector from each data stripe */ 124053b381b3SDavid Woodhouse for (stripe = 0; stripe < nr_data; stripe++) { 12411145059aSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 0); 12421145059aSQu Wenruo pointers[stripe] = kmap_local_page(sector->page) + 12431145059aSQu Wenruo sector->pgoff; 124453b381b3SDavid Woodhouse } 124553b381b3SDavid Woodhouse 12461145059aSQu Wenruo /* Then add the parity stripe */ 12471145059aSQu Wenruo sector = rbio_pstripe_sector(rbio, sectornr); 12481145059aSQu Wenruo sector->uptodate = 1; 12491145059aSQu Wenruo pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; 125053b381b3SDavid Woodhouse 1251c17af965SDavid Sterba if (has_qstripe) { 125253b381b3SDavid Woodhouse /* 12531145059aSQu Wenruo * RAID6, add the qstripe and call the library function 12541145059aSQu Wenruo * to fill in our p/q 125553b381b3SDavid Woodhouse */ 12561145059aSQu Wenruo sector = rbio_qstripe_sector(rbio, sectornr); 12571145059aSQu Wenruo sector->uptodate = 1; 12581145059aSQu Wenruo pointers[stripe++] = kmap_local_page(sector->page) + 12591145059aSQu Wenruo sector->pgoff; 126053b381b3SDavid Woodhouse 12611145059aSQu Wenruo raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 126253b381b3SDavid Woodhouse pointers); 126353b381b3SDavid Woodhouse } else { 126453b381b3SDavid Woodhouse /* raid5 */ 12651145059aSQu Wenruo memcpy(pointers[nr_data], pointers[0], sectorsize); 12661145059aSQu Wenruo run_xor(pointers + 1, nr_data - 1, sectorsize); 126753b381b3SDavid Woodhouse } 126894a0b58dSIra Weiny for (stripe = stripe - 1; stripe >= 0; stripe--) 126994a0b58dSIra Weiny kunmap_local(pointers[stripe]); 127053b381b3SDavid Woodhouse } 127153b381b3SDavid Woodhouse 127253b381b3SDavid Woodhouse /* 127336920044SQu Wenruo * Start writing. Make bios for everything from the higher layers (the 127436920044SQu Wenruo * bio_list in our rbio) and our P/Q. Ignore everything else. 
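 *
 * (Illustrative aside, an editorial addition: for each vertical stripe of
 *  sectorsize bytes, with d[i] the nr_data data sectors, the loop above
 *  computed
 *
 *	P = d[0] ^ d[1] ^ ... ^ d[nr_data - 1]
 *
 *  and, for RAID6, the Q syndrome over GF(2^8) via raid6_call.gen_syndrome():
 *
 *	Q = g^0*d[0] ^ g^1*d[1] ^ ... ^ g^(nr_data - 1)*d[nr_data - 1].)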
127553b381b3SDavid Woodhouse */ 127636920044SQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 127736920044SQu Wenruo total_sector_nr++) { 12783e77605dSQu Wenruo struct sector_ptr *sector; 12793e77605dSQu Wenruo 128036920044SQu Wenruo stripe = total_sector_nr / rbio->stripe_nsectors; 128136920044SQu Wenruo sectornr = total_sector_nr % rbio->stripe_nsectors; 128236920044SQu Wenruo 1283bd8f7e62SQu Wenruo /* This vertical stripe has no data, skip it. */ 1284bd8f7e62SQu Wenruo if (!test_bit(sectornr, &rbio->dbitmap)) 1285bd8f7e62SQu Wenruo continue; 1286bd8f7e62SQu Wenruo 128753b381b3SDavid Woodhouse if (stripe < rbio->nr_data) { 12883e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 12893e77605dSQu Wenruo if (!sector) 129053b381b3SDavid Woodhouse continue; 129153b381b3SDavid Woodhouse } else { 12923e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 129353b381b3SDavid Woodhouse } 129453b381b3SDavid Woodhouse 12953e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 12963e77605dSQu Wenruo sectornr, rbio->stripe_len, 1297e01bf588SChristoph Hellwig REQ_OP_WRITE); 129853b381b3SDavid Woodhouse if (ret) 129953b381b3SDavid Woodhouse goto cleanup; 130053b381b3SDavid Woodhouse } 130153b381b3SDavid Woodhouse 13024c664611SQu Wenruo if (likely(!bioc->num_tgtdevs)) 13032c8cdd6eSMiao Xie goto write_data; 13042c8cdd6eSMiao Xie 130536920044SQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 130636920044SQu Wenruo total_sector_nr++) { 13073e77605dSQu Wenruo struct sector_ptr *sector; 13083e77605dSQu Wenruo 130936920044SQu Wenruo stripe = total_sector_nr / rbio->stripe_nsectors; 131036920044SQu Wenruo sectornr = total_sector_nr % rbio->stripe_nsectors; 131136920044SQu Wenruo 131236920044SQu Wenruo if (!bioc->tgtdev_map[stripe]) { 131336920044SQu Wenruo /* 131436920044SQu Wenruo * We can skip the whole stripe completely, note 131536920044SQu Wenruo * total_sector_nr will be increased by one anyway. 131636920044SQu Wenruo */ 131736920044SQu Wenruo ASSERT(sectornr == 0); 131836920044SQu Wenruo total_sector_nr += rbio->stripe_nsectors - 1; 131936920044SQu Wenruo continue; 132036920044SQu Wenruo } 132136920044SQu Wenruo 1322bd8f7e62SQu Wenruo /* This vertical stripe has no data, skip it. 
*/ 1323bd8f7e62SQu Wenruo if (!test_bit(sectornr, &rbio->dbitmap)) 1324bd8f7e62SQu Wenruo continue; 1325bd8f7e62SQu Wenruo 13262c8cdd6eSMiao Xie if (stripe < rbio->nr_data) { 13273e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 13283e77605dSQu Wenruo if (!sector) 13292c8cdd6eSMiao Xie continue; 13302c8cdd6eSMiao Xie } else { 13313e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 13322c8cdd6eSMiao Xie } 13332c8cdd6eSMiao Xie 13343e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 13354c664611SQu Wenruo rbio->bioc->tgtdev_map[stripe], 13363e77605dSQu Wenruo sectornr, rbio->stripe_len, 1337e01bf588SChristoph Hellwig REQ_OP_WRITE); 13382c8cdd6eSMiao Xie if (ret) 13392c8cdd6eSMiao Xie goto cleanup; 13402c8cdd6eSMiao Xie } 13412c8cdd6eSMiao Xie 13422c8cdd6eSMiao Xie write_data: 1343b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1344b89e1b01SMiao Xie BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 134553b381b3SDavid Woodhouse 1346bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 134753b381b3SDavid Woodhouse bio->bi_end_io = raid_write_end_io; 13484e49ea4aSMike Christie 1349b8bea09aSQu Wenruo if (trace_raid56_write_stripe_enabled()) { 1350b8bea09aSQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 1351b8bea09aSQu Wenruo 1352b8bea09aSQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 1353b8bea09aSQu Wenruo trace_raid56_write_stripe(rbio, bio, &trace_info); 1354b8bea09aSQu Wenruo } 13554e49ea4aSMike Christie submit_bio(bio); 135653b381b3SDavid Woodhouse } 135753b381b3SDavid Woodhouse return; 135853b381b3SDavid Woodhouse 135953b381b3SDavid Woodhouse cleanup: 136058efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 1361785884fcSLiu Bo 1362785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 1363785884fcSLiu Bo bio_put(bio); 136453b381b3SDavid Woodhouse } 136553b381b3SDavid Woodhouse 136653b381b3SDavid Woodhouse /* 136753b381b3SDavid Woodhouse * helper to find the stripe number for a given bio. Used to figure out which 136853b381b3SDavid Woodhouse * stripe has failed. This expects the bio to correspond to a physical disk, 136953b381b3SDavid Woodhouse * so it looks up based on physical sector numbers. 137053b381b3SDavid Woodhouse */ 137153b381b3SDavid Woodhouse static int find_bio_stripe(struct btrfs_raid_bio *rbio, 137253b381b3SDavid Woodhouse struct bio *bio) 137353b381b3SDavid Woodhouse { 13744f024f37SKent Overstreet u64 physical = bio->bi_iter.bi_sector; 137553b381b3SDavid Woodhouse int i; 13764c664611SQu Wenruo struct btrfs_io_stripe *stripe; 137753b381b3SDavid Woodhouse 137853b381b3SDavid Woodhouse physical <<= 9; 137953b381b3SDavid Woodhouse 13804c664611SQu Wenruo for (i = 0; i < rbio->bioc->num_stripes; i++) { 13814c664611SQu Wenruo stripe = &rbio->bioc->stripes[i]; 138283025863SNikolay Borisov if (in_range(physical, stripe->physical, rbio->stripe_len) && 1383309dca30SChristoph Hellwig stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { 138453b381b3SDavid Woodhouse return i; 138553b381b3SDavid Woodhouse } 138653b381b3SDavid Woodhouse } 138753b381b3SDavid Woodhouse return -1; 138853b381b3SDavid Woodhouse } 138953b381b3SDavid Woodhouse 139053b381b3SDavid Woodhouse /* 139153b381b3SDavid Woodhouse * helper to find the stripe number for a given 139253b381b3SDavid Woodhouse * bio (before mapping). Used to figure out which stripe has 139353b381b3SDavid Woodhouse * failed. This looks up based on logical block numbers. 
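 *
 * (Hypothetical example, not from the source: with two data stripes,
 *  raid_map[] = { X, X + 64K } and stripe_len = 64K, a bio whose logical
 *  address is X + 70K falls in_range() of stripe 1.)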
139453b381b3SDavid Woodhouse */ 139553b381b3SDavid Woodhouse static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 139653b381b3SDavid Woodhouse struct bio *bio) 139753b381b3SDavid Woodhouse { 13981201b58bSDavid Sterba u64 logical = bio->bi_iter.bi_sector << 9; 139953b381b3SDavid Woodhouse int i; 140053b381b3SDavid Woodhouse 140153b381b3SDavid Woodhouse for (i = 0; i < rbio->nr_data; i++) { 14024c664611SQu Wenruo u64 stripe_start = rbio->bioc->raid_map[i]; 140383025863SNikolay Borisov 140483025863SNikolay Borisov if (in_range(logical, stripe_start, rbio->stripe_len)) 140553b381b3SDavid Woodhouse return i; 140653b381b3SDavid Woodhouse } 140753b381b3SDavid Woodhouse return -1; 140853b381b3SDavid Woodhouse } 140953b381b3SDavid Woodhouse 141053b381b3SDavid Woodhouse /* 141153b381b3SDavid Woodhouse * returns -EIO if we had too many failures 141253b381b3SDavid Woodhouse */ 141353b381b3SDavid Woodhouse static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 141453b381b3SDavid Woodhouse { 141553b381b3SDavid Woodhouse unsigned long flags; 141653b381b3SDavid Woodhouse int ret = 0; 141753b381b3SDavid Woodhouse 141853b381b3SDavid Woodhouse spin_lock_irqsave(&rbio->bio_list_lock, flags); 141953b381b3SDavid Woodhouse 142053b381b3SDavid Woodhouse /* we already know this stripe is bad, move on */ 142153b381b3SDavid Woodhouse if (rbio->faila == failed || rbio->failb == failed) 142253b381b3SDavid Woodhouse goto out; 142353b381b3SDavid Woodhouse 142453b381b3SDavid Woodhouse if (rbio->faila == -1) { 142553b381b3SDavid Woodhouse /* first failure on this rbio */ 142653b381b3SDavid Woodhouse rbio->faila = failed; 1427b89e1b01SMiao Xie atomic_inc(&rbio->error); 142853b381b3SDavid Woodhouse } else if (rbio->failb == -1) { 142953b381b3SDavid Woodhouse /* second failure on this rbio */ 143053b381b3SDavid Woodhouse rbio->failb = failed; 1431b89e1b01SMiao Xie atomic_inc(&rbio->error); 143253b381b3SDavid Woodhouse } else { 143353b381b3SDavid Woodhouse ret = -EIO; 143453b381b3SDavid Woodhouse } 143553b381b3SDavid Woodhouse out: 143653b381b3SDavid Woodhouse spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 143753b381b3SDavid Woodhouse 143853b381b3SDavid Woodhouse return ret; 143953b381b3SDavid Woodhouse } 144053b381b3SDavid Woodhouse 144153b381b3SDavid Woodhouse /* 144253b381b3SDavid Woodhouse * helper to fail a stripe based on a physical disk 144353b381b3SDavid Woodhouse * bio. 144453b381b3SDavid Woodhouse */ 144553b381b3SDavid Woodhouse static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 144653b381b3SDavid Woodhouse struct bio *bio) 144753b381b3SDavid Woodhouse { 144853b381b3SDavid Woodhouse int failed = find_bio_stripe(rbio, bio); 144953b381b3SDavid Woodhouse 145053b381b3SDavid Woodhouse if (failed < 0) 145153b381b3SDavid Woodhouse return -EIO; 145253b381b3SDavid Woodhouse 145353b381b3SDavid Woodhouse return fail_rbio_index(rbio, failed); 145453b381b3SDavid Woodhouse } 145553b381b3SDavid Woodhouse 145653b381b3SDavid Woodhouse /* 14575fdb7afcSQu Wenruo * For subpage case, we can no longer set page Uptodate directly for 14585fdb7afcSQu Wenruo * stripe_pages[], thus we need to locate the sector. 
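 *
 * (Design note, an editorial addition: the linear scan below is cheap
 *  because nr_sectors stays small, e.g. a 64K stripe with 4K sectors on a
 *  6-device array gives only 6 * 16 = 96 entries.)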
14595fdb7afcSQu Wenruo */ 14605fdb7afcSQu Wenruo static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, 14615fdb7afcSQu Wenruo struct page *page, 14625fdb7afcSQu Wenruo unsigned int pgoff) 14635fdb7afcSQu Wenruo { 14645fdb7afcSQu Wenruo int i; 14655fdb7afcSQu Wenruo 14665fdb7afcSQu Wenruo for (i = 0; i < rbio->nr_sectors; i++) { 14675fdb7afcSQu Wenruo struct sector_ptr *sector = &rbio->stripe_sectors[i]; 14685fdb7afcSQu Wenruo 14695fdb7afcSQu Wenruo if (sector->page == page && sector->pgoff == pgoff) 14705fdb7afcSQu Wenruo return sector; 14715fdb7afcSQu Wenruo } 14725fdb7afcSQu Wenruo return NULL; 14735fdb7afcSQu Wenruo } 14745fdb7afcSQu Wenruo 14755fdb7afcSQu Wenruo /* 147653b381b3SDavid Woodhouse * this sets each page in the bio uptodate. It should only be used on private 147753b381b3SDavid Woodhouse * rbio pages, nothing that comes in from the higher layers 147853b381b3SDavid Woodhouse */ 14795fdb7afcSQu Wenruo static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) 148053b381b3SDavid Woodhouse { 14815fdb7afcSQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 14820198e5b7SLiu Bo struct bio_vec *bvec; 14836dc4f100SMing Lei struct bvec_iter_all iter_all; 148453b381b3SDavid Woodhouse 14850198e5b7SLiu Bo ASSERT(!bio_flagged(bio, BIO_CLONED)); 14866592e58cSFilipe Manana 14875fdb7afcSQu Wenruo bio_for_each_segment_all(bvec, bio, iter_all) { 14885fdb7afcSQu Wenruo struct sector_ptr *sector; 14895fdb7afcSQu Wenruo int pgoff; 14905fdb7afcSQu Wenruo 14915fdb7afcSQu Wenruo for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; 14925fdb7afcSQu Wenruo pgoff += sectorsize) { 14935fdb7afcSQu Wenruo sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); 14945fdb7afcSQu Wenruo ASSERT(sector); 14955fdb7afcSQu Wenruo if (sector) 14965fdb7afcSQu Wenruo sector->uptodate = 1; 14975fdb7afcSQu Wenruo } 14985fdb7afcSQu Wenruo } 149953b381b3SDavid Woodhouse } 150053b381b3SDavid Woodhouse 1501d34e123dSChristoph Hellwig static void raid56_bio_end_io(struct bio *bio) 150253b381b3SDavid Woodhouse { 150353b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 150453b381b3SDavid Woodhouse 15054e4cbee9SChristoph Hellwig if (bio->bi_status) 150653b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 150753b381b3SDavid Woodhouse else 15085fdb7afcSQu Wenruo set_bio_pages_uptodate(rbio, bio); 150953b381b3SDavid Woodhouse 151053b381b3SDavid Woodhouse bio_put(bio); 151153b381b3SDavid Woodhouse 1512d34e123dSChristoph Hellwig if (atomic_dec_and_test(&rbio->stripes_pending)) 1513d34e123dSChristoph Hellwig queue_work(rbio->bioc->fs_info->endio_raid56_workers, 1514d34e123dSChristoph Hellwig &rbio->end_io_work); 1515d34e123dSChristoph Hellwig } 151653b381b3SDavid Woodhouse 151753b381b3SDavid Woodhouse /* 1518d34e123dSChristoph Hellwig * End io handler for the read phase of the RMW cycle. All the bios here are 1519d34e123dSChristoph Hellwig * physical stripe bios we've read from the disk so we can recalculate the 1520d34e123dSChristoph Hellwig * parity of the stripe. 
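 *
 * (Pipeline sketch, an editorial addition: raid56_rmw_stripe() submits the
 *  reads, raid56_bio_end_io() counts completions and queues this work item,
 *  which then calls validate_rbio_for_rmw() below.)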
1521d34e123dSChristoph Hellwig * 1522d34e123dSChristoph Hellwig * This will usually kick off finish_rmw once all the bios are read in, but it 1523d34e123dSChristoph Hellwig * may trigger parity reconstruction if we had any errors along the way 1524d34e123dSChristoph Hellwig */ 1525d34e123dSChristoph Hellwig static void raid56_rmw_end_io_work(struct work_struct *work) 1526d34e123dSChristoph Hellwig { 1527d34e123dSChristoph Hellwig struct btrfs_raid_bio *rbio = 1528d34e123dSChristoph Hellwig container_of(work, struct btrfs_raid_bio, end_io_work); 1529d34e123dSChristoph Hellwig 1530d34e123dSChristoph Hellwig if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { 1531d34e123dSChristoph Hellwig rbio_orig_end_io(rbio, BLK_STS_IOERR); 1532d34e123dSChristoph Hellwig return; 1533d34e123dSChristoph Hellwig } 1534d34e123dSChristoph Hellwig 1535d34e123dSChristoph Hellwig /* 1536d34e123dSChristoph Hellwig * This will normally call finish_rmw to start our write but if there 1537d34e123dSChristoph Hellwig * are any failed stripes we'll reconstruct from parity first. 153853b381b3SDavid Woodhouse */ 153953b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 154053b381b3SDavid Woodhouse } 154153b381b3SDavid Woodhouse 154253b381b3SDavid Woodhouse /* 154353b381b3SDavid Woodhouse * the stripe must be locked by the caller. It will 154453b381b3SDavid Woodhouse * unlock after all the writes are done 154553b381b3SDavid Woodhouse */ 154653b381b3SDavid Woodhouse static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 154753b381b3SDavid Woodhouse { 154853b381b3SDavid Woodhouse int bios_to_read = 0; 154953b381b3SDavid Woodhouse struct bio_list bio_list; 155053b381b3SDavid Woodhouse int ret; 15513e77605dSQu Wenruo int sectornr; 155253b381b3SDavid Woodhouse int stripe; 155353b381b3SDavid Woodhouse struct bio *bio; 155453b381b3SDavid Woodhouse 155553b381b3SDavid Woodhouse bio_list_init(&bio_list); 155653b381b3SDavid Woodhouse 155753b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 155853b381b3SDavid Woodhouse if (ret) 155953b381b3SDavid Woodhouse goto cleanup; 156053b381b3SDavid Woodhouse 156153b381b3SDavid Woodhouse index_rbio_pages(rbio); 156253b381b3SDavid Woodhouse 1563b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 156453b381b3SDavid Woodhouse /* 156553b381b3SDavid Woodhouse * build a list of bios to read all the missing parts of this 156653b381b3SDavid Woodhouse * stripe 156753b381b3SDavid Woodhouse */ 156853b381b3SDavid Woodhouse for (stripe = 0; stripe < rbio->nr_data; stripe++) { 15693e77605dSQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 15703e77605dSQu Wenruo struct sector_ptr *sector; 15713e77605dSQu Wenruo 157253b381b3SDavid Woodhouse /* 15733e77605dSQu Wenruo * We want to find all the sectors missing from the 15743e77605dSQu Wenruo * rbio and read them from the disk. If * sector_in_rbio() 15753e77605dSQu Wenruo * finds a page in the bio list we don't need to read 15763e77605dSQu Wenruo * it off the stripe. 157753b381b3SDavid Woodhouse */ 15783e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 15793e77605dSQu Wenruo if (sector) 158053b381b3SDavid Woodhouse continue; 158153b381b3SDavid Woodhouse 15823e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 15834ae10b3aSChris Mason /* 15843e77605dSQu Wenruo * The bio cache may have handed us an uptodate page. 15853e77605dSQu Wenruo * If so, be happy and use it. 
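 *
 * (Editorial note: a sector is flagged uptodate either by
 *  set_bio_pages_uptodate() when an earlier read completed, or by
 *  cache_rbio_pages() preserving the results of a previous RMW on the same
 *  full stripe.)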
15864ae10b3aSChris Mason */ 15873e77605dSQu Wenruo if (sector->uptodate) 15884ae10b3aSChris Mason continue; 15894ae10b3aSChris Mason 15903e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 15913e77605dSQu Wenruo stripe, sectornr, rbio->stripe_len, 1592e01bf588SChristoph Hellwig REQ_OP_READ); 159353b381b3SDavid Woodhouse if (ret) 159453b381b3SDavid Woodhouse goto cleanup; 159553b381b3SDavid Woodhouse } 159653b381b3SDavid Woodhouse } 159753b381b3SDavid Woodhouse 159853b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 159953b381b3SDavid Woodhouse if (!bios_to_read) { 160053b381b3SDavid Woodhouse /* 160153b381b3SDavid Woodhouse * this can happen if others have merged with 160253b381b3SDavid Woodhouse * us, it means there is nothing left to read. 160353b381b3SDavid Woodhouse * But if there are missing devices it may not be 160453b381b3SDavid Woodhouse * safe to do the full stripe write yet. 160553b381b3SDavid Woodhouse */ 160653b381b3SDavid Woodhouse goto finish; 160753b381b3SDavid Woodhouse } 160853b381b3SDavid Woodhouse 160953b381b3SDavid Woodhouse /* 16104c664611SQu Wenruo * The bioc may be freed once we submit the last bio. Make sure not to 16114c664611SQu Wenruo * touch it after that. 161253b381b3SDavid Woodhouse */ 1613b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 1614d34e123dSChristoph Hellwig INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); 1615bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 1616d34e123dSChristoph Hellwig bio->bi_end_io = raid56_bio_end_io; 161753b381b3SDavid Woodhouse 1618b8bea09aSQu Wenruo if (trace_raid56_read_partial_enabled()) { 1619b8bea09aSQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 1620b8bea09aSQu Wenruo 1621b8bea09aSQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 1622b8bea09aSQu Wenruo trace_raid56_read_partial(rbio, bio, &trace_info); 1623b8bea09aSQu Wenruo } 16244e49ea4aSMike Christie submit_bio(bio); 162553b381b3SDavid Woodhouse } 162653b381b3SDavid Woodhouse /* the actual write will happen once the reads are done */ 162753b381b3SDavid Woodhouse return 0; 162853b381b3SDavid Woodhouse 162953b381b3SDavid Woodhouse cleanup: 163058efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 1631785884fcSLiu Bo 1632785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 1633785884fcSLiu Bo bio_put(bio); 1634785884fcSLiu Bo 163553b381b3SDavid Woodhouse return -EIO; 163653b381b3SDavid Woodhouse 163753b381b3SDavid Woodhouse finish: 163853b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 163953b381b3SDavid Woodhouse return 0; 164053b381b3SDavid Woodhouse } 164153b381b3SDavid Woodhouse 164253b381b3SDavid Woodhouse /* 164353b381b3SDavid Woodhouse * if the upper layers pass in a full stripe, we thank them by only allocating 164453b381b3SDavid Woodhouse * enough pages to hold the parity, and sending it all down quickly. 
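 *
 * (Editorial aside: alloc_rbio_parity_pages() below only backs the
 *  real_stripes - nr_data parity stripes with newly allocated pages; the
 *  data sectors keep pointing at the caller's bio pages.)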
164553b381b3SDavid Woodhouse */ 164653b381b3SDavid Woodhouse static int full_stripe_write(struct btrfs_raid_bio *rbio) 164753b381b3SDavid Woodhouse { 164853b381b3SDavid Woodhouse int ret; 164953b381b3SDavid Woodhouse 165053b381b3SDavid Woodhouse ret = alloc_rbio_parity_pages(rbio); 16513cd846d1SMiao Xie if (ret) { 16523cd846d1SMiao Xie __free_raid_bio(rbio); 165353b381b3SDavid Woodhouse return ret; 16543cd846d1SMiao Xie } 165553b381b3SDavid Woodhouse 165653b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 165753b381b3SDavid Woodhouse if (ret == 0) 165853b381b3SDavid Woodhouse finish_rmw(rbio); 165953b381b3SDavid Woodhouse return 0; 166053b381b3SDavid Woodhouse } 166153b381b3SDavid Woodhouse 166253b381b3SDavid Woodhouse /* 166353b381b3SDavid Woodhouse * partial stripe writes get handed over to async helpers. 166453b381b3SDavid Woodhouse * We're really hoping to merge a few more writes into this 166553b381b3SDavid Woodhouse * rbio before calculating new parity 166653b381b3SDavid Woodhouse */ 166753b381b3SDavid Woodhouse static int partial_stripe_write(struct btrfs_raid_bio *rbio) 166853b381b3SDavid Woodhouse { 166953b381b3SDavid Woodhouse int ret; 167053b381b3SDavid Woodhouse 167153b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 167253b381b3SDavid Woodhouse if (ret == 0) 1673cf6a4a75SDavid Sterba start_async_work(rbio, rmw_work); 167453b381b3SDavid Woodhouse return 0; 167553b381b3SDavid Woodhouse } 167653b381b3SDavid Woodhouse 167753b381b3SDavid Woodhouse /* 167853b381b3SDavid Woodhouse * sometimes while we were reading from the drive to 167953b381b3SDavid Woodhouse * recalculate parity, enough new bios come in to create 168053b381b3SDavid Woodhouse * a full stripe. So we do a check here to see if we can 168153b381b3SDavid Woodhouse * go directly to finish_rmw 168253b381b3SDavid Woodhouse */ 168353b381b3SDavid Woodhouse static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 168453b381b3SDavid Woodhouse { 168553b381b3SDavid Woodhouse /* head off into rmw land if we don't have a full stripe */ 168653b381b3SDavid Woodhouse if (!rbio_is_full(rbio)) 168753b381b3SDavid Woodhouse return partial_stripe_write(rbio); 168853b381b3SDavid Woodhouse return full_stripe_write(rbio); 168953b381b3SDavid Woodhouse } 169053b381b3SDavid Woodhouse 169153b381b3SDavid Woodhouse /* 16926ac0f488SChris Mason * We use plugging callbacks to collect full stripes. 16936ac0f488SChris Mason * Any time we get a partial stripe write while plugged 16946ac0f488SChris Mason * we collect it into a list. When the unplug comes down, 16956ac0f488SChris Mason * we sort the list by logical block number and merge 16966ac0f488SChris Mason * everything we can into the same rbios 16976ac0f488SChris Mason */ 16986ac0f488SChris Mason struct btrfs_plug_cb { 16996ac0f488SChris Mason struct blk_plug_cb cb; 17006ac0f488SChris Mason struct btrfs_fs_info *info; 17016ac0f488SChris Mason struct list_head rbio_list; 1702385de0efSChristoph Hellwig struct work_struct work; 17036ac0f488SChris Mason }; 17046ac0f488SChris Mason 17056ac0f488SChris Mason /* 17066ac0f488SChris Mason * rbios on the plug list are sorted for easier merging.
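 *
 * (Hypothetical example: partial writes plugged at logical offsets 192K, 0
 *  and 64K are resorted by plug_cmp() to 0, 64K, 192K, so writes belonging
 *  to the same full stripe end up adjacent and can be merged by run_plug().)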
17076ac0f488SChris Mason */ 17084f0f586bSSami Tolvanen static int plug_cmp(void *priv, const struct list_head *a, 17094f0f586bSSami Tolvanen const struct list_head *b) 17106ac0f488SChris Mason { 1711214cc184SDavid Sterba const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 17126ac0f488SChris Mason plug_list); 1713214cc184SDavid Sterba const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 17146ac0f488SChris Mason plug_list); 17154f024f37SKent Overstreet u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 17164f024f37SKent Overstreet u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 17176ac0f488SChris Mason 17186ac0f488SChris Mason if (a_sector < b_sector) 17196ac0f488SChris Mason return -1; 17206ac0f488SChris Mason if (a_sector > b_sector) 17216ac0f488SChris Mason return 1; 17226ac0f488SChris Mason return 0; 17236ac0f488SChris Mason } 17246ac0f488SChris Mason 17256ac0f488SChris Mason static void run_plug(struct btrfs_plug_cb *plug) 17266ac0f488SChris Mason { 17276ac0f488SChris Mason struct btrfs_raid_bio *cur; 17286ac0f488SChris Mason struct btrfs_raid_bio *last = NULL; 17296ac0f488SChris Mason 17306ac0f488SChris Mason /* 17316ac0f488SChris Mason * sort our plug list then try to merge 17326ac0f488SChris Mason * everything we can in hopes of creating full 17336ac0f488SChris Mason * stripes. 17346ac0f488SChris Mason */ 17356ac0f488SChris Mason list_sort(NULL, &plug->rbio_list, plug_cmp); 17366ac0f488SChris Mason while (!list_empty(&plug->rbio_list)) { 17376ac0f488SChris Mason cur = list_entry(plug->rbio_list.next, 17386ac0f488SChris Mason struct btrfs_raid_bio, plug_list); 17396ac0f488SChris Mason list_del_init(&cur->plug_list); 17406ac0f488SChris Mason 17416ac0f488SChris Mason if (rbio_is_full(cur)) { 1742c7b562c5SDavid Sterba int ret; 1743c7b562c5SDavid Sterba 17446ac0f488SChris Mason /* we have a full stripe, send it down */ 1745c7b562c5SDavid Sterba ret = full_stripe_write(cur); 1746c7b562c5SDavid Sterba BUG_ON(ret); 17476ac0f488SChris Mason continue; 17486ac0f488SChris Mason } 17496ac0f488SChris Mason if (last) { 17506ac0f488SChris Mason if (rbio_can_merge(last, cur)) { 17516ac0f488SChris Mason merge_rbio(last, cur); 17526ac0f488SChris Mason __free_raid_bio(cur); 17536ac0f488SChris Mason continue; 17546ac0f488SChris Mason 17556ac0f488SChris Mason } 17566ac0f488SChris Mason __raid56_parity_write(last); 17576ac0f488SChris Mason } 17586ac0f488SChris Mason last = cur; 17596ac0f488SChris Mason } 17606ac0f488SChris Mason if (last) { 17616ac0f488SChris Mason __raid56_parity_write(last); 17626ac0f488SChris Mason } 17636ac0f488SChris Mason kfree(plug); 17646ac0f488SChris Mason } 17656ac0f488SChris Mason 17666ac0f488SChris Mason /* 17676ac0f488SChris Mason * if the unplug comes from schedule, we have to push the 17686ac0f488SChris Mason * work off to a helper thread 17696ac0f488SChris Mason */ 1770385de0efSChristoph Hellwig static void unplug_work(struct work_struct *work) 17716ac0f488SChris Mason { 17726ac0f488SChris Mason struct btrfs_plug_cb *plug; 17736ac0f488SChris Mason plug = container_of(work, struct btrfs_plug_cb, work); 17746ac0f488SChris Mason run_plug(plug); 17756ac0f488SChris Mason } 17766ac0f488SChris Mason 17776ac0f488SChris Mason static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 17786ac0f488SChris Mason { 17796ac0f488SChris Mason struct btrfs_plug_cb *plug; 17806ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 17816ac0f488SChris Mason 17826ac0f488SChris Mason if (from_schedule) { 
1783385de0efSChristoph Hellwig INIT_WORK(&plug->work, unplug_work); 1784385de0efSChristoph Hellwig queue_work(plug->info->rmw_workers, &plug->work); 17856ac0f488SChris Mason return; 17866ac0f488SChris Mason } 17876ac0f488SChris Mason run_plug(plug); 17886ac0f488SChris Mason } 17896ac0f488SChris Mason 1790bd8f7e62SQu Wenruo /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ 1791bd8f7e62SQu Wenruo static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) 1792bd8f7e62SQu Wenruo { 1793bd8f7e62SQu Wenruo const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1794bd8f7e62SQu Wenruo const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; 1795bd8f7e62SQu Wenruo const u64 full_stripe_start = rbio->bioc->raid_map[0]; 1796bd8f7e62SQu Wenruo const u32 orig_len = orig_bio->bi_iter.bi_size; 1797bd8f7e62SQu Wenruo const u32 sectorsize = fs_info->sectorsize; 1798bd8f7e62SQu Wenruo u64 cur_logical; 1799bd8f7e62SQu Wenruo 1800bd8f7e62SQu Wenruo ASSERT(orig_logical >= full_stripe_start && 1801bd8f7e62SQu Wenruo orig_logical + orig_len <= full_stripe_start + 1802bd8f7e62SQu Wenruo rbio->nr_data * rbio->stripe_len); 1803bd8f7e62SQu Wenruo 1804bd8f7e62SQu Wenruo bio_list_add(&rbio->bio_list, orig_bio); 1805bd8f7e62SQu Wenruo rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; 1806bd8f7e62SQu Wenruo 1807bd8f7e62SQu Wenruo /* Update the dbitmap. */ 1808bd8f7e62SQu Wenruo for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; 1809bd8f7e62SQu Wenruo cur_logical += sectorsize) { 1810bd8f7e62SQu Wenruo int bit = ((u32)(cur_logical - full_stripe_start) >> 1811bd8f7e62SQu Wenruo fs_info->sectorsize_bits) % rbio->stripe_nsectors; 1812bd8f7e62SQu Wenruo 1813bd8f7e62SQu Wenruo set_bit(bit, &rbio->dbitmap); 1814bd8f7e62SQu Wenruo } 1815bd8f7e62SQu Wenruo } 1816bd8f7e62SQu Wenruo 18176ac0f488SChris Mason /* 181853b381b3SDavid Woodhouse * our main entry point for writes from the rest of the FS. 
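 *
 * (Flow sketch, an editorial addition: a full rbio goes straight to
 *  full_stripe_write(); otherwise the rbio parks on the current plug's
 *  rbio_list so that btrfs_raid_unplug() can sort and merge it later, and
 *  only when no plug is active does it fall back to __raid56_parity_write().)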
181953b381b3SDavid Woodhouse */ 1820cc353a8bSQu Wenruo int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) 182153b381b3SDavid Woodhouse { 18226a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 182353b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 18246ac0f488SChris Mason struct btrfs_plug_cb *plug = NULL; 18256ac0f488SChris Mason struct blk_plug_cb *cb; 18264245215dSMiao Xie int ret; 182753b381b3SDavid Woodhouse 18284c664611SQu Wenruo rbio = alloc_rbio(fs_info, bioc, stripe_len); 1829af8e2d1dSMiao Xie if (IS_ERR(rbio)) { 18304c664611SQu Wenruo btrfs_put_bioc(bioc); 183153b381b3SDavid Woodhouse return PTR_ERR(rbio); 1832af8e2d1dSMiao Xie } 18331b94b556SMiao Xie rbio->operation = BTRFS_RBIO_WRITE; 1834bd8f7e62SQu Wenruo rbio_add_bio(rbio, bio); 18356ac0f488SChris Mason 18360b246afaSJeff Mahoney btrfs_bio_counter_inc_noblocked(fs_info); 18374245215dSMiao Xie rbio->generic_bio_cnt = 1; 18384245215dSMiao Xie 18396ac0f488SChris Mason /* 18406ac0f488SChris Mason * don't plug on full rbios, just get them out the door 18416ac0f488SChris Mason * as quickly as we can 18426ac0f488SChris Mason */ 18434245215dSMiao Xie if (rbio_is_full(rbio)) { 18444245215dSMiao Xie ret = full_stripe_write(rbio); 18454245215dSMiao Xie if (ret) 18460b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 18474245215dSMiao Xie return ret; 18484245215dSMiao Xie } 18496ac0f488SChris Mason 18500b246afaSJeff Mahoney cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 18516ac0f488SChris Mason if (cb) { 18526ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 18536ac0f488SChris Mason if (!plug->info) { 18540b246afaSJeff Mahoney plug->info = fs_info; 18556ac0f488SChris Mason INIT_LIST_HEAD(&plug->rbio_list); 18566ac0f488SChris Mason } 18576ac0f488SChris Mason list_add_tail(&rbio->plug_list, &plug->rbio_list); 18584245215dSMiao Xie ret = 0; 18596ac0f488SChris Mason } else { 18604245215dSMiao Xie ret = __raid56_parity_write(rbio); 18614245215dSMiao Xie if (ret) 18620b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 186353b381b3SDavid Woodhouse } 18644245215dSMiao Xie return ret; 18656ac0f488SChris Mason } 186653b381b3SDavid Woodhouse 186753b381b3SDavid Woodhouse /* 186853b381b3SDavid Woodhouse * all parity reconstruction happens here. We've read in everything 186953b381b3SDavid Woodhouse * we can find from the drives and this does the heavy lifting of 187053b381b3SDavid Woodhouse * sorting the good from the bad. 
187153b381b3SDavid Woodhouse */ 187253b381b3SDavid Woodhouse static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 187353b381b3SDavid Woodhouse { 187407e4d380SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 187507e4d380SQu Wenruo int sectornr, stripe; 187653b381b3SDavid Woodhouse void **pointers; 187794a0b58dSIra Weiny void **unmap_array; 187853b381b3SDavid Woodhouse int faila = -1, failb = -1; 187958efbc9fSOmar Sandoval blk_status_t err; 188053b381b3SDavid Woodhouse int i; 188153b381b3SDavid Woodhouse 188207e4d380SQu Wenruo /* 188307e4d380SQu Wenruo * This array stores the pointer for each sector, thus each pointer 188407e4d380SQu Wenruo * already has the sector's pgoff added to it 188507e4d380SQu Wenruo */ 188631e818feSDavid Sterba pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 188753b381b3SDavid Woodhouse if (!pointers) { 188858efbc9fSOmar Sandoval err = BLK_STS_RESOURCE; 188953b381b3SDavid Woodhouse goto cleanup_io; 189053b381b3SDavid Woodhouse } 189153b381b3SDavid Woodhouse 189294a0b58dSIra Weiny /* 189394a0b58dSIra Weiny * Store a copy of the pointers that does not get reordered during 189494a0b58dSIra Weiny * reconstruction so that kunmap_local works. 189594a0b58dSIra Weiny */ 189694a0b58dSIra Weiny unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 189794a0b58dSIra Weiny if (!unmap_array) { 189894a0b58dSIra Weiny err = BLK_STS_RESOURCE; 189994a0b58dSIra Weiny goto cleanup_pointers; 190094a0b58dSIra Weiny } 190194a0b58dSIra Weiny 190253b381b3SDavid Woodhouse faila = rbio->faila; 190353b381b3SDavid Woodhouse failb = rbio->failb; 190453b381b3SDavid Woodhouse 1905b4ee1782SOmar Sandoval if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1906b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 190753b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 190853b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 190953b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 191053b381b3SDavid Woodhouse } 191153b381b3SDavid Woodhouse 191253b381b3SDavid Woodhouse index_rbio_pages(rbio); 191353b381b3SDavid Woodhouse 191407e4d380SQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 191507e4d380SQu Wenruo struct sector_ptr *sector; 191607e4d380SQu Wenruo 19175a6ac9eaSMiao Xie /* 19185a6ac9eaSMiao Xie * Now we just use the bitmap to mark the horizontal stripes in 19195a6ac9eaSMiao Xie * which we have data when doing parity scrub.
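 *
 * (Hypothetical example: during a parity scrub with only dbitmap bit 2 set
 *  and a 4K sectorsize, only the vertical 4K slice at offset 8K of each
 *  stripe is reconstructed; every other sectornr is skipped.)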
19205a6ac9eaSMiao Xie */ 19215a6ac9eaSMiao Xie if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1922c67c68ebSQu Wenruo !test_bit(sectornr, &rbio->dbitmap)) 19235a6ac9eaSMiao Xie continue; 19245a6ac9eaSMiao Xie 192594a0b58dSIra Weiny /* 192607e4d380SQu Wenruo * Setup our array of pointers with sectors from each stripe 192794a0b58dSIra Weiny * 192894a0b58dSIra Weiny * NOTE: store a duplicate array of pointers to preserve the 192994a0b58dSIra Weiny * pointer order 193053b381b3SDavid Woodhouse */ 19312c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 193253b381b3SDavid Woodhouse /* 193307e4d380SQu Wenruo * If we're rebuilding a read, we have to use 193453b381b3SDavid Woodhouse * pages from the bio list 193553b381b3SDavid Woodhouse */ 1936b4ee1782SOmar Sandoval if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1937b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 193853b381b3SDavid Woodhouse (stripe == faila || stripe == failb)) { 193907e4d380SQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 0); 194053b381b3SDavid Woodhouse } else { 194107e4d380SQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 194253b381b3SDavid Woodhouse } 194307e4d380SQu Wenruo ASSERT(sector->page); 194407e4d380SQu Wenruo pointers[stripe] = kmap_local_page(sector->page) + 194507e4d380SQu Wenruo sector->pgoff; 194694a0b58dSIra Weiny unmap_array[stripe] = pointers[stripe]; 194753b381b3SDavid Woodhouse } 194853b381b3SDavid Woodhouse 194907e4d380SQu Wenruo /* All raid6 handling here */ 19504c664611SQu Wenruo if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 195107e4d380SQu Wenruo /* Single failure, rebuild from parity raid5 style */ 195253b381b3SDavid Woodhouse if (failb < 0) { 195353b381b3SDavid Woodhouse if (faila == rbio->nr_data) { 195453b381b3SDavid Woodhouse /* 195553b381b3SDavid Woodhouse * Just the P stripe has failed, without 195653b381b3SDavid Woodhouse * a bad data or Q stripe. 195753b381b3SDavid Woodhouse * TODO, we should redo the xor here. 195853b381b3SDavid Woodhouse */ 195958efbc9fSOmar Sandoval err = BLK_STS_IOERR; 196053b381b3SDavid Woodhouse goto cleanup; 196153b381b3SDavid Woodhouse } 196253b381b3SDavid Woodhouse /* 196353b381b3SDavid Woodhouse * a single failure in raid6 is rebuilt 196453b381b3SDavid Woodhouse * in the pstripe code below 196553b381b3SDavid Woodhouse */ 196653b381b3SDavid Woodhouse goto pstripe; 196753b381b3SDavid Woodhouse } 196853b381b3SDavid Woodhouse 196953b381b3SDavid Woodhouse /* make sure our ps and qs are in order */ 1970b7d2083aSNikolay Borisov if (faila > failb) 1971b7d2083aSNikolay Borisov swap(faila, failb); 197253b381b3SDavid Woodhouse 197353b381b3SDavid Woodhouse /* if the q stripe is failed, do a pstripe reconstruction 197453b381b3SDavid Woodhouse * from the xors. 197553b381b3SDavid Woodhouse * If both the q stripe and the P stripe are failed, we're 197653b381b3SDavid Woodhouse * here due to a crc mismatch and we can't give them the 197753b381b3SDavid Woodhouse * data they want 197853b381b3SDavid Woodhouse */ 19794c664611SQu Wenruo if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { 19804c664611SQu Wenruo if (rbio->bioc->raid_map[faila] == 19818e5cfb55SZhao Lei RAID5_P_STRIPE) { 198258efbc9fSOmar Sandoval err = BLK_STS_IOERR; 198353b381b3SDavid Woodhouse goto cleanup; 198453b381b3SDavid Woodhouse } 198553b381b3SDavid Woodhouse /* 198653b381b3SDavid Woodhouse * otherwise we have one bad data stripe and 198753b381b3SDavid Woodhouse * a good P stripe. raid5! 
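 *
 * (Editorial summary of the failure cases handled here:
 *	one data stripe lost		-> xor rebuild from P ("pstripe")
 *	only P lost			-> error for now (TODO: redo the xor)
 *	one data stripe + Q lost	-> xor rebuild from P as well
 *	one data stripe + P lost	-> raid6_datap_recov()
 *	two data stripes lost		-> raid6_2data_recov()
 *	P and Q both lost		-> error, the data cannot be served)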
198853b381b3SDavid Woodhouse */ 198953b381b3SDavid Woodhouse goto pstripe; 199053b381b3SDavid Woodhouse } 199153b381b3SDavid Woodhouse 19924c664611SQu Wenruo if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { 19932c8cdd6eSMiao Xie raid6_datap_recov(rbio->real_stripes, 199407e4d380SQu Wenruo sectorsize, faila, pointers); 199553b381b3SDavid Woodhouse } else { 19962c8cdd6eSMiao Xie raid6_2data_recov(rbio->real_stripes, 199707e4d380SQu Wenruo sectorsize, faila, failb, 199853b381b3SDavid Woodhouse pointers); 199953b381b3SDavid Woodhouse } 200053b381b3SDavid Woodhouse } else { 200153b381b3SDavid Woodhouse void *p; 200253b381b3SDavid Woodhouse 200353b381b3SDavid Woodhouse /* rebuild from P stripe here (raid5 or raid6) */ 200453b381b3SDavid Woodhouse BUG_ON(failb != -1); 200553b381b3SDavid Woodhouse pstripe: 200653b381b3SDavid Woodhouse /* Copy parity block into failed block to start with */ 200707e4d380SQu Wenruo memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); 200853b381b3SDavid Woodhouse 200953b381b3SDavid Woodhouse /* rearrange the pointer array */ 201053b381b3SDavid Woodhouse p = pointers[faila]; 201153b381b3SDavid Woodhouse for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 201253b381b3SDavid Woodhouse pointers[stripe] = pointers[stripe + 1]; 201353b381b3SDavid Woodhouse pointers[rbio->nr_data - 1] = p; 201453b381b3SDavid Woodhouse 201553b381b3SDavid Woodhouse /* xor in the rest */ 201607e4d380SQu Wenruo run_xor(pointers, rbio->nr_data - 1, sectorsize); 201753b381b3SDavid Woodhouse } 201853b381b3SDavid Woodhouse /* if we're doing this rebuild as part of an rmw, go through 201953b381b3SDavid Woodhouse * and set all of our private rbio pages in the 202053b381b3SDavid Woodhouse * failed stripes as uptodate. This way finish_rmw will 202153b381b3SDavid Woodhouse * know they can be trusted. If this was a read reconstruction, 202253b381b3SDavid Woodhouse * other endio functions will fiddle the uptodate bits 202353b381b3SDavid Woodhouse */ 20241b94b556SMiao Xie if (rbio->operation == BTRFS_RBIO_WRITE) { 202507e4d380SQu Wenruo for (i = 0; i < rbio->stripe_nsectors; i++) { 202653b381b3SDavid Woodhouse if (faila != -1) { 202707e4d380SQu Wenruo sector = rbio_stripe_sector(rbio, faila, i); 202807e4d380SQu Wenruo sector->uptodate = 1; 202953b381b3SDavid Woodhouse } 203053b381b3SDavid Woodhouse if (failb != -1) { 203107e4d380SQu Wenruo sector = rbio_stripe_sector(rbio, failb, i); 203207e4d380SQu Wenruo sector->uptodate = 1; 203353b381b3SDavid Woodhouse } 203453b381b3SDavid Woodhouse } 203553b381b3SDavid Woodhouse } 203694a0b58dSIra Weiny for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) 203794a0b58dSIra Weiny kunmap_local(unmap_array[stripe]); 203853b381b3SDavid Woodhouse } 203953b381b3SDavid Woodhouse 204058efbc9fSOmar Sandoval err = BLK_STS_OK; 204153b381b3SDavid Woodhouse cleanup: 204294a0b58dSIra Weiny kfree(unmap_array); 204394a0b58dSIra Weiny cleanup_pointers: 204453b381b3SDavid Woodhouse kfree(pointers); 204553b381b3SDavid Woodhouse 204653b381b3SDavid Woodhouse cleanup_io: 2047580c6efaSLiu Bo /* 2048580c6efaSLiu Bo * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a 2049580c6efaSLiu Bo * valid rbio which is consistent with ondisk content, thus such a 2050580c6efaSLiu Bo * valid rbio can be cached to avoid further disk reads. 
2051580c6efaSLiu Bo */ 2052580c6efaSLiu Bo if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2053580c6efaSLiu Bo rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 205444ac474dSLiu Bo /* 205544ac474dSLiu Bo * - In case of two failures, where rbio->failb != -1: 205644ac474dSLiu Bo * 205744ac474dSLiu Bo * Do not cache this rbio since the above read reconstruction 205844ac474dSLiu Bo * (raid6_datap_recov() or raid6_2data_recov()) may have 205944ac474dSLiu Bo * changed some content of stripes which are not identical to 206044ac474dSLiu Bo * on-disk content any more, otherwise, a later write/recover 206144ac474dSLiu Bo * may steal stripe_pages from this rbio and end up with 206244ac474dSLiu Bo * corruptions or rebuild failures. 206344ac474dSLiu Bo * 206444ac474dSLiu Bo * - In case of single failure, where rbio->failb == -1: 206544ac474dSLiu Bo * 206644ac474dSLiu Bo * Cache this rbio iff the above read reconstruction is 206752042d8eSAndrea Gelmini * executed without problems. 206844ac474dSLiu Bo */ 206944ac474dSLiu Bo if (err == BLK_STS_OK && rbio->failb < 0) 20704ae10b3aSChris Mason cache_rbio_pages(rbio); 20714ae10b3aSChris Mason else 20724ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 20734ae10b3aSChris Mason 20744246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, err); 207558efbc9fSOmar Sandoval } else if (err == BLK_STS_OK) { 207653b381b3SDavid Woodhouse rbio->faila = -1; 207753b381b3SDavid Woodhouse rbio->failb = -1; 20785a6ac9eaSMiao Xie 20795a6ac9eaSMiao Xie if (rbio->operation == BTRFS_RBIO_WRITE) 208053b381b3SDavid Woodhouse finish_rmw(rbio); 20815a6ac9eaSMiao Xie else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 20825a6ac9eaSMiao Xie finish_parity_scrub(rbio, 0); 20835a6ac9eaSMiao Xie else 20845a6ac9eaSMiao Xie BUG(); 208553b381b3SDavid Woodhouse } else { 20864246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, err); 208753b381b3SDavid Woodhouse } 208853b381b3SDavid Woodhouse } 208953b381b3SDavid Woodhouse 209053b381b3SDavid Woodhouse /* 2091d34e123dSChristoph Hellwig * This is called only for stripes we've read from disk to reconstruct the 2092d34e123dSChristoph Hellwig * parity. 209353b381b3SDavid Woodhouse */ 2094d34e123dSChristoph Hellwig static void raid_recover_end_io_work(struct work_struct *work) 209553b381b3SDavid Woodhouse { 2096d34e123dSChristoph Hellwig struct btrfs_raid_bio *rbio = 2097d34e123dSChristoph Hellwig container_of(work, struct btrfs_raid_bio, end_io_work); 209853b381b3SDavid Woodhouse 20994c664611SQu Wenruo if (atomic_read(&rbio->error) > rbio->bioc->max_errors) 210058efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 210153b381b3SDavid Woodhouse else 210253b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 210353b381b3SDavid Woodhouse } 210453b381b3SDavid Woodhouse 210553b381b3SDavid Woodhouse /* 210653b381b3SDavid Woodhouse * reads everything we need off the disk to reconstruct 210753b381b3SDavid Woodhouse * the parity. endio handlers trigger final reconstruction 210853b381b3SDavid Woodhouse * when the IO is done. 210953b381b3SDavid Woodhouse * 211053b381b3SDavid Woodhouse * This is used both for reads from the higher layers and for 211153b381b3SDavid Woodhouse * parity construction required to finish a rmw cycle. 
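 *
 * (Sketch, an editorial addition: with faila = 1 on a 5-stripe rbio of 16
 *  sectors per stripe, the read loop below bumps ->error once for stripe 1
 *  and jumps over its 16 sectors in one step, then queues reads only for
 *  the sectors of the other stripes that are not already uptodate.)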
211253b381b3SDavid Woodhouse */ 211353b381b3SDavid Woodhouse static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 211453b381b3SDavid Woodhouse { 211553b381b3SDavid Woodhouse int bios_to_read = 0; 211653b381b3SDavid Woodhouse struct bio_list bio_list; 211753b381b3SDavid Woodhouse int ret; 2118*ef340fccSQu Wenruo int total_sector_nr; 211953b381b3SDavid Woodhouse struct bio *bio; 212053b381b3SDavid Woodhouse 212153b381b3SDavid Woodhouse bio_list_init(&bio_list); 212253b381b3SDavid Woodhouse 212353b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 212453b381b3SDavid Woodhouse if (ret) 212553b381b3SDavid Woodhouse goto cleanup; 212653b381b3SDavid Woodhouse 2127b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 212853b381b3SDavid Woodhouse 212953b381b3SDavid Woodhouse /* 21304ae10b3aSChris Mason * read everything that hasn't failed. Thanks to the 21314ae10b3aSChris Mason * stripe cache, it is possible that some or all of these 21324ae10b3aSChris Mason * pages are going to be uptodate. 213353b381b3SDavid Woodhouse */ 2134*ef340fccSQu Wenruo for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2135*ef340fccSQu Wenruo total_sector_nr++) { 2136*ef340fccSQu Wenruo int stripe = total_sector_nr / rbio->stripe_nsectors; 2137*ef340fccSQu Wenruo int sectornr = total_sector_nr % rbio->stripe_nsectors; 21383e77605dSQu Wenruo struct sector_ptr *sector; 213953b381b3SDavid Woodhouse 2140*ef340fccSQu Wenruo if (rbio->faila == stripe || rbio->failb == stripe) { 2141*ef340fccSQu Wenruo atomic_inc(&rbio->error); 2142*ef340fccSQu Wenruo /* Skip the current stripe. */ 2143*ef340fccSQu Wenruo ASSERT(sectornr == 0); 2144*ef340fccSQu Wenruo total_sector_nr += rbio->stripe_nsectors - 1; 2145*ef340fccSQu Wenruo continue; 2146*ef340fccSQu Wenruo } 2147*ef340fccSQu Wenruo /* The RMW code may have already read this page in. */ 21483e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 21493e77605dSQu Wenruo if (sector->uptodate) 215053b381b3SDavid Woodhouse continue; 215153b381b3SDavid Woodhouse 2152*ef340fccSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 2153*ef340fccSQu Wenruo sectornr, rbio->stripe_len, 2154e01bf588SChristoph Hellwig REQ_OP_READ); 215553b381b3SDavid Woodhouse if (ret < 0) 215653b381b3SDavid Woodhouse goto cleanup; 215753b381b3SDavid Woodhouse } 215853b381b3SDavid Woodhouse 215953b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 216053b381b3SDavid Woodhouse if (!bios_to_read) { 216153b381b3SDavid Woodhouse /* 216253b381b3SDavid Woodhouse * we might have no bios to read just because the pages 216353b381b3SDavid Woodhouse * were up to date, or we might have no bios to read because 216453b381b3SDavid Woodhouse * the devices were gone. 216553b381b3SDavid Woodhouse */ 21664c664611SQu Wenruo if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { 216753b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 2168813f8a0eSNikolay Borisov return 0; 216953b381b3SDavid Woodhouse } else { 217053b381b3SDavid Woodhouse goto cleanup; 217153b381b3SDavid Woodhouse } 217253b381b3SDavid Woodhouse } 217353b381b3SDavid Woodhouse 217453b381b3SDavid Woodhouse /* 21754c664611SQu Wenruo * The bioc may be freed once we submit the last bio. Make sure not to 21764c664611SQu Wenruo * touch it after that. 
217753b381b3SDavid Woodhouse */ 2178b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 2179d34e123dSChristoph Hellwig INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); 2180bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 2181d34e123dSChristoph Hellwig bio->bi_end_io = raid56_bio_end_io; 218253b381b3SDavid Woodhouse 2183b8bea09aSQu Wenruo if (trace_raid56_scrub_read_recover_enabled()) { 2184b8bea09aSQu Wenruo struct raid56_bio_trace_info trace_info = { 0 }; 2185b8bea09aSQu Wenruo 2186b8bea09aSQu Wenruo bio_get_trace_info(rbio, bio, &trace_info); 2187b8bea09aSQu Wenruo trace_raid56_scrub_read_recover(rbio, bio, &trace_info); 2188b8bea09aSQu Wenruo } 21894e49ea4aSMike Christie submit_bio(bio); 219053b381b3SDavid Woodhouse } 2191813f8a0eSNikolay Borisov 219253b381b3SDavid Woodhouse return 0; 219353b381b3SDavid Woodhouse 219453b381b3SDavid Woodhouse cleanup: 2195b4ee1782SOmar Sandoval if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2196b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 219758efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 2198785884fcSLiu Bo 2199785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 2200785884fcSLiu Bo bio_put(bio); 2201785884fcSLiu Bo 220253b381b3SDavid Woodhouse return -EIO; 220353b381b3SDavid Woodhouse } 220453b381b3SDavid Woodhouse 220553b381b3SDavid Woodhouse /* 220653b381b3SDavid Woodhouse * the main entry point for reads from the higher layers. This 220753b381b3SDavid Woodhouse * is really only called when the normal read path had a failure, 220853b381b3SDavid Woodhouse * so we assume the bio they send down corresponds to a failed part 220953b381b3SDavid Woodhouse * of the drive. 221053b381b3SDavid Woodhouse */ 22116a258d72SQu Wenruo int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, 2212cc353a8bSQu Wenruo u32 stripe_len, int mirror_num, int generic_io) 221353b381b3SDavid Woodhouse { 22146a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 221553b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 221653b381b3SDavid Woodhouse int ret; 221753b381b3SDavid Woodhouse 2218abad60c6SLiu Bo if (generic_io) { 22194c664611SQu Wenruo ASSERT(bioc->mirror_num == mirror_num); 2220c3a3b19bSQu Wenruo btrfs_bio(bio)->mirror_num = mirror_num; 2221abad60c6SLiu Bo } 2222abad60c6SLiu Bo 22234c664611SQu Wenruo rbio = alloc_rbio(fs_info, bioc, stripe_len); 2224af8e2d1dSMiao Xie if (IS_ERR(rbio)) { 22256e9606d2SZhao Lei if (generic_io) 22264c664611SQu Wenruo btrfs_put_bioc(bioc); 222753b381b3SDavid Woodhouse return PTR_ERR(rbio); 2228af8e2d1dSMiao Xie } 222953b381b3SDavid Woodhouse 22301b94b556SMiao Xie rbio->operation = BTRFS_RBIO_READ_REBUILD; 2231bd8f7e62SQu Wenruo rbio_add_bio(rbio, bio); 223253b381b3SDavid Woodhouse 223353b381b3SDavid Woodhouse rbio->faila = find_logical_bio_stripe(rbio, bio); 223453b381b3SDavid Woodhouse if (rbio->faila == -1) { 22350b246afaSJeff Mahoney btrfs_warn(fs_info, 22364c664611SQu Wenruo "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", 22371201b58bSDavid Sterba __func__, bio->bi_iter.bi_sector << 9, 22384c664611SQu Wenruo (u64)bio->bi_iter.bi_size, bioc->map_type); 22396e9606d2SZhao Lei if (generic_io) 22404c664611SQu Wenruo btrfs_put_bioc(bioc); 224153b381b3SDavid Woodhouse kfree(rbio); 224253b381b3SDavid Woodhouse return -EIO; 224353b381b3SDavid Woodhouse } 224453b381b3SDavid Woodhouse 22454245215dSMiao Xie if (generic_io) { 22460b246afaSJeff 
Mahoney btrfs_bio_counter_inc_noblocked(fs_info); 22474245215dSMiao Xie rbio->generic_bio_cnt = 1; 22484245215dSMiao Xie } else { 22494c664611SQu Wenruo btrfs_get_bioc(bioc); 22504245215dSMiao Xie } 22514245215dSMiao Xie 225253b381b3SDavid Woodhouse /* 22538810f751SLiu Bo * Loop retry: 22548810f751SLiu Bo * for 'mirror == 2', reconstruct from all other stripes. 22558810f751SLiu Bo * for 'mirror_num > 2', select a stripe to fail on every retry. 225653b381b3SDavid Woodhouse */ 22578810f751SLiu Bo if (mirror_num > 2) { 22588810f751SLiu Bo /* 22598810f751SLiu Bo * 'mirror == 3' is to fail the p stripe and 22608810f751SLiu Bo * reconstruct from the q stripe. 'mirror > 3' is to 22618810f751SLiu Bo * fail a data stripe and reconstruct from p+q stripe. 22628810f751SLiu Bo */ 22638810f751SLiu Bo rbio->failb = rbio->real_stripes - (mirror_num - 1); 22648810f751SLiu Bo ASSERT(rbio->failb > 0); 22658810f751SLiu Bo if (rbio->failb <= rbio->faila) 22668810f751SLiu Bo rbio->failb--; 22678810f751SLiu Bo } 226853b381b3SDavid Woodhouse 226953b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 227053b381b3SDavid Woodhouse 227153b381b3SDavid Woodhouse /* 227253b381b3SDavid Woodhouse * __raid56_parity_recover will end the bio with 227353b381b3SDavid Woodhouse * any errors it hits. We don't want to return 227453b381b3SDavid Woodhouse * its error value up the stack because our caller 227553b381b3SDavid Woodhouse * will end up calling bio_endio with any nonzero 227653b381b3SDavid Woodhouse * return 227753b381b3SDavid Woodhouse */ 227853b381b3SDavid Woodhouse if (ret == 0) 227953b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 228053b381b3SDavid Woodhouse /* 228153b381b3SDavid Woodhouse * our rbio has been added to the list of 228253b381b3SDavid Woodhouse * rbios that will be handled after the 228353b381b3SDavid Woodhouse * current lock owner is done 228453b381b3SDavid Woodhouse */ 228553b381b3SDavid Woodhouse return 0; 228653b381b3SDavid Woodhouse 228753b381b3SDavid Woodhouse } 228853b381b3SDavid Woodhouse 2289385de0efSChristoph Hellwig static void rmw_work(struct work_struct *work) 229053b381b3SDavid Woodhouse { 229153b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 229253b381b3SDavid Woodhouse 229353b381b3SDavid Woodhouse rbio = container_of(work, struct btrfs_raid_bio, work); 229453b381b3SDavid Woodhouse raid56_rmw_stripe(rbio); 229553b381b3SDavid Woodhouse } 229653b381b3SDavid Woodhouse 2297385de0efSChristoph Hellwig static void read_rebuild_work(struct work_struct *work) 229853b381b3SDavid Woodhouse { 229953b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 230053b381b3SDavid Woodhouse 230153b381b3SDavid Woodhouse rbio = container_of(work, struct btrfs_raid_bio, work); 230253b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 230353b381b3SDavid Woodhouse } 23045a6ac9eaSMiao Xie 23055a6ac9eaSMiao Xie /* 23065a6ac9eaSMiao Xie * The following code is used to scrub/replace the parity stripe 23075a6ac9eaSMiao Xie * 23084c664611SQu Wenruo * Caller must have already increased bio_counter for getting @bioc. 2309ae6529c3SQu Wenruo * 23105a6ac9eaSMiao Xie * Note: We need to make sure all the pages that are added into the scrub/replace 23115a6ac9eaSMiao Xie * raid bio are correct and are not changed during the scrub/replace. That 23125a6ac9eaSMiao Xie * is, those pages just hold metadata or file data with checksums.
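 *
 * (Expected usage, an editorial sketch: the scrub code allocates a
 *  zero-length bio purely as a completion anchor, calls
 *  raid56_parity_alloc_scrub_rbio() with the dbitmap of the sectors it has
 *  verified, attaches those pages with raid56_add_scrub_pages(), and then
 *  submits the rbio.)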
2359b4ee1782SOmar Sandoval /* Used for both parity scrub and missing. */
2360b4ee1782SOmar Sandoval void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
23616346f6bfSQu Wenruo 			   unsigned int pgoff, u64 logical)
23625a6ac9eaSMiao Xie {
23636346f6bfSQu Wenruo 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
23645a6ac9eaSMiao Xie 	int stripe_offset;
23655a6ac9eaSMiao Xie 	int index;
23665a6ac9eaSMiao Xie 
23674c664611SQu Wenruo 	ASSERT(logical >= rbio->bioc->raid_map[0]);
23686346f6bfSQu Wenruo 	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
23695a6ac9eaSMiao Xie 	       rbio->stripe_len * rbio->nr_data);
23704c664611SQu Wenruo 	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
23716346f6bfSQu Wenruo 	index = stripe_offset / sectorsize;
23726346f6bfSQu Wenruo 	rbio->bio_sectors[index].page = page;
23736346f6bfSQu Wenruo 	rbio->bio_sectors[index].pgoff = pgoff;
23745a6ac9eaSMiao Xie }
23755a6ac9eaSMiao Xie 
23765a6ac9eaSMiao Xie /*
23775a6ac9eaSMiao Xie  * We only scrub the parity for which we have correct data on the same
23785a6ac9eaSMiao Xie  * horizontal, so we don't need to allocate all pages for all the stripes.
23795a6ac9eaSMiao Xie  */
23805a6ac9eaSMiao Xie static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
23815a6ac9eaSMiao Xie {
23823907ce29SQu Wenruo 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
23833907ce29SQu Wenruo 	int stripe;
23843907ce29SQu Wenruo 	int sectornr;
23855a6ac9eaSMiao Xie 
2386c67c68ebSQu Wenruo 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
23873907ce29SQu Wenruo 		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
23883907ce29SQu Wenruo 			struct page *page;
23893907ce29SQu Wenruo 			int index = (stripe * rbio->stripe_nsectors + sectornr) *
23903907ce29SQu Wenruo 				    sectorsize >> PAGE_SHIFT;
23913907ce29SQu Wenruo 
23925a6ac9eaSMiao Xie 			if (rbio->stripe_pages[index])
23935a6ac9eaSMiao Xie 				continue;
23945a6ac9eaSMiao Xie 
2395b0ee5e1eSDavid Sterba 			page = alloc_page(GFP_NOFS);
23965a6ac9eaSMiao Xie 			if (!page)
23975a6ac9eaSMiao Xie 				return -ENOMEM;
23985a6ac9eaSMiao Xie 			rbio->stripe_pages[index] = page;
23995a6ac9eaSMiao Xie 		}
24005a6ac9eaSMiao Xie 	}
2401eb357060SQu Wenruo 	index_stripe_sectors(rbio);
24025a6ac9eaSMiao Xie 	return 0;
24035a6ac9eaSMiao Xie }
24045a6ac9eaSMiao Xie 
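/*
 * Illustrative sketch (hypothetical helpers, not part of this file) of the
 * index arithmetic above.  raid56_add_scrub_pages() locates a bio sector by
 * its byte offset from the start of the full stripe (raid_map[0]) in units
 * of sectorsize; alloc_rbio_essential_pages() maps a (stripe, sectornr) pair
 * to a slot in stripe_pages[], where several sub-page sectors may share one
 * page.
 */
static int scrub_bio_sector_index(u64 logical, u64 raid_map0, u32 sectorsize)
{
	/* e.g. logical == raid_map0 + 24K with 4K sectors yields index 6 */
	return (int)((logical - raid_map0) / sectorsize);
}

static int scrub_page_index(int stripe, int sectornr, int stripe_nsectors,
			    u32 sectorsize)
{
	/* with 4K sectors and 4K pages this is simply the sector number */
	return (stripe * stripe_nsectors + sectornr) * sectorsize >> PAGE_SHIFT;
}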
24055a6ac9eaSMiao Xie static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
24065a6ac9eaSMiao Xie 					 int need_check)
24075a6ac9eaSMiao Xie {
24084c664611SQu Wenruo 	struct btrfs_io_context *bioc = rbio->bioc;
240946900662SQu Wenruo 	const u32 sectorsize = bioc->fs_info->sectorsize;
24101389053eSKees Cook 	void **pointers = rbio->finish_pointers;
2411c67c68ebSQu Wenruo 	unsigned long *pbitmap = &rbio->finish_pbitmap;
24125a6ac9eaSMiao Xie 	int nr_data = rbio->nr_data;
24135a6ac9eaSMiao Xie 	int stripe;
24143e77605dSQu Wenruo 	int sectornr;
2415c17af965SDavid Sterba 	bool has_qstripe;
241646900662SQu Wenruo 	struct sector_ptr p_sector = { 0 };
241746900662SQu Wenruo 	struct sector_ptr q_sector = { 0 };
24185a6ac9eaSMiao Xie 	struct bio_list bio_list;
24195a6ac9eaSMiao Xie 	struct bio *bio;
242076035976SMiao Xie 	int is_replace = 0;
24215a6ac9eaSMiao Xie 	int ret;
24225a6ac9eaSMiao Xie 
24235a6ac9eaSMiao Xie 	bio_list_init(&bio_list);
24245a6ac9eaSMiao Xie 
2425c17af965SDavid Sterba 	if (rbio->real_stripes - rbio->nr_data == 1)
2426c17af965SDavid Sterba 		has_qstripe = false;
2427c17af965SDavid Sterba 	else if (rbio->real_stripes - rbio->nr_data == 2)
2428c17af965SDavid Sterba 		has_qstripe = true;
2429c17af965SDavid Sterba 	else
24305a6ac9eaSMiao Xie 		BUG();
24315a6ac9eaSMiao Xie 
24324c664611SQu Wenruo 	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
243376035976SMiao Xie 		is_replace = 1;
2434c67c68ebSQu Wenruo 		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
243576035976SMiao Xie 	}
243676035976SMiao Xie 
24375a6ac9eaSMiao Xie 	/*
24385a6ac9eaSMiao Xie 	 * The higher layers (the scrubber) are unlikely to use
24395a6ac9eaSMiao Xie 	 * this area of the disk again soon, so don't cache
24405a6ac9eaSMiao Xie 	 * it.
24415a6ac9eaSMiao Xie 	 */
24425a6ac9eaSMiao Xie 	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
24435a6ac9eaSMiao Xie 
24445a6ac9eaSMiao Xie 	if (!need_check)
24455a6ac9eaSMiao Xie 		goto writeback;
24465a6ac9eaSMiao Xie 
244746900662SQu Wenruo 	p_sector.page = alloc_page(GFP_NOFS);
244846900662SQu Wenruo 	if (!p_sector.page)
24495a6ac9eaSMiao Xie 		goto cleanup;
245046900662SQu Wenruo 	p_sector.pgoff = 0;
245146900662SQu Wenruo 	p_sector.uptodate = 1;
24525a6ac9eaSMiao Xie 
2453c17af965SDavid Sterba 	if (has_qstripe) {
2454d70cef0dSIra Weiny 		/* RAID6, allocate and map temp space for the Q stripe */
245546900662SQu Wenruo 		q_sector.page = alloc_page(GFP_NOFS);
245646900662SQu Wenruo 		if (!q_sector.page) {
245746900662SQu Wenruo 			__free_page(p_sector.page);
245846900662SQu Wenruo 			p_sector.page = NULL;
24595a6ac9eaSMiao Xie 			goto cleanup;
24605a6ac9eaSMiao Xie 		}
246146900662SQu Wenruo 		q_sector.pgoff = 0;
246246900662SQu Wenruo 		q_sector.uptodate = 1;
246346900662SQu Wenruo 		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
24645a6ac9eaSMiao Xie 	}
24655a6ac9eaSMiao Xie 
24665a6ac9eaSMiao Xie 	atomic_set(&rbio->error, 0);
24675a6ac9eaSMiao Xie 
2468d70cef0dSIra Weiny 	/* Map the parity stripe just once */
246946900662SQu Wenruo 	pointers[nr_data] = kmap_local_page(p_sector.page);
2470d70cef0dSIra Weiny 
2471c67c68ebSQu Wenruo 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
247246900662SQu Wenruo 		struct sector_ptr *sector;
24735a6ac9eaSMiao Xie 		void *parity;
247446900662SQu Wenruo 
24755a6ac9eaSMiao Xie 		/* first collect one sector from each data stripe */
24765a6ac9eaSMiao Xie 		for (stripe = 0; stripe < nr_data; stripe++) {
247746900662SQu Wenruo 			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
247846900662SQu Wenruo 			pointers[stripe] = kmap_local_page(sector->page) +
247946900662SQu Wenruo 					   sector->pgoff;
24805a6ac9eaSMiao Xie 		}
24815a6ac9eaSMiao Xie 
2482c17af965SDavid Sterba 		if (has_qstripe) {
2483d70cef0dSIra Weiny 			/* RAID6, call the library function to fill in our P/Q */
248446900662SQu Wenruo 			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
24855a6ac9eaSMiao Xie 						pointers);
24865a6ac9eaSMiao Xie 		} else {
24875a6ac9eaSMiao Xie 			/* raid5 */
248846900662SQu Wenruo 			memcpy(pointers[nr_data], pointers[0], sectorsize);
248946900662SQu Wenruo 			run_xor(pointers + 1, nr_data - 1, sectorsize);
24905a6ac9eaSMiao Xie 		}
24915a6ac9eaSMiao Xie 
249201327610SNicholas D Steeves 		/* Check scrubbing parity and repair it */
249346900662SQu Wenruo 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
249446900662SQu Wenruo 		parity = kmap_local_page(sector->page) + sector->pgoff;
249546900662SQu Wenruo 		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
249646900662SQu Wenruo 			memcpy(parity, pointers[rbio->scrubp], sectorsize);
24975a6ac9eaSMiao Xie 		else
24985a6ac9eaSMiao Xie 			/* Parity is right, needn't writeback */
2499c67c68ebSQu Wenruo 			bitmap_clear(&rbio->dbitmap, sectornr, 1);
250058c1a35cSIra Weiny 		kunmap_local(parity);
25015a6ac9eaSMiao Xie 
250294a0b58dSIra Weiny 		for (stripe = nr_data - 1; stripe >= 0; stripe--)
250394a0b58dSIra Weiny 			kunmap_local(pointers[stripe]);
25045a6ac9eaSMiao Xie 	}
25055a6ac9eaSMiao Xie 
250694a0b58dSIra Weiny 	kunmap_local(pointers[nr_data]);
250746900662SQu Wenruo 	__free_page(p_sector.page);
250846900662SQu Wenruo 	p_sector.page = NULL;
250946900662SQu Wenruo 	if (q_sector.page) {
251094a0b58dSIra Weiny 		kunmap_local(pointers[rbio->real_stripes - 1]);
251146900662SQu Wenruo 		__free_page(q_sector.page);
251246900662SQu Wenruo 		q_sector.page = NULL;
2513d70cef0dSIra Weiny 	}
25145a6ac9eaSMiao Xie 
25155a6ac9eaSMiao Xie writeback:
25165a6ac9eaSMiao Xie 	/*
25175a6ac9eaSMiao Xie 	 * time to start writing.  Make bios for everything from the
25185a6ac9eaSMiao Xie 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
25195a6ac9eaSMiao Xie 	 * everything else.
25205a6ac9eaSMiao Xie 	 */
2521c67c68ebSQu Wenruo 	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
25223e77605dSQu Wenruo 		struct sector_ptr *sector;
25235a6ac9eaSMiao Xie 
25243e77605dSQu Wenruo 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
25253e77605dSQu Wenruo 		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
25263e77605dSQu Wenruo 					 sectornr, rbio->stripe_len, REQ_OP_WRITE);
25275a6ac9eaSMiao Xie 		if (ret)
25285a6ac9eaSMiao Xie 			goto cleanup;
25295a6ac9eaSMiao Xie 	}
25305a6ac9eaSMiao Xie 
253176035976SMiao Xie 	if (!is_replace)
253276035976SMiao Xie 		goto submit_write;
253376035976SMiao Xie 
25343e77605dSQu Wenruo 	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
25353e77605dSQu Wenruo 		struct sector_ptr *sector;
253676035976SMiao Xie 
25373e77605dSQu Wenruo 		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
25383e77605dSQu Wenruo 		ret = rbio_add_io_sector(rbio, &bio_list, sector,
25394c664611SQu Wenruo 				       bioc->tgtdev_map[rbio->scrubp],
25403e77605dSQu Wenruo 				       sectornr, rbio->stripe_len, REQ_OP_WRITE);
254176035976SMiao Xie 		if (ret)
254276035976SMiao Xie 			goto cleanup;
254376035976SMiao Xie 	}
254476035976SMiao Xie 
254576035976SMiao Xie submit_write:
25465a6ac9eaSMiao Xie 	nr_data = bio_list_size(&bio_list);
25475a6ac9eaSMiao Xie 	if (!nr_data) {
25485a6ac9eaSMiao Xie 		/* Every parity is right */
254958efbc9fSOmar Sandoval 		rbio_orig_end_io(rbio, BLK_STS_OK);
25505a6ac9eaSMiao Xie 		return;
25515a6ac9eaSMiao Xie 	}
25525a6ac9eaSMiao Xie 
25535a6ac9eaSMiao Xie 	atomic_set(&rbio->stripes_pending, nr_data);
25545a6ac9eaSMiao Xie 
2555bf28a605SNikolay Borisov 	while ((bio = bio_list_pop(&bio_list))) {
2556a6111d11SZhao Lei 		bio->bi_end_io = raid_write_end_io;
25574e49ea4aSMike Christie 
2558b8bea09aSQu Wenruo 		if (trace_raid56_scrub_write_stripe_enabled()) {
2559b8bea09aSQu Wenruo 			struct raid56_bio_trace_info trace_info = { 0 };
2560b8bea09aSQu Wenruo 
2561b8bea09aSQu Wenruo 			bio_get_trace_info(rbio, bio, &trace_info);
2562b8bea09aSQu Wenruo 			trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
2563b8bea09aSQu Wenruo 		}
25644e49ea4aSMike Christie 		submit_bio(bio);
25655a6ac9eaSMiao Xie 	}
25665a6ac9eaSMiao Xie 	return;
25675a6ac9eaSMiao Xie 
25685a6ac9eaSMiao Xie cleanup:
256958efbc9fSOmar Sandoval 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2570785884fcSLiu Bo 
2571785884fcSLiu Bo 	while ((bio = bio_list_pop(&bio_list)))
2572785884fcSLiu Bo 		bio_put(bio);
25735a6ac9eaSMiao Xie }
25745a6ac9eaSMiao Xie 
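/*
 * Illustrative sketch (assumed buffers, not part of this file) of the RAID5
 * branch in finish_parity_scrub() above: recompute the parity sector as the
 * XOR of all data sectors, then rewrite the on-disk parity only when it
 * differs, which is what keeps correct sectors out of the writeback bitmap.
 */
static bool scrub_one_raid5_sector(void **data, int nr_data, void *parity,
				   u32 sectorsize)
{
	u8 buf[4096];	/* assumes sectorsize <= 4K for this sketch */
	u32 off;
	int d;

	memcpy(buf, data[0], sectorsize);
	for (d = 1; d < nr_data; d++) {
		for (off = 0; off < sectorsize; off++)
			buf[off] ^= ((const u8 *)data[d])[off];
	}
	if (memcmp(parity, buf, sectorsize) == 0)
		return false;	/* parity already right, skip writeback */
	memcpy(parity, buf, sectorsize);
	return true;		/* parity repaired, must be written back */
}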
25755a6ac9eaSMiao Xie static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
25765a6ac9eaSMiao Xie {
25775a6ac9eaSMiao Xie 	if (stripe >= 0 && stripe < rbio->nr_data)
25785a6ac9eaSMiao Xie 		return 1;
25795a6ac9eaSMiao Xie 	return 0;
25805a6ac9eaSMiao Xie }
25815a6ac9eaSMiao Xie 
25825a6ac9eaSMiao Xie /*
25835a6ac9eaSMiao Xie  * While we're doing the parity check and repair, we could have errors
25845a6ac9eaSMiao Xie  * in reading pages off the disk.  This checks for errors and if we're
25855a6ac9eaSMiao Xie  * not able to read the page it'll trigger parity reconstruction.  The
25865a6ac9eaSMiao Xie  * parity scrub will be finished after we've reconstructed the failed
25875a6ac9eaSMiao Xie  * stripes.
25885a6ac9eaSMiao Xie  */
25895a6ac9eaSMiao Xie static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
25905a6ac9eaSMiao Xie {
25914c664611SQu Wenruo 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
25925a6ac9eaSMiao Xie 		goto cleanup;
25935a6ac9eaSMiao Xie 
25945a6ac9eaSMiao Xie 	if (rbio->faila >= 0 || rbio->failb >= 0) {
25955a6ac9eaSMiao Xie 		int dfail = 0, failp = -1;
25965a6ac9eaSMiao Xie 
25975a6ac9eaSMiao Xie 		if (is_data_stripe(rbio, rbio->faila))
25985a6ac9eaSMiao Xie 			dfail++;
25995a6ac9eaSMiao Xie 		else if (is_parity_stripe(rbio->faila))
26005a6ac9eaSMiao Xie 			failp = rbio->faila;
26015a6ac9eaSMiao Xie 
26025a6ac9eaSMiao Xie 		if (is_data_stripe(rbio, rbio->failb))
26035a6ac9eaSMiao Xie 			dfail++;
26045a6ac9eaSMiao Xie 		else if (is_parity_stripe(rbio->failb))
26055a6ac9eaSMiao Xie 			failp = rbio->failb;
26065a6ac9eaSMiao Xie 
26075a6ac9eaSMiao Xie 		/*
26085a6ac9eaSMiao Xie 		 * Because we cannot use a scrubbing parity to repair the
26095a6ac9eaSMiao Xie 		 * data, our repair capability is reduced by one.
26105a6ac9eaSMiao Xie 		 * (In the case of RAID5, we cannot repair anything.)
26115a6ac9eaSMiao Xie 		 */
26124c664611SQu Wenruo 		if (dfail > rbio->bioc->max_errors - 1)
26135a6ac9eaSMiao Xie 			goto cleanup;
26145a6ac9eaSMiao Xie 
26155a6ac9eaSMiao Xie 		/*
26165a6ac9eaSMiao Xie 		 * If all the data is good, only the parity is wrong, so just
26175a6ac9eaSMiao Xie 		 * repair the parity.
26185a6ac9eaSMiao Xie 		 */
26195a6ac9eaSMiao Xie 		if (dfail == 0) {
26205a6ac9eaSMiao Xie 			finish_parity_scrub(rbio, 0);
26215a6ac9eaSMiao Xie 			return;
26225a6ac9eaSMiao Xie 		}
26235a6ac9eaSMiao Xie 
26245a6ac9eaSMiao Xie 		/*
26255a6ac9eaSMiao Xie 		 * Getting here means we have one corrupted data stripe and
26265a6ac9eaSMiao Xie 		 * one corrupted parity on RAID6.  If the corrupted parity
262701327610SNicholas D Steeves 		 * is the scrubbing parity, we can luckily use the other one
26285a6ac9eaSMiao Xie 		 * to repair the data; otherwise we cannot repair the data stripe.
26295a6ac9eaSMiao Xie 		 */
26305a6ac9eaSMiao Xie 		if (failp != rbio->scrubp)
26315a6ac9eaSMiao Xie 			goto cleanup;
26325a6ac9eaSMiao Xie 
26335a6ac9eaSMiao Xie 		__raid_recover_end_io(rbio);
26345a6ac9eaSMiao Xie 	} else {
26355a6ac9eaSMiao Xie 		finish_parity_scrub(rbio, 1);
26365a6ac9eaSMiao Xie 	}
26375a6ac9eaSMiao Xie 	return;
26385a6ac9eaSMiao Xie 
26395a6ac9eaSMiao Xie cleanup:
264058efbc9fSOmar Sandoval 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
26415a6ac9eaSMiao Xie }
26425a6ac9eaSMiao Xie 
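/*
 * Illustrative sketch (hypothetical enum and helper, not part of this file)
 * of the decision table implemented by validate_rbio_for_parity_scrub()
 * above for the case where at least one stripe has failed; the no-failure
 * case simply runs a full check via finish_parity_scrub(rbio, 1).
 */
enum scrub_decision {
	SCRUB_ABORT,		/* too many failures, end with an I/O error */
	SCRUB_REPAIR_PARITY,	/* all data good, just rewrite the parity */
	SCRUB_RECOVER_DATA,	/* rebuild data from the other parity first */
};

static enum scrub_decision classify_scrub_failure(int dfail, int failp,
						  int scrubp, int max_errors)
{
	if (dfail > max_errors - 1)
		return SCRUB_ABORT;
	if (dfail == 0)
		return SCRUB_REPAIR_PARITY;
	if (failp != scrubp)
		return SCRUB_ABORT;
	return SCRUB_RECOVER_DATA;
}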
26435a6ac9eaSMiao Xie /*
26445a6ac9eaSMiao Xie  * end io for the read phase of the rmw cycle.  All the bios here are physical
26455a6ac9eaSMiao Xie  * stripe bios we've read from the disk so we can recalculate the parity of the
26465a6ac9eaSMiao Xie  * stripe.
26475a6ac9eaSMiao Xie  *
26485a6ac9eaSMiao Xie  * This will usually kick off finish_rmw once all the bios are read in, but it
26495a6ac9eaSMiao Xie  * may trigger parity reconstruction if we had any errors along the way.
26505a6ac9eaSMiao Xie  */
2651d34e123dSChristoph Hellwig static void raid56_parity_scrub_end_io_work(struct work_struct *work)
26525a6ac9eaSMiao Xie {
2653d34e123dSChristoph Hellwig 	struct btrfs_raid_bio *rbio =
2654d34e123dSChristoph Hellwig 		container_of(work, struct btrfs_raid_bio, end_io_work);
26555a6ac9eaSMiao Xie 
26565a6ac9eaSMiao Xie 	/*
2657d34e123dSChristoph Hellwig 	 * This will normally call finish_rmw to start our write, but if there
2658d34e123dSChristoph Hellwig 	 * are any failed stripes we'll reconstruct from parity first.
26595a6ac9eaSMiao Xie 	 */
26605a6ac9eaSMiao Xie 	validate_rbio_for_parity_scrub(rbio);
26615a6ac9eaSMiao Xie }
26625a6ac9eaSMiao Xie 
26635a6ac9eaSMiao Xie static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
26645a6ac9eaSMiao Xie {
26655a6ac9eaSMiao Xie 	int bios_to_read = 0;
26665a6ac9eaSMiao Xie 	struct bio_list bio_list;
26675a6ac9eaSMiao Xie 	int ret;
26683e77605dSQu Wenruo 	int sectornr;
26695a6ac9eaSMiao Xie 	int stripe;
26705a6ac9eaSMiao Xie 	struct bio *bio;
26715a6ac9eaSMiao Xie 
2672785884fcSLiu Bo 	bio_list_init(&bio_list);
2673785884fcSLiu Bo 
26745a6ac9eaSMiao Xie 	ret = alloc_rbio_essential_pages(rbio);
26755a6ac9eaSMiao Xie 	if (ret)
26765a6ac9eaSMiao Xie 		goto cleanup;
26775a6ac9eaSMiao Xie 
26785a6ac9eaSMiao Xie 	atomic_set(&rbio->error, 0);
26795a6ac9eaSMiao Xie 	/*
26805a6ac9eaSMiao Xie 	 * build a list of bios to read all the missing parts of this
26815a6ac9eaSMiao Xie 	 * stripe
26825a6ac9eaSMiao Xie 	 */
26832c8cdd6eSMiao Xie 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2684c67c68ebSQu Wenruo 		for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
26853e77605dSQu Wenruo 			struct sector_ptr *sector;
26865a6ac9eaSMiao Xie 			/*
26873e77605dSQu Wenruo 			 * We want to find all the sectors missing from the
26883e77605dSQu Wenruo 			 * rbio and read them from the disk.  If sector_in_rbio()
26893e77605dSQu Wenruo 			 * finds a sector in the bio list we don't need to read
26903e77605dSQu Wenruo 			 * it off the stripe.
26915a6ac9eaSMiao Xie 			 */
26923e77605dSQu Wenruo 			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
26933e77605dSQu Wenruo 			if (sector)
26945a6ac9eaSMiao Xie 				continue;
26955a6ac9eaSMiao Xie 
26963e77605dSQu Wenruo 			sector = rbio_stripe_sector(rbio, stripe, sectornr);
26975a6ac9eaSMiao Xie 			/*
26983e77605dSQu Wenruo 			 * The bio cache may have handed us an uptodate sector.
26993e77605dSQu Wenruo 			 * If so, be happy and use it.
27005a6ac9eaSMiao Xie 			 */
27013e77605dSQu Wenruo 			if (sector->uptodate)
27025a6ac9eaSMiao Xie 				continue;
27035a6ac9eaSMiao Xie 
27043e77605dSQu Wenruo 			ret = rbio_add_io_sector(rbio, &bio_list, sector,
27053e77605dSQu Wenruo 						 stripe, sectornr, rbio->stripe_len,
27063e77605dSQu Wenruo 						 REQ_OP_READ);
27075a6ac9eaSMiao Xie 			if (ret)
27085a6ac9eaSMiao Xie 				goto cleanup;
27095a6ac9eaSMiao Xie 		}
27105a6ac9eaSMiao Xie 	}
27115a6ac9eaSMiao Xie 
27125a6ac9eaSMiao Xie 	bios_to_read = bio_list_size(&bio_list);
27135a6ac9eaSMiao Xie 	if (!bios_to_read) {
27145a6ac9eaSMiao Xie 		/*
27155a6ac9eaSMiao Xie 		 * This can happen if others have merged with us; it means
27165a6ac9eaSMiao Xie 		 * there is nothing left to read.  But if there are missing
27175a6ac9eaSMiao Xie 		 * devices it may not be safe to do the full stripe write
27185a6ac9eaSMiao Xie 		 * yet.
27195a6ac9eaSMiao Xie 		 */
27205a6ac9eaSMiao Xie 		goto finish;
27215a6ac9eaSMiao Xie 	}
27225a6ac9eaSMiao Xie 
27235a6ac9eaSMiao Xie 	/*
27244c664611SQu Wenruo 	 * The bioc may be freed once we submit the last bio.  Make sure not to
27254c664611SQu Wenruo 	 * touch it after that.
27265a6ac9eaSMiao Xie 	 */
27275a6ac9eaSMiao Xie 	atomic_set(&rbio->stripes_pending, bios_to_read);
2728d34e123dSChristoph Hellwig 	INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
2729bf28a605SNikolay Borisov 	while ((bio = bio_list_pop(&bio_list))) {
2730d34e123dSChristoph Hellwig 		bio->bi_end_io = raid56_bio_end_io;
27315a6ac9eaSMiao Xie 
2732b8bea09aSQu Wenruo 		if (trace_raid56_scrub_read_enabled()) {
2733b8bea09aSQu Wenruo 			struct raid56_bio_trace_info trace_info = { 0 };
2734b8bea09aSQu Wenruo 
2735b8bea09aSQu Wenruo 			bio_get_trace_info(rbio, bio, &trace_info);
2736b8bea09aSQu Wenruo 			trace_raid56_scrub_read(rbio, bio, &trace_info);
2737b8bea09aSQu Wenruo 		}
27384e49ea4aSMike Christie 		submit_bio(bio);
27395a6ac9eaSMiao Xie 	}
27405a6ac9eaSMiao Xie 	/* the actual write will happen once the reads are done */
27415a6ac9eaSMiao Xie 	return;
27425a6ac9eaSMiao Xie 
27435a6ac9eaSMiao Xie cleanup:
274458efbc9fSOmar Sandoval 	rbio_orig_end_io(rbio, BLK_STS_IOERR);
2745785884fcSLiu Bo 
2746785884fcSLiu Bo 	while ((bio = bio_list_pop(&bio_list)))
2747785884fcSLiu Bo 		bio_put(bio);
2748785884fcSLiu Bo 
27495a6ac9eaSMiao Xie 	return;
27505a6ac9eaSMiao Xie 
27515a6ac9eaSMiao Xie finish:
27525a6ac9eaSMiao Xie 	validate_rbio_for_parity_scrub(rbio);
27535a6ac9eaSMiao Xie }
27545a6ac9eaSMiao Xie 
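/*
 * Illustrative sketch (assumed wrapper, not part of this file) of the
 * submission pattern used above: stripes_pending must be set to the full bio
 * count before the first submit_bio(), because an end_io handler may run
 * immediately and must observe an accurate in-flight count.
 */
static void scrub_submit_all(struct bio_list *bios, atomic_t *pending,
			     bio_end_io_t *end_io)
{
	struct bio *bio;

	atomic_set(pending, bio_list_size(bios));
	while ((bio = bio_list_pop(bios))) {
		bio->bi_end_io = end_io;
		submit_bio(bio);
	}
}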
2755385de0efSChristoph Hellwig static void scrub_parity_work(struct work_struct *work)
27565a6ac9eaSMiao Xie {
27575a6ac9eaSMiao Xie 	struct btrfs_raid_bio *rbio;
27585a6ac9eaSMiao Xie 
27595a6ac9eaSMiao Xie 	rbio = container_of(work, struct btrfs_raid_bio, work);
27605a6ac9eaSMiao Xie 	raid56_parity_scrub_stripe(rbio);
27615a6ac9eaSMiao Xie }
27625a6ac9eaSMiao Xie 
27635a6ac9eaSMiao Xie void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
27645a6ac9eaSMiao Xie {
27655a6ac9eaSMiao Xie 	if (!lock_stripe_add(rbio))
2766a81b747dSDavid Sterba 		start_async_work(rbio, scrub_parity_work);
27675a6ac9eaSMiao Xie }
2768b4ee1782SOmar Sandoval 
2769b4ee1782SOmar Sandoval /* The following code is used for dev replace of a missing RAID 5/6 device. */
2770b4ee1782SOmar Sandoval 
2771b4ee1782SOmar Sandoval struct btrfs_raid_bio *
27726a258d72SQu Wenruo raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
27736a258d72SQu Wenruo 			  u64 length)
2774b4ee1782SOmar Sandoval {
27756a258d72SQu Wenruo 	struct btrfs_fs_info *fs_info = bioc->fs_info;
2776b4ee1782SOmar Sandoval 	struct btrfs_raid_bio *rbio;
2777b4ee1782SOmar Sandoval 
27784c664611SQu Wenruo 	rbio = alloc_rbio(fs_info, bioc, length);
2779b4ee1782SOmar Sandoval 	if (IS_ERR(rbio))
2780b4ee1782SOmar Sandoval 		return NULL;
2781b4ee1782SOmar Sandoval 
2782b4ee1782SOmar Sandoval 	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2783b4ee1782SOmar Sandoval 	bio_list_add(&rbio->bio_list, bio);
2784b4ee1782SOmar Sandoval 	/*
2785b4ee1782SOmar Sandoval 	 * This is a special bio which is used to hold the completion handler
2786b4ee1782SOmar Sandoval 	 * and to make the scrub rbio look similar to the other types.
2787b4ee1782SOmar Sandoval 	 */
2788b4ee1782SOmar Sandoval 	ASSERT(!bio->bi_iter.bi_size);
2789b4ee1782SOmar Sandoval 
2790b4ee1782SOmar Sandoval 	rbio->faila = find_logical_bio_stripe(rbio, bio);
2791b4ee1782SOmar Sandoval 	if (rbio->faila == -1) {
2792b4ee1782SOmar Sandoval 		BUG();
2793b4ee1782SOmar Sandoval 		kfree(rbio);
2794b4ee1782SOmar Sandoval 		return NULL;
2795b4ee1782SOmar Sandoval 	}
2796b4ee1782SOmar Sandoval 
2797ae6529c3SQu Wenruo 	/*
27984c664611SQu Wenruo 	 * When we get bioc, we have already increased bio_counter, record it
2799ae6529c3SQu Wenruo 	 * so we can free it at rbio_orig_end_io().
2800ae6529c3SQu Wenruo 	 */
2801ae6529c3SQu Wenruo 	rbio->generic_bio_cnt = 1;
2802ae6529c3SQu Wenruo 
2803b4ee1782SOmar Sandoval 	return rbio;
2804b4ee1782SOmar Sandoval }
2805b4ee1782SOmar Sandoval 
2806b4ee1782SOmar Sandoval void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2807b4ee1782SOmar Sandoval {
2808b4ee1782SOmar Sandoval 	if (!lock_stripe_add(rbio))
2809e66d8d5aSDavid Sterba 		start_async_work(rbio, read_rebuild_work);
2810b4ee1782SOmar Sandoval }
2811
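/*
 * Usage sketch (hypothetical caller, not part of this file): a
 * missing-device rebuild is driven by allocating the special rbio and then
 * handing it to the stripe-lock machinery; the caller's bio carries only the
 * completion handler, never data.
 */
static int rebuild_missing_stripe(struct bio *bio,
				  struct btrfs_io_context *bioc, u64 length)
{
	struct btrfs_raid_bio *rbio;

	rbio = raid56_alloc_missing_rbio(bio, bioc, length);
	if (!rbio)
		return -ENOMEM;
	raid56_submit_missing_rbio(rbio);
	return 0;
}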