// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS	11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};

enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};

struct btrfs_raid_bio {
	struct btrfs_io_context *bioc;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged.
	 * The stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	enum btrfs_rbio_ops operation;

	/* Size of each individual stripe on disk */
	u32 stripe_len;

	/* How many pages there are for the full stripe including P/Q */
	u16 nr_pages;

	/* How many sectors there are for the full stripe including P/Q */
	u16 nr_sectors;

	/* Number of data stripes (no p/q) */
	u8 nr_data;

	/* Number of all stripes (including P/Q) */
	u8 real_stripes;

	/* How many pages there are for each stripe */
	u8 stripe_npages;

	/* How many sectors there are for each stripe */
	u8 stripe_nsectors;

	/* First bad stripe, -1 means no corruption */
	s8 faila;

	/* Second bad stripe (for RAID6 use) */
	s8 failb;

	/* Stripe number that we're scrubbing */
	u8 scrubp;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	refcount_t refs;

	atomic_t stripes_pending;

	atomic_t error;
	/*
	 * these are two arrays of pointers.
	 * We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/* Pointers to the sectors in the bio_list, for faster lookup */
	struct sector_ptr *bio_sectors;

	/*
	 * For subpage support, we need to map each sector to above
	 * stripe_pages.
	 */
	struct sector_ptr *stripe_sectors;

	/* Bitmap to record which horizontal stripe has data */
	unsigned long *dbitmap;

	/* allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
	unsigned long *finish_pbitmap;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct btrfs_work *work);

static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
	btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;
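
	/*
	 * A previous caller may already have allocated the table; if two
	 * callers race past this check, the cmpxchg() at the end keeps one
	 * table and frees the loser's copy.
	 */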
	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page)
			continue;

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}
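
/*
 * For example, a full stripe starting at logical address 0x4000000000
 * hashes below as hash_64(0x4000000000 >> 16, BTRFS_STRIPE_HASH_TABLE_BITS),
 * so the mostly-zero low 16 bits no longer feed the hash and stripes
 * spread across all 2048 buckets.
 */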
/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}

/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripes_pages[] got modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !full_page_sectors_uptodate(src, i))
			continue;

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

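	/*
	 * Pop entries off the head of the LRU under cache_lock;
	 * __remove_rbio_from_cache() unlinks each one and drops the
	 * reference the cache held on it.
	 */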
	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}
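
/*
 * With MAX_XOR_BLOCKS == 4, run_xor() below folds e.g. five source
 * pages into the destination in two passes: sources 0-3 first, then
 * source 4.  The destination is pages[src_cnt], the entry just past
 * the last source.
 */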
/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We need to read the full stripe from the drive,
	 * check and repair the parity, and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
		int fa = last->faila;
		int fb = last->failb;
		int cur_fa = cur->faila;
		int cur_fb = cur->failb;

		if (last->faila >= last->failb) {
			fa = last->failb;
			fb = last->faila;
		}

		if (cur->faila >= cur->failb) {
			cur_fa = cur->failb;
			cur_fb = cur->faila;
		}

		if (fa != cur_fa || fb != cur_fb)
			return 0;
	}
	return 1;
}

static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}
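
/*
 * stripe_sectors[] is laid out stripe after stripe: the sectors of
 * data stripe 0 come first, then data stripe 1, and so on, with the
 * P stripe at stripe number nr_data and, on RAID6, the Q stripe at
 * nr_data + 1.
 */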
/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}
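
/*
 * Typical usage: a caller does lock_stripe_add(); on a 0 return it
 * still owns the rbio and submits the IO itself, and once that IO
 * completes, unlock_stripe() below hands the stripe lock to the next
 * rbio on the plug list (or leaves this one cached for stealing).
 */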
/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
84953b381b3SDavid Woodhouse */ 85053b381b3SDavid Woodhouse if (!list_empty(&rbio->plug_list)) { 85153b381b3SDavid Woodhouse struct btrfs_raid_bio *next; 85253b381b3SDavid Woodhouse struct list_head *head = rbio->plug_list.next; 85353b381b3SDavid Woodhouse 85453b381b3SDavid Woodhouse next = list_entry(head, struct btrfs_raid_bio, 85553b381b3SDavid Woodhouse plug_list); 85653b381b3SDavid Woodhouse 85753b381b3SDavid Woodhouse list_del_init(&rbio->plug_list); 85853b381b3SDavid Woodhouse 85953b381b3SDavid Woodhouse list_add(&next->hash_list, &h->hash_list); 860dec95574SElena Reshetova refcount_inc(&next->refs); 86153b381b3SDavid Woodhouse spin_unlock(&rbio->bio_list_lock); 86253b381b3SDavid Woodhouse spin_unlock_irqrestore(&h->lock, flags); 86353b381b3SDavid Woodhouse 8641b94b556SMiao Xie if (next->operation == BTRFS_RBIO_READ_REBUILD) 865e66d8d5aSDavid Sterba start_async_work(next, read_rebuild_work); 866b4ee1782SOmar Sandoval else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { 867b4ee1782SOmar Sandoval steal_rbio(rbio, next); 868e66d8d5aSDavid Sterba start_async_work(next, read_rebuild_work); 869b4ee1782SOmar Sandoval } else if (next->operation == BTRFS_RBIO_WRITE) { 8704ae10b3aSChris Mason steal_rbio(rbio, next); 871cf6a4a75SDavid Sterba start_async_work(next, rmw_work); 8725a6ac9eaSMiao Xie } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 8735a6ac9eaSMiao Xie steal_rbio(rbio, next); 874a81b747dSDavid Sterba start_async_work(next, scrub_parity_work); 8754ae10b3aSChris Mason } 87653b381b3SDavid Woodhouse 87753b381b3SDavid Woodhouse goto done_nolock; 87853b381b3SDavid Woodhouse } 87953b381b3SDavid Woodhouse } 8804ae10b3aSChris Mason done: 88153b381b3SDavid Woodhouse spin_unlock(&rbio->bio_list_lock); 88253b381b3SDavid Woodhouse spin_unlock_irqrestore(&h->lock, flags); 88353b381b3SDavid Woodhouse 88453b381b3SDavid Woodhouse done_nolock: 8854ae10b3aSChris Mason if (!keep_cache) 8864ae10b3aSChris Mason remove_rbio_from_cache(rbio); 88753b381b3SDavid Woodhouse } 88853b381b3SDavid Woodhouse 88953b381b3SDavid Woodhouse static void __free_raid_bio(struct btrfs_raid_bio *rbio) 89053b381b3SDavid Woodhouse { 89153b381b3SDavid Woodhouse int i; 89253b381b3SDavid Woodhouse 893dec95574SElena Reshetova if (!refcount_dec_and_test(&rbio->refs)) 89453b381b3SDavid Woodhouse return; 89553b381b3SDavid Woodhouse 8964ae10b3aSChris Mason WARN_ON(!list_empty(&rbio->stripe_cache)); 89753b381b3SDavid Woodhouse WARN_ON(!list_empty(&rbio->hash_list)); 89853b381b3SDavid Woodhouse WARN_ON(!bio_list_empty(&rbio->bio_list)); 89953b381b3SDavid Woodhouse 90053b381b3SDavid Woodhouse for (i = 0; i < rbio->nr_pages; i++) { 90153b381b3SDavid Woodhouse if (rbio->stripe_pages[i]) { 90253b381b3SDavid Woodhouse __free_page(rbio->stripe_pages[i]); 90353b381b3SDavid Woodhouse rbio->stripe_pages[i] = NULL; 90453b381b3SDavid Woodhouse } 90553b381b3SDavid Woodhouse } 906af8e2d1dSMiao Xie 9074c664611SQu Wenruo btrfs_put_bioc(rbio->bioc); 90853b381b3SDavid Woodhouse kfree(rbio); 90953b381b3SDavid Woodhouse } 91053b381b3SDavid Woodhouse 9117583d8d0SLiu Bo static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) 91253b381b3SDavid Woodhouse { 9137583d8d0SLiu Bo struct bio *next; 9147583d8d0SLiu Bo 9157583d8d0SLiu Bo while (cur) { 9167583d8d0SLiu Bo next = cur->bi_next; 9177583d8d0SLiu Bo cur->bi_next = NULL; 9187583d8d0SLiu Bo cur->bi_status = err; 9197583d8d0SLiu Bo bio_endio(cur); 9207583d8d0SLiu Bo cur = next; 9217583d8d0SLiu Bo } 92253b381b3SDavid Woodhouse } 92353b381b3SDavid Woodhouse 92453b381b3SDavid Woodhouse /* 

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	__free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;
	int max_errors;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = BLK_STS_OK;

	/* OK, we have read all the stripes we need to. */
	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
		     0 : rbio->bioc->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/**
 * Get a sector pointer specified by its @stripe_nr and @sector_nr
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripe)
 * @sector_nr:          Sector number inside the stripe,
 *                      valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock_irq(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock_irq(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock_irq(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc,
					 u32 stripe_len)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
	const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	void *p;

	ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_sectors) * num_sectors +
		       sizeof(*rbio->stripe_sectors) * num_sectors +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
		       sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bioc = bioc;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * The stripe_pages, bio_sectors, etc arrays point to the extra memory
	 * we allocated past the end of the rbio.
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
	CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
#undef CONSUME_ALLOC

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages);
	if (ret < 0)
		return ret;
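
	/*
	 * stripe_sectors[] for the P/Q range still points at the old
	 * (empty) page slots; re-index so it maps to the pages we just
	 * allocated.
	 */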
	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      unsigned long bio_max_len,
			      unsigned int opf)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
	ASSERT(sector->page);

	/* We don't yet support subpage, thus pgoff should always be 0 */
	ASSERT(sector->pgoff == 0);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
			opf, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> 9;
	bio->bi_private = rbio;
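
	/*
	 * The freshly allocated bio was sized for at least one vec, so
	 * adding this first sector cannot fail.
	 */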
11913e77605dSQu Wenruo bio_add_page(bio, sector->page, sectorsize, sector->pgoff); 119253b381b3SDavid Woodhouse bio_list_add(bio_list, bio); 119353b381b3SDavid Woodhouse return 0; 119453b381b3SDavid Woodhouse } 119553b381b3SDavid Woodhouse 119653b381b3SDavid Woodhouse /* 119753b381b3SDavid Woodhouse * while we're doing the read/modify/write cycle, we could 119853b381b3SDavid Woodhouse * have errors in reading pages off the disk. This checks 119953b381b3SDavid Woodhouse * for errors and if we're not able to read the page it'll 120053b381b3SDavid Woodhouse * trigger parity reconstruction. The rmw will be finished 120153b381b3SDavid Woodhouse * after we've reconstructed the failed stripes 120253b381b3SDavid Woodhouse */ 120353b381b3SDavid Woodhouse static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 120453b381b3SDavid Woodhouse { 120553b381b3SDavid Woodhouse if (rbio->faila >= 0 || rbio->failb >= 0) { 12062c8cdd6eSMiao Xie BUG_ON(rbio->faila == rbio->real_stripes - 1); 120753b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 120853b381b3SDavid Woodhouse } else { 120953b381b3SDavid Woodhouse finish_rmw(rbio); 121053b381b3SDavid Woodhouse } 121153b381b3SDavid Woodhouse } 121253b381b3SDavid Woodhouse 121300425dd9SQu Wenruo static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) 121400425dd9SQu Wenruo { 121500425dd9SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 121600425dd9SQu Wenruo struct bio_vec bvec; 121700425dd9SQu Wenruo struct bvec_iter iter; 121800425dd9SQu Wenruo u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 121900425dd9SQu Wenruo rbio->bioc->raid_map[0]; 122000425dd9SQu Wenruo 122100425dd9SQu Wenruo if (bio_flagged(bio, BIO_CLONED)) 122200425dd9SQu Wenruo bio->bi_iter = btrfs_bio(bio)->iter; 122300425dd9SQu Wenruo 122400425dd9SQu Wenruo bio_for_each_segment(bvec, bio, iter) { 122500425dd9SQu Wenruo u32 bvec_offset; 122600425dd9SQu Wenruo 122700425dd9SQu Wenruo for (bvec_offset = 0; bvec_offset < bvec.bv_len; 122800425dd9SQu Wenruo bvec_offset += sectorsize, offset += sectorsize) { 122900425dd9SQu Wenruo int index = offset / sectorsize; 123000425dd9SQu Wenruo struct sector_ptr *sector = &rbio->bio_sectors[index]; 123100425dd9SQu Wenruo 123200425dd9SQu Wenruo sector->page = bvec.bv_page; 123300425dd9SQu Wenruo sector->pgoff = bvec.bv_offset + bvec_offset; 123400425dd9SQu Wenruo ASSERT(sector->pgoff < PAGE_SIZE); 123500425dd9SQu Wenruo } 123600425dd9SQu Wenruo } 123700425dd9SQu Wenruo } 123800425dd9SQu Wenruo 123953b381b3SDavid Woodhouse /* 124053b381b3SDavid Woodhouse * helper function to walk our bio list and populate the bio_pages array with 124153b381b3SDavid Woodhouse * the result. This seems expensive, but it is faster than constantly 124253b381b3SDavid Woodhouse * searching through the bio list as we setup the IO in finish_rmw or stripe 124353b381b3SDavid Woodhouse * reconstruction. 
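 *
 * The mapping is pure arithmetic: a bio's sectors are indexed by their
 * byte offset from the logical start of the full stripe
 * (bioc->raid_map[0]). With a 4K sector size, for example, a bio that
 * starts 64K into the full stripe fills bio_sectors[16] onward, one
 * entry per sector.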
124453b381b3SDavid Woodhouse * 124553b381b3SDavid Woodhouse * This must be called before you trust the answers from page_in_rbio 124653b381b3SDavid Woodhouse */ 124753b381b3SDavid Woodhouse static void index_rbio_pages(struct btrfs_raid_bio *rbio) 124853b381b3SDavid Woodhouse { 124953b381b3SDavid Woodhouse struct bio *bio; 125053b381b3SDavid Woodhouse 125153b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 125200425dd9SQu Wenruo bio_list_for_each(bio, &rbio->bio_list) 125300425dd9SQu Wenruo index_one_bio(rbio, bio); 125400425dd9SQu Wenruo 125553b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 125653b381b3SDavid Woodhouse } 125753b381b3SDavid Woodhouse 125853b381b3SDavid Woodhouse /* 125953b381b3SDavid Woodhouse * this is called from one of two situations. We either 126053b381b3SDavid Woodhouse * have a full stripe from the higher layers, or we've read all 126153b381b3SDavid Woodhouse * the missing bits off disk. 126253b381b3SDavid Woodhouse * 126353b381b3SDavid Woodhouse * This will calculate the parity and then send down any 126453b381b3SDavid Woodhouse * changed blocks. 126553b381b3SDavid Woodhouse */ 126653b381b3SDavid Woodhouse static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 126753b381b3SDavid Woodhouse { 12684c664611SQu Wenruo struct btrfs_io_context *bioc = rbio->bioc; 12691145059aSQu Wenruo const u32 sectorsize = bioc->fs_info->sectorsize; 12701389053eSKees Cook void **pointers = rbio->finish_pointers; 127153b381b3SDavid Woodhouse int nr_data = rbio->nr_data; 127253b381b3SDavid Woodhouse int stripe; 12733e77605dSQu Wenruo int sectornr; 1274c17af965SDavid Sterba bool has_qstripe; 127553b381b3SDavid Woodhouse struct bio_list bio_list; 127653b381b3SDavid Woodhouse struct bio *bio; 127753b381b3SDavid Woodhouse int ret; 127853b381b3SDavid Woodhouse 127953b381b3SDavid Woodhouse bio_list_init(&bio_list); 128053b381b3SDavid Woodhouse 1281c17af965SDavid Sterba if (rbio->real_stripes - rbio->nr_data == 1) 1282c17af965SDavid Sterba has_qstripe = false; 1283c17af965SDavid Sterba else if (rbio->real_stripes - rbio->nr_data == 2) 1284c17af965SDavid Sterba has_qstripe = true; 1285c17af965SDavid Sterba else 128653b381b3SDavid Woodhouse BUG(); 128753b381b3SDavid Woodhouse 128853b381b3SDavid Woodhouse /* at this point we either have a full stripe, 128953b381b3SDavid Woodhouse * or we've read the full stripe from the drive. 129053b381b3SDavid Woodhouse * recalculate the parity and write the new results. 129153b381b3SDavid Woodhouse * 129253b381b3SDavid Woodhouse * We're not allowed to add any new bios to the 129353b381b3SDavid Woodhouse * bio list here, anyone else that wants to 129453b381b3SDavid Woodhouse * change this stripe needs to do their own rmw. 129553b381b3SDavid Woodhouse */ 129653b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 129753b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 129853b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 129953b381b3SDavid Woodhouse 1300b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 130153b381b3SDavid Woodhouse 130253b381b3SDavid Woodhouse /* 130353b381b3SDavid Woodhouse * now that we've set rmw_locked, run through the 130453b381b3SDavid Woodhouse * bio list one last time and map the page pointers 13054ae10b3aSChris Mason * 13064ae10b3aSChris Mason * We don't cache full rbios because we're assuming 13074ae10b3aSChris Mason * the higher layers are unlikely to use this area of 13084ae10b3aSChris Mason * the disk again soon. 
If they do use it again, 13094ae10b3aSChris Mason * hopefully they will send another full bio. 131053b381b3SDavid Woodhouse */ 131153b381b3SDavid Woodhouse index_rbio_pages(rbio); 13124ae10b3aSChris Mason if (!rbio_is_full(rbio)) 13134ae10b3aSChris Mason cache_rbio_pages(rbio); 13144ae10b3aSChris Mason else 13154ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 131653b381b3SDavid Woodhouse 13173e77605dSQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 13181145059aSQu Wenruo struct sector_ptr *sector; 13191145059aSQu Wenruo 13201145059aSQu Wenruo /* First collect one sector from each data stripe */ 132153b381b3SDavid Woodhouse for (stripe = 0; stripe < nr_data; stripe++) { 13221145059aSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 0); 13231145059aSQu Wenruo pointers[stripe] = kmap_local_page(sector->page) + 13241145059aSQu Wenruo sector->pgoff; 132553b381b3SDavid Woodhouse } 132653b381b3SDavid Woodhouse 13271145059aSQu Wenruo /* Then add the parity stripe */ 13281145059aSQu Wenruo sector = rbio_pstripe_sector(rbio, sectornr); 13291145059aSQu Wenruo sector->uptodate = 1; 13301145059aSQu Wenruo pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; 133153b381b3SDavid Woodhouse 1332c17af965SDavid Sterba if (has_qstripe) { 133353b381b3SDavid Woodhouse /* 13341145059aSQu Wenruo * RAID6, add the qstripe and call the library function 13351145059aSQu Wenruo * to fill in our p/q 133653b381b3SDavid Woodhouse */ 13371145059aSQu Wenruo sector = rbio_qstripe_sector(rbio, sectornr); 13381145059aSQu Wenruo sector->uptodate = 1; 13391145059aSQu Wenruo pointers[stripe++] = kmap_local_page(sector->page) + 13401145059aSQu Wenruo sector->pgoff; 134153b381b3SDavid Woodhouse 13421145059aSQu Wenruo raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 134353b381b3SDavid Woodhouse pointers); 134453b381b3SDavid Woodhouse } else { 134553b381b3SDavid Woodhouse /* raid5 */ 13461145059aSQu Wenruo memcpy(pointers[nr_data], pointers[0], sectorsize); 13471145059aSQu Wenruo run_xor(pointers + 1, nr_data - 1, sectorsize); 134853b381b3SDavid Woodhouse } 134994a0b58dSIra Weiny for (stripe = stripe - 1; stripe >= 0; stripe--) 135094a0b58dSIra Weiny kunmap_local(pointers[stripe]); 135153b381b3SDavid Woodhouse } 135253b381b3SDavid Woodhouse 135353b381b3SDavid Woodhouse /* 135453b381b3SDavid Woodhouse * time to start writing. Make bios for everything from the 135553b381b3SDavid Woodhouse * higher layers (the bio_list in our rbio) and our p/q. Ignore 135653b381b3SDavid Woodhouse * everything else. 
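 *
 * Only data sectors that actually arrived in the bio_list are queued
 * (sector_in_rbio() returns NULL for the rest), while parity sectors
 * are always queued. On a 3-disk raid5 full stripe write that means one
 * rbio_add_io_sector() call per data sector plus one per freshly
 * computed P sector, and the per-device bios are submitted in parallel
 * below.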
135753b381b3SDavid Woodhouse */ 13582c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 13593e77605dSQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 13603e77605dSQu Wenruo struct sector_ptr *sector; 13613e77605dSQu Wenruo 136253b381b3SDavid Woodhouse if (stripe < rbio->nr_data) { 13633e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 13643e77605dSQu Wenruo if (!sector) 136553b381b3SDavid Woodhouse continue; 136653b381b3SDavid Woodhouse } else { 13673e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 136853b381b3SDavid Woodhouse } 136953b381b3SDavid Woodhouse 13703e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 13713e77605dSQu Wenruo sectornr, rbio->stripe_len, 1372e01bf588SChristoph Hellwig REQ_OP_WRITE); 137353b381b3SDavid Woodhouse if (ret) 137453b381b3SDavid Woodhouse goto cleanup; 137553b381b3SDavid Woodhouse } 137653b381b3SDavid Woodhouse } 137753b381b3SDavid Woodhouse 13784c664611SQu Wenruo if (likely(!bioc->num_tgtdevs)) 13792c8cdd6eSMiao Xie goto write_data; 13802c8cdd6eSMiao Xie 13812c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 13824c664611SQu Wenruo if (!bioc->tgtdev_map[stripe]) 13832c8cdd6eSMiao Xie continue; 13842c8cdd6eSMiao Xie 13853e77605dSQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 13863e77605dSQu Wenruo struct sector_ptr *sector; 13873e77605dSQu Wenruo 13882c8cdd6eSMiao Xie if (stripe < rbio->nr_data) { 13893e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 13903e77605dSQu Wenruo if (!sector) 13912c8cdd6eSMiao Xie continue; 13922c8cdd6eSMiao Xie } else { 13933e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 13942c8cdd6eSMiao Xie } 13952c8cdd6eSMiao Xie 13963e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 13974c664611SQu Wenruo rbio->bioc->tgtdev_map[stripe], 13983e77605dSQu Wenruo sectornr, rbio->stripe_len, 1399e01bf588SChristoph Hellwig REQ_OP_WRITE); 14002c8cdd6eSMiao Xie if (ret) 14012c8cdd6eSMiao Xie goto cleanup; 14022c8cdd6eSMiao Xie } 14032c8cdd6eSMiao Xie } 14042c8cdd6eSMiao Xie 14052c8cdd6eSMiao Xie write_data: 1406b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1407b89e1b01SMiao Xie BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 140853b381b3SDavid Woodhouse 1409bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 141053b381b3SDavid Woodhouse bio->bi_end_io = raid_write_end_io; 14114e49ea4aSMike Christie 14124e49ea4aSMike Christie submit_bio(bio); 141353b381b3SDavid Woodhouse } 141453b381b3SDavid Woodhouse return; 141553b381b3SDavid Woodhouse 141653b381b3SDavid Woodhouse cleanup: 141758efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 1418785884fcSLiu Bo 1419785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 1420785884fcSLiu Bo bio_put(bio); 142153b381b3SDavid Woodhouse } 142253b381b3SDavid Woodhouse 142353b381b3SDavid Woodhouse /* 142453b381b3SDavid Woodhouse * helper to find the stripe number for a given bio. Used to figure out which 142553b381b3SDavid Woodhouse * stripe has failed. This expects the bio to correspond to a physical disk, 142653b381b3SDavid Woodhouse * so it looks up based on physical sector numbers. 
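 *
 * A bio that failed at physical byte P matches stripe i when stripes[i]
 * lives on the same bdev and P falls inside
 * [stripe->physical, stripe->physical + stripe_len).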
142753b381b3SDavid Woodhouse */ 142853b381b3SDavid Woodhouse static int find_bio_stripe(struct btrfs_raid_bio *rbio, 142953b381b3SDavid Woodhouse struct bio *bio) 143053b381b3SDavid Woodhouse { 14314f024f37SKent Overstreet u64 physical = bio->bi_iter.bi_sector; 143253b381b3SDavid Woodhouse int i; 14334c664611SQu Wenruo struct btrfs_io_stripe *stripe; 143453b381b3SDavid Woodhouse 143553b381b3SDavid Woodhouse physical <<= 9; 143653b381b3SDavid Woodhouse 14374c664611SQu Wenruo for (i = 0; i < rbio->bioc->num_stripes; i++) { 14384c664611SQu Wenruo stripe = &rbio->bioc->stripes[i]; 143983025863SNikolay Borisov if (in_range(physical, stripe->physical, rbio->stripe_len) && 1440309dca30SChristoph Hellwig stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { 144153b381b3SDavid Woodhouse return i; 144253b381b3SDavid Woodhouse } 144353b381b3SDavid Woodhouse } 144453b381b3SDavid Woodhouse return -1; 144553b381b3SDavid Woodhouse } 144653b381b3SDavid Woodhouse 144753b381b3SDavid Woodhouse /* 144853b381b3SDavid Woodhouse * helper to find the stripe number for a given 144953b381b3SDavid Woodhouse * bio (before mapping). Used to figure out which stripe has 145053b381b3SDavid Woodhouse * failed. This looks up based on logical block numbers. 145153b381b3SDavid Woodhouse */ 145253b381b3SDavid Woodhouse static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 145353b381b3SDavid Woodhouse struct bio *bio) 145453b381b3SDavid Woodhouse { 14551201b58bSDavid Sterba u64 logical = bio->bi_iter.bi_sector << 9; 145653b381b3SDavid Woodhouse int i; 145753b381b3SDavid Woodhouse 145853b381b3SDavid Woodhouse for (i = 0; i < rbio->nr_data; i++) { 14594c664611SQu Wenruo u64 stripe_start = rbio->bioc->raid_map[i]; 146083025863SNikolay Borisov 146183025863SNikolay Borisov if (in_range(logical, stripe_start, rbio->stripe_len)) 146253b381b3SDavid Woodhouse return i; 146353b381b3SDavid Woodhouse } 146453b381b3SDavid Woodhouse return -1; 146553b381b3SDavid Woodhouse } 146653b381b3SDavid Woodhouse 146753b381b3SDavid Woodhouse /* 146853b381b3SDavid Woodhouse * returns -EIO if we had too many failures 146953b381b3SDavid Woodhouse */ 147053b381b3SDavid Woodhouse static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 147153b381b3SDavid Woodhouse { 147253b381b3SDavid Woodhouse unsigned long flags; 147353b381b3SDavid Woodhouse int ret = 0; 147453b381b3SDavid Woodhouse 147553b381b3SDavid Woodhouse spin_lock_irqsave(&rbio->bio_list_lock, flags); 147653b381b3SDavid Woodhouse 147753b381b3SDavid Woodhouse /* we already know this stripe is bad, move on */ 147853b381b3SDavid Woodhouse if (rbio->faila == failed || rbio->failb == failed) 147953b381b3SDavid Woodhouse goto out; 148053b381b3SDavid Woodhouse 148153b381b3SDavid Woodhouse if (rbio->faila == -1) { 148253b381b3SDavid Woodhouse /* first failure on this rbio */ 148353b381b3SDavid Woodhouse rbio->faila = failed; 1484b89e1b01SMiao Xie atomic_inc(&rbio->error); 148553b381b3SDavid Woodhouse } else if (rbio->failb == -1) { 148653b381b3SDavid Woodhouse /* second failure on this rbio */ 148753b381b3SDavid Woodhouse rbio->failb = failed; 1488b89e1b01SMiao Xie atomic_inc(&rbio->error); 148953b381b3SDavid Woodhouse } else { 149053b381b3SDavid Woodhouse ret = -EIO; 149153b381b3SDavid Woodhouse } 149253b381b3SDavid Woodhouse out: 149353b381b3SDavid Woodhouse spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 149453b381b3SDavid Woodhouse 149553b381b3SDavid Woodhouse return ret; 149653b381b3SDavid Woodhouse } 149753b381b3SDavid Woodhouse 149853b381b3SDavid Woodhouse /* 
149953b381b3SDavid Woodhouse * helper to fail a stripe based on a physical disk 150053b381b3SDavid Woodhouse * bio. 150153b381b3SDavid Woodhouse */ 150253b381b3SDavid Woodhouse static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 150353b381b3SDavid Woodhouse struct bio *bio) 150453b381b3SDavid Woodhouse { 150553b381b3SDavid Woodhouse int failed = find_bio_stripe(rbio, bio); 150653b381b3SDavid Woodhouse 150753b381b3SDavid Woodhouse if (failed < 0) 150853b381b3SDavid Woodhouse return -EIO; 150953b381b3SDavid Woodhouse 151053b381b3SDavid Woodhouse return fail_rbio_index(rbio, failed); 151153b381b3SDavid Woodhouse } 151253b381b3SDavid Woodhouse 151353b381b3SDavid Woodhouse /* 15145fdb7afcSQu Wenruo * For subpage case, we can no longer set page Uptodate directly for 15155fdb7afcSQu Wenruo * stripe_pages[], thus we need to locate the sector. 15165fdb7afcSQu Wenruo */ 15175fdb7afcSQu Wenruo static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, 15185fdb7afcSQu Wenruo struct page *page, 15195fdb7afcSQu Wenruo unsigned int pgoff) 15205fdb7afcSQu Wenruo { 15215fdb7afcSQu Wenruo int i; 15225fdb7afcSQu Wenruo 15235fdb7afcSQu Wenruo for (i = 0; i < rbio->nr_sectors; i++) { 15245fdb7afcSQu Wenruo struct sector_ptr *sector = &rbio->stripe_sectors[i]; 15255fdb7afcSQu Wenruo 15265fdb7afcSQu Wenruo if (sector->page == page && sector->pgoff == pgoff) 15275fdb7afcSQu Wenruo return sector; 15285fdb7afcSQu Wenruo } 15295fdb7afcSQu Wenruo return NULL; 15305fdb7afcSQu Wenruo } 15315fdb7afcSQu Wenruo 15325fdb7afcSQu Wenruo /* 153353b381b3SDavid Woodhouse * this sets each page in the bio uptodate. It should only be used on private 153453b381b3SDavid Woodhouse * rbio pages, nothing that comes in from the higher layers 153553b381b3SDavid Woodhouse */ 15365fdb7afcSQu Wenruo static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) 153753b381b3SDavid Woodhouse { 15385fdb7afcSQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 15390198e5b7SLiu Bo struct bio_vec *bvec; 15406dc4f100SMing Lei struct bvec_iter_all iter_all; 154153b381b3SDavid Woodhouse 15420198e5b7SLiu Bo ASSERT(!bio_flagged(bio, BIO_CLONED)); 15436592e58cSFilipe Manana 15445fdb7afcSQu Wenruo bio_for_each_segment_all(bvec, bio, iter_all) { 15455fdb7afcSQu Wenruo struct sector_ptr *sector; 15465fdb7afcSQu Wenruo int pgoff; 15475fdb7afcSQu Wenruo 15485fdb7afcSQu Wenruo for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; 15495fdb7afcSQu Wenruo pgoff += sectorsize) { 15505fdb7afcSQu Wenruo sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); 15515fdb7afcSQu Wenruo ASSERT(sector); 15525fdb7afcSQu Wenruo if (sector) 15535fdb7afcSQu Wenruo sector->uptodate = 1; 15545fdb7afcSQu Wenruo } 15555fdb7afcSQu Wenruo } 155653b381b3SDavid Woodhouse } 155753b381b3SDavid Woodhouse 155853b381b3SDavid Woodhouse /* 155953b381b3SDavid Woodhouse * end io for the read phase of the rmw cycle. All the bios here are physical 156053b381b3SDavid Woodhouse * stripe bios we've read from the disk so we can recalculate the parity of the 156153b381b3SDavid Woodhouse * stripe. 
156253b381b3SDavid Woodhouse * 156353b381b3SDavid Woodhouse * This will usually kick off finish_rmw once all the bios are read in, but it 156453b381b3SDavid Woodhouse * may trigger parity reconstruction if we had any errors along the way 156553b381b3SDavid Woodhouse */ 15664246a0b6SChristoph Hellwig static void raid_rmw_end_io(struct bio *bio) 156753b381b3SDavid Woodhouse { 156853b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 156953b381b3SDavid Woodhouse 15704e4cbee9SChristoph Hellwig if (bio->bi_status) 157153b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 157253b381b3SDavid Woodhouse else 15735fdb7afcSQu Wenruo set_bio_pages_uptodate(rbio, bio); 157453b381b3SDavid Woodhouse 157553b381b3SDavid Woodhouse bio_put(bio); 157653b381b3SDavid Woodhouse 1577b89e1b01SMiao Xie if (!atomic_dec_and_test(&rbio->stripes_pending)) 157853b381b3SDavid Woodhouse return; 157953b381b3SDavid Woodhouse 15804c664611SQu Wenruo if (atomic_read(&rbio->error) > rbio->bioc->max_errors) 158153b381b3SDavid Woodhouse goto cleanup; 158253b381b3SDavid Woodhouse 158353b381b3SDavid Woodhouse /* 158453b381b3SDavid Woodhouse * this will normally call finish_rmw to start our write 158553b381b3SDavid Woodhouse * but if there are any failed stripes we'll reconstruct 158653b381b3SDavid Woodhouse * from parity first 158753b381b3SDavid Woodhouse */ 158853b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 158953b381b3SDavid Woodhouse return; 159053b381b3SDavid Woodhouse 159153b381b3SDavid Woodhouse cleanup: 159253b381b3SDavid Woodhouse 159358efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 159453b381b3SDavid Woodhouse } 159553b381b3SDavid Woodhouse 159653b381b3SDavid Woodhouse /* 159753b381b3SDavid Woodhouse * the stripe must be locked by the caller. It will 159853b381b3SDavid Woodhouse * unlock after all the writes are done 159953b381b3SDavid Woodhouse */ 160053b381b3SDavid Woodhouse static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 160153b381b3SDavid Woodhouse { 160253b381b3SDavid Woodhouse int bios_to_read = 0; 160353b381b3SDavid Woodhouse struct bio_list bio_list; 160453b381b3SDavid Woodhouse int ret; 16053e77605dSQu Wenruo int sectornr; 160653b381b3SDavid Woodhouse int stripe; 160753b381b3SDavid Woodhouse struct bio *bio; 160853b381b3SDavid Woodhouse 160953b381b3SDavid Woodhouse bio_list_init(&bio_list); 161053b381b3SDavid Woodhouse 161153b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 161253b381b3SDavid Woodhouse if (ret) 161353b381b3SDavid Woodhouse goto cleanup; 161453b381b3SDavid Woodhouse 161553b381b3SDavid Woodhouse index_rbio_pages(rbio); 161653b381b3SDavid Woodhouse 1617b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 161853b381b3SDavid Woodhouse /* 161953b381b3SDavid Woodhouse * build a list of bios to read all the missing parts of this 162053b381b3SDavid Woodhouse * stripe 162153b381b3SDavid Woodhouse */ 162253b381b3SDavid Woodhouse for (stripe = 0; stripe < rbio->nr_data; stripe++) { 16233e77605dSQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 16243e77605dSQu Wenruo struct sector_ptr *sector; 16253e77605dSQu Wenruo 162653b381b3SDavid Woodhouse /* 16273e77605dSQu Wenruo * We want to find all the sectors missing from the 16283e77605dSQu Wenruo * rbio and read them from the disk. If sector_in_rbio() 16293e77605dSQu Wenruo * finds a page in the bio list we don't need to read 16303e77605dSQu Wenruo * it off the stripe.
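 *
 * For example, a lone 4K write into a 3-disk raid5 stripe carries a
 * single data sector in its bio_list, so this loop queues reads for
 * every data sector it did not supply before the new parity can be
 * computed.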
163153b381b3SDavid Woodhouse */ 16323e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 16333e77605dSQu Wenruo if (sector) 163453b381b3SDavid Woodhouse continue; 163553b381b3SDavid Woodhouse 16363e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 16374ae10b3aSChris Mason /* 16383e77605dSQu Wenruo * The bio cache may have handed us an uptodate page. 16393e77605dSQu Wenruo * If so, be happy and use it. 16404ae10b3aSChris Mason */ 16413e77605dSQu Wenruo if (sector->uptodate) 16424ae10b3aSChris Mason continue; 16434ae10b3aSChris Mason 16443e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 16453e77605dSQu Wenruo stripe, sectornr, rbio->stripe_len, 1646e01bf588SChristoph Hellwig REQ_OP_READ); 164753b381b3SDavid Woodhouse if (ret) 164853b381b3SDavid Woodhouse goto cleanup; 164953b381b3SDavid Woodhouse } 165053b381b3SDavid Woodhouse } 165153b381b3SDavid Woodhouse 165253b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 165353b381b3SDavid Woodhouse if (!bios_to_read) { 165453b381b3SDavid Woodhouse /* 165553b381b3SDavid Woodhouse * this can happen if others have merged with 165653b381b3SDavid Woodhouse * us, it means there is nothing left to read. 165753b381b3SDavid Woodhouse * But if there are missing devices it may not be 165853b381b3SDavid Woodhouse * safe to do the full stripe write yet. 165953b381b3SDavid Woodhouse */ 166053b381b3SDavid Woodhouse goto finish; 166153b381b3SDavid Woodhouse } 166253b381b3SDavid Woodhouse 166353b381b3SDavid Woodhouse /* 16644c664611SQu Wenruo * The bioc may be freed once we submit the last bio. Make sure not to 16654c664611SQu Wenruo * touch it after that. 166653b381b3SDavid Woodhouse */ 1667b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 1668bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 166953b381b3SDavid Woodhouse bio->bi_end_io = raid_rmw_end_io; 167053b381b3SDavid Woodhouse 16716a258d72SQu Wenruo btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 167253b381b3SDavid Woodhouse 16734e49ea4aSMike Christie submit_bio(bio); 167453b381b3SDavid Woodhouse } 167553b381b3SDavid Woodhouse /* the actual write will happen once the reads are done */ 167653b381b3SDavid Woodhouse return 0; 167753b381b3SDavid Woodhouse 167853b381b3SDavid Woodhouse cleanup: 167958efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 1680785884fcSLiu Bo 1681785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 1682785884fcSLiu Bo bio_put(bio); 1683785884fcSLiu Bo 168453b381b3SDavid Woodhouse return -EIO; 168553b381b3SDavid Woodhouse 168653b381b3SDavid Woodhouse finish: 168753b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 168853b381b3SDavid Woodhouse return 0; 168953b381b3SDavid Woodhouse } 169053b381b3SDavid Woodhouse 169153b381b3SDavid Woodhouse /* 169253b381b3SDavid Woodhouse * if the upper layers pass in a full stripe, we thank them by only allocating 169353b381b3SDavid Woodhouse * enough pages to hold the parity, and sending it all down quickly. 
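 *
 * With every data sector already in the bio_list there is nothing to
 * read back: alloc_rbio_parity_pages() allocates pages for P (and Q on
 * raid6) only, and finish_rmw() runs as soon as the stripe lock is
 * acquired.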
169453b381b3SDavid Woodhouse */ 169553b381b3SDavid Woodhouse static int full_stripe_write(struct btrfs_raid_bio *rbio) 169653b381b3SDavid Woodhouse { 169753b381b3SDavid Woodhouse int ret; 169853b381b3SDavid Woodhouse 169953b381b3SDavid Woodhouse ret = alloc_rbio_parity_pages(rbio); 17003cd846d1SMiao Xie if (ret) { 17013cd846d1SMiao Xie __free_raid_bio(rbio); 170253b381b3SDavid Woodhouse return ret; 17033cd846d1SMiao Xie } 170453b381b3SDavid Woodhouse 170553b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 170653b381b3SDavid Woodhouse if (ret == 0) 170753b381b3SDavid Woodhouse finish_rmw(rbio); 170853b381b3SDavid Woodhouse return 0; 170953b381b3SDavid Woodhouse } 171053b381b3SDavid Woodhouse 171153b381b3SDavid Woodhouse /* 171253b381b3SDavid Woodhouse * partial stripe writes get handed over to async helpers. 171353b381b3SDavid Woodhouse * We're really hoping to merge a few more writes into this 171453b381b3SDavid Woodhouse * rbio before calculating new parity 171553b381b3SDavid Woodhouse */ 171653b381b3SDavid Woodhouse static int partial_stripe_write(struct btrfs_raid_bio *rbio) 171753b381b3SDavid Woodhouse { 171853b381b3SDavid Woodhouse int ret; 171953b381b3SDavid Woodhouse 172053b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 172153b381b3SDavid Woodhouse if (ret == 0) 1722cf6a4a75SDavid Sterba start_async_work(rbio, rmw_work); 172353b381b3SDavid Woodhouse return 0; 172453b381b3SDavid Woodhouse } 172553b381b3SDavid Woodhouse 172653b381b3SDavid Woodhouse /* 172753b381b3SDavid Woodhouse * sometimes while we were reading from the drive to 172853b381b3SDavid Woodhouse * recalculate parity, enough new bios come in to create 172953b381b3SDavid Woodhouse * a full stripe. So we do a check here to see if we can 173053b381b3SDavid Woodhouse * go directly to finish_rmw 173153b381b3SDavid Woodhouse */ 173253b381b3SDavid Woodhouse static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 173353b381b3SDavid Woodhouse { 173453b381b3SDavid Woodhouse /* head off into rmw land if we don't have a full stripe */ 173553b381b3SDavid Woodhouse if (!rbio_is_full(rbio)) 173653b381b3SDavid Woodhouse return partial_stripe_write(rbio); 173753b381b3SDavid Woodhouse return full_stripe_write(rbio); 173853b381b3SDavid Woodhouse } 173953b381b3SDavid Woodhouse 174053b381b3SDavid Woodhouse /* 17416ac0f488SChris Mason * We use plugging callbacks to collect full stripes. 17426ac0f488SChris Mason * Any time we get a partial stripe write while plugged 17436ac0f488SChris Mason * we collect it into a list. When the unplug comes down, 17446ac0f488SChris Mason * we sort the list by logical block number and merge 17456ac0f488SChris Mason * everything we can into the same rbios 17466ac0f488SChris Mason */ 17476ac0f488SChris Mason struct btrfs_plug_cb { 17486ac0f488SChris Mason struct blk_plug_cb cb; 17496ac0f488SChris Mason struct btrfs_fs_info *info; 17506ac0f488SChris Mason struct list_head rbio_list; 17516ac0f488SChris Mason struct btrfs_work work; 17526ac0f488SChris Mason }; 17536ac0f488SChris Mason 17546ac0f488SChris Mason /* 17556ac0f488SChris Mason * rbios on the plug list are sorted for easier merging.
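 *
 * Sorting on the starting sector of each rbio's first bio makes
 * mergeable neighbours adjacent, so run_plug() only has to compare each
 * rbio against the one before it.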
17566ac0f488SChris Mason */ 17574f0f586bSSami Tolvanen static int plug_cmp(void *priv, const struct list_head *a, 17584f0f586bSSami Tolvanen const struct list_head *b) 17596ac0f488SChris Mason { 1760214cc184SDavid Sterba const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 17616ac0f488SChris Mason plug_list); 1762214cc184SDavid Sterba const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 17636ac0f488SChris Mason plug_list); 17644f024f37SKent Overstreet u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 17654f024f37SKent Overstreet u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 17666ac0f488SChris Mason 17676ac0f488SChris Mason if (a_sector < b_sector) 17686ac0f488SChris Mason return -1; 17696ac0f488SChris Mason if (a_sector > b_sector) 17706ac0f488SChris Mason return 1; 17716ac0f488SChris Mason return 0; 17726ac0f488SChris Mason } 17736ac0f488SChris Mason 17746ac0f488SChris Mason static void run_plug(struct btrfs_plug_cb *plug) 17756ac0f488SChris Mason { 17766ac0f488SChris Mason struct btrfs_raid_bio *cur; 17776ac0f488SChris Mason struct btrfs_raid_bio *last = NULL; 17786ac0f488SChris Mason 17796ac0f488SChris Mason /* 17806ac0f488SChris Mason * sort our plug list then try to merge 17816ac0f488SChris Mason * everything we can in hopes of creating full 17826ac0f488SChris Mason * stripes. 17836ac0f488SChris Mason */ 17846ac0f488SChris Mason list_sort(NULL, &plug->rbio_list, plug_cmp); 17856ac0f488SChris Mason while (!list_empty(&plug->rbio_list)) { 17866ac0f488SChris Mason cur = list_entry(plug->rbio_list.next, 17876ac0f488SChris Mason struct btrfs_raid_bio, plug_list); 17886ac0f488SChris Mason list_del_init(&cur->plug_list); 17896ac0f488SChris Mason 17906ac0f488SChris Mason if (rbio_is_full(cur)) { 1791c7b562c5SDavid Sterba int ret; 1792c7b562c5SDavid Sterba 17936ac0f488SChris Mason /* we have a full stripe, send it down */ 1794c7b562c5SDavid Sterba ret = full_stripe_write(cur); 1795c7b562c5SDavid Sterba BUG_ON(ret); 17966ac0f488SChris Mason continue; 17976ac0f488SChris Mason } 17986ac0f488SChris Mason if (last) { 17996ac0f488SChris Mason if (rbio_can_merge(last, cur)) { 18006ac0f488SChris Mason merge_rbio(last, cur); 18016ac0f488SChris Mason __free_raid_bio(cur); 18026ac0f488SChris Mason continue; 18036ac0f488SChris Mason 18046ac0f488SChris Mason } 18056ac0f488SChris Mason __raid56_parity_write(last); 18066ac0f488SChris Mason } 18076ac0f488SChris Mason last = cur; 18086ac0f488SChris Mason } 18096ac0f488SChris Mason if (last) { 18106ac0f488SChris Mason __raid56_parity_write(last); 18116ac0f488SChris Mason } 18126ac0f488SChris Mason kfree(plug); 18136ac0f488SChris Mason } 18146ac0f488SChris Mason 18156ac0f488SChris Mason /* 18166ac0f488SChris Mason * if the unplug comes from schedule, we have to push the 18176ac0f488SChris Mason * work off to a helper thread 18186ac0f488SChris Mason */ 18196ac0f488SChris Mason static void unplug_work(struct btrfs_work *work) 18206ac0f488SChris Mason { 18216ac0f488SChris Mason struct btrfs_plug_cb *plug; 18226ac0f488SChris Mason plug = container_of(work, struct btrfs_plug_cb, work); 18236ac0f488SChris Mason run_plug(plug); 18246ac0f488SChris Mason } 18256ac0f488SChris Mason 18266ac0f488SChris Mason static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 18276ac0f488SChris Mason { 18286ac0f488SChris Mason struct btrfs_plug_cb *plug; 18296ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 18306ac0f488SChris Mason 18316ac0f488SChris Mason if (from_schedule) { 1832a0cac0ecSOmar 
Sandoval btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1833d05a33acSQu Wenruo btrfs_queue_work(plug->info->rmw_workers, 18346ac0f488SChris Mason &plug->work); 18356ac0f488SChris Mason return; 18366ac0f488SChris Mason } 18376ac0f488SChris Mason run_plug(plug); 18386ac0f488SChris Mason } 18396ac0f488SChris Mason 18406ac0f488SChris Mason /* 184153b381b3SDavid Woodhouse * our main entry point for writes from the rest of the FS. 184253b381b3SDavid Woodhouse */ 1843cc353a8bSQu Wenruo int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) 184453b381b3SDavid Woodhouse { 18456a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 184653b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 18476ac0f488SChris Mason struct btrfs_plug_cb *plug = NULL; 18486ac0f488SChris Mason struct blk_plug_cb *cb; 18494245215dSMiao Xie int ret; 185053b381b3SDavid Woodhouse 18514c664611SQu Wenruo rbio = alloc_rbio(fs_info, bioc, stripe_len); 1852af8e2d1dSMiao Xie if (IS_ERR(rbio)) { 18534c664611SQu Wenruo btrfs_put_bioc(bioc); 185453b381b3SDavid Woodhouse return PTR_ERR(rbio); 1855af8e2d1dSMiao Xie } 185653b381b3SDavid Woodhouse bio_list_add(&rbio->bio_list, bio); 18574f024f37SKent Overstreet rbio->bio_list_bytes = bio->bi_iter.bi_size; 18581b94b556SMiao Xie rbio->operation = BTRFS_RBIO_WRITE; 18596ac0f488SChris Mason 18600b246afaSJeff Mahoney btrfs_bio_counter_inc_noblocked(fs_info); 18614245215dSMiao Xie rbio->generic_bio_cnt = 1; 18624245215dSMiao Xie 18636ac0f488SChris Mason /* 18646ac0f488SChris Mason * don't plug on full rbios, just get them out the door 18656ac0f488SChris Mason * as quickly as we can 18666ac0f488SChris Mason */ 18674245215dSMiao Xie if (rbio_is_full(rbio)) { 18684245215dSMiao Xie ret = full_stripe_write(rbio); 18694245215dSMiao Xie if (ret) 18700b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 18714245215dSMiao Xie return ret; 18724245215dSMiao Xie } 18736ac0f488SChris Mason 18740b246afaSJeff Mahoney cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 18756ac0f488SChris Mason if (cb) { 18766ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 18776ac0f488SChris Mason if (!plug->info) { 18780b246afaSJeff Mahoney plug->info = fs_info; 18796ac0f488SChris Mason INIT_LIST_HEAD(&plug->rbio_list); 18806ac0f488SChris Mason } 18816ac0f488SChris Mason list_add_tail(&rbio->plug_list, &plug->rbio_list); 18824245215dSMiao Xie ret = 0; 18836ac0f488SChris Mason } else { 18844245215dSMiao Xie ret = __raid56_parity_write(rbio); 18854245215dSMiao Xie if (ret) 18860b246afaSJeff Mahoney btrfs_bio_counter_dec(fs_info); 188753b381b3SDavid Woodhouse } 18884245215dSMiao Xie return ret; 18896ac0f488SChris Mason } 189053b381b3SDavid Woodhouse 189153b381b3SDavid Woodhouse /* 189253b381b3SDavid Woodhouse * all parity reconstruction happens here. We've read in everything 189353b381b3SDavid Woodhouse * we can find from the drives and this does the heavy lifting of 189453b381b3SDavid Woodhouse * sorting the good from the bad. 
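 *
 * With faila/failb naming the missing stripes, the raid6 cases below
 * reduce to: a single lost data stripe is xored back from P (raid5
 * style); data + Q lost also falls back to the P xor; data + P lost
 * goes through raid6_datap_recov(); and two lost data stripes go
 * through raid6_2data_recov().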
189553b381b3SDavid Woodhouse */ 189653b381b3SDavid Woodhouse static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 189753b381b3SDavid Woodhouse { 189807e4d380SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 189907e4d380SQu Wenruo int sectornr, stripe; 190053b381b3SDavid Woodhouse void **pointers; 190194a0b58dSIra Weiny void **unmap_array; 190253b381b3SDavid Woodhouse int faila = -1, failb = -1; 190358efbc9fSOmar Sandoval blk_status_t err; 190453b381b3SDavid Woodhouse int i; 190553b381b3SDavid Woodhouse 190607e4d380SQu Wenruo /* 190707e4d380SQu Wenruo * This array stores the pointer for each sector, thus it has the extra 190807e4d380SQu Wenruo * pgoff value added from each sector 190907e4d380SQu Wenruo */ 191031e818feSDavid Sterba pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 191153b381b3SDavid Woodhouse if (!pointers) { 191258efbc9fSOmar Sandoval err = BLK_STS_RESOURCE; 191353b381b3SDavid Woodhouse goto cleanup_io; 191453b381b3SDavid Woodhouse } 191553b381b3SDavid Woodhouse 191694a0b58dSIra Weiny /* 191794a0b58dSIra Weiny * Store copy of pointers that does not get reordered during 191894a0b58dSIra Weiny * reconstruction so that kunmap_local works. 191994a0b58dSIra Weiny */ 192094a0b58dSIra Weiny unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 192194a0b58dSIra Weiny if (!unmap_array) { 192294a0b58dSIra Weiny err = BLK_STS_RESOURCE; 192394a0b58dSIra Weiny goto cleanup_pointers; 192494a0b58dSIra Weiny } 192594a0b58dSIra Weiny 192653b381b3SDavid Woodhouse faila = rbio->faila; 192753b381b3SDavid Woodhouse failb = rbio->failb; 192853b381b3SDavid Woodhouse 1929b4ee1782SOmar Sandoval if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1930b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 193153b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 193253b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 193353b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 193453b381b3SDavid Woodhouse } 193553b381b3SDavid Woodhouse 193653b381b3SDavid Woodhouse index_rbio_pages(rbio); 193753b381b3SDavid Woodhouse 193807e4d380SQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 193907e4d380SQu Wenruo struct sector_ptr *sector; 194007e4d380SQu Wenruo 19415a6ac9eaSMiao Xie /* 19425a6ac9eaSMiao Xie * Now we just use bitmap to mark the horizontal stripes in 19435a6ac9eaSMiao Xie * which we have data when doing parity scrub. 
19445a6ac9eaSMiao Xie */ 19455a6ac9eaSMiao Xie if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 194607e4d380SQu Wenruo !test_bit(sectornr, rbio->dbitmap)) 19475a6ac9eaSMiao Xie continue; 19485a6ac9eaSMiao Xie 194994a0b58dSIra Weiny /* 195007e4d380SQu Wenruo * Setup our array of pointers with sectors from each stripe 195194a0b58dSIra Weiny * 195294a0b58dSIra Weiny * NOTE: store a duplicate array of pointers to preserve the 195394a0b58dSIra Weiny * pointer order 195453b381b3SDavid Woodhouse */ 19552c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 195653b381b3SDavid Woodhouse /* 195707e4d380SQu Wenruo * If we're rebuilding a read, we have to use 195853b381b3SDavid Woodhouse * pages from the bio list 195953b381b3SDavid Woodhouse */ 1960b4ee1782SOmar Sandoval if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1961b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 196253b381b3SDavid Woodhouse (stripe == faila || stripe == failb)) { 196307e4d380SQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 0); 196453b381b3SDavid Woodhouse } else { 196507e4d380SQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 196653b381b3SDavid Woodhouse } 196707e4d380SQu Wenruo ASSERT(sector->page); 196807e4d380SQu Wenruo pointers[stripe] = kmap_local_page(sector->page) + 196907e4d380SQu Wenruo sector->pgoff; 197094a0b58dSIra Weiny unmap_array[stripe] = pointers[stripe]; 197153b381b3SDavid Woodhouse } 197253b381b3SDavid Woodhouse 197307e4d380SQu Wenruo /* All raid6 handling here */ 19744c664611SQu Wenruo if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 197507e4d380SQu Wenruo /* Single failure, rebuild from parity raid5 style */ 197653b381b3SDavid Woodhouse if (failb < 0) { 197753b381b3SDavid Woodhouse if (faila == rbio->nr_data) { 197853b381b3SDavid Woodhouse /* 197953b381b3SDavid Woodhouse * Just the P stripe has failed, without 198053b381b3SDavid Woodhouse * a bad data or Q stripe. 198153b381b3SDavid Woodhouse * TODO, we should redo the xor here. 198253b381b3SDavid Woodhouse */ 198358efbc9fSOmar Sandoval err = BLK_STS_IOERR; 198453b381b3SDavid Woodhouse goto cleanup; 198553b381b3SDavid Woodhouse } 198653b381b3SDavid Woodhouse /* 198753b381b3SDavid Woodhouse * a single failure in raid6 is rebuilt 198853b381b3SDavid Woodhouse * in the pstripe code below 198953b381b3SDavid Woodhouse */ 199053b381b3SDavid Woodhouse goto pstripe; 199153b381b3SDavid Woodhouse } 199253b381b3SDavid Woodhouse 199353b381b3SDavid Woodhouse /* make sure our ps and qs are in order */ 1994b7d2083aSNikolay Borisov if (faila > failb) 1995b7d2083aSNikolay Borisov swap(faila, failb); 199653b381b3SDavid Woodhouse 199753b381b3SDavid Woodhouse /* if the q stripe is failed, do a pstripe reconstruction 199853b381b3SDavid Woodhouse * from the xors. 199953b381b3SDavid Woodhouse * If both the q stripe and the P stripe are failed, we're 200053b381b3SDavid Woodhouse * here due to a crc mismatch and we can't give them the 200153b381b3SDavid Woodhouse * data they want 200253b381b3SDavid Woodhouse */ 20034c664611SQu Wenruo if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { 20044c664611SQu Wenruo if (rbio->bioc->raid_map[faila] == 20058e5cfb55SZhao Lei RAID5_P_STRIPE) { 200658efbc9fSOmar Sandoval err = BLK_STS_IOERR; 200753b381b3SDavid Woodhouse goto cleanup; 200853b381b3SDavid Woodhouse } 200953b381b3SDavid Woodhouse /* 201053b381b3SDavid Woodhouse * otherwise we have one bad data stripe and 201153b381b3SDavid Woodhouse * a good P stripe. raid5! 
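 *
 * That case uses the raid5 identity P = D0 ^ D1 ^ ... ^ Dn-1: copy P
 * over the missing block, then xor in every surviving data block, one
 * sectorsize buffer at a time.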
201253b381b3SDavid Woodhouse */ 201353b381b3SDavid Woodhouse goto pstripe; 201453b381b3SDavid Woodhouse } 201553b381b3SDavid Woodhouse 20164c664611SQu Wenruo if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { 20172c8cdd6eSMiao Xie raid6_datap_recov(rbio->real_stripes, 201807e4d380SQu Wenruo sectorsize, faila, pointers); 201953b381b3SDavid Woodhouse } else { 20202c8cdd6eSMiao Xie raid6_2data_recov(rbio->real_stripes, 202107e4d380SQu Wenruo sectorsize, faila, failb, 202253b381b3SDavid Woodhouse pointers); 202353b381b3SDavid Woodhouse } 202453b381b3SDavid Woodhouse } else { 202553b381b3SDavid Woodhouse void *p; 202653b381b3SDavid Woodhouse 202753b381b3SDavid Woodhouse /* rebuild from P stripe here (raid5 or raid6) */ 202853b381b3SDavid Woodhouse BUG_ON(failb != -1); 202953b381b3SDavid Woodhouse pstripe: 203053b381b3SDavid Woodhouse /* Copy parity block into failed block to start with */ 203107e4d380SQu Wenruo memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); 203253b381b3SDavid Woodhouse 203353b381b3SDavid Woodhouse /* rearrange the pointer array */ 203453b381b3SDavid Woodhouse p = pointers[faila]; 203553b381b3SDavid Woodhouse for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 203653b381b3SDavid Woodhouse pointers[stripe] = pointers[stripe + 1]; 203753b381b3SDavid Woodhouse pointers[rbio->nr_data - 1] = p; 203853b381b3SDavid Woodhouse 203953b381b3SDavid Woodhouse /* xor in the rest */ 204007e4d380SQu Wenruo run_xor(pointers, rbio->nr_data - 1, sectorsize); 204153b381b3SDavid Woodhouse } 204253b381b3SDavid Woodhouse /* if we're doing this rebuild as part of an rmw, go through 204353b381b3SDavid Woodhouse * and set all of our private rbio pages in the 204453b381b3SDavid Woodhouse * failed stripes as uptodate. This way finish_rmw will 204553b381b3SDavid Woodhouse * know they can be trusted. If this was a read reconstruction, 204653b381b3SDavid Woodhouse * other endio functions will fiddle the uptodate bits 204753b381b3SDavid Woodhouse */ 20481b94b556SMiao Xie if (rbio->operation == BTRFS_RBIO_WRITE) { 204907e4d380SQu Wenruo for (i = 0; i < rbio->stripe_nsectors; i++) { 205053b381b3SDavid Woodhouse if (faila != -1) { 205107e4d380SQu Wenruo sector = rbio_stripe_sector(rbio, faila, i); 205207e4d380SQu Wenruo sector->uptodate = 1; 205353b381b3SDavid Woodhouse } 205453b381b3SDavid Woodhouse if (failb != -1) { 205507e4d380SQu Wenruo sector = rbio_stripe_sector(rbio, failb, i); 205607e4d380SQu Wenruo sector->uptodate = 1; 205753b381b3SDavid Woodhouse } 205853b381b3SDavid Woodhouse } 205953b381b3SDavid Woodhouse } 206094a0b58dSIra Weiny for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) 206194a0b58dSIra Weiny kunmap_local(unmap_array[stripe]); 206253b381b3SDavid Woodhouse } 206353b381b3SDavid Woodhouse 206458efbc9fSOmar Sandoval err = BLK_STS_OK; 206553b381b3SDavid Woodhouse cleanup: 206694a0b58dSIra Weiny kfree(unmap_array); 206794a0b58dSIra Weiny cleanup_pointers: 206853b381b3SDavid Woodhouse kfree(pointers); 206953b381b3SDavid Woodhouse 207053b381b3SDavid Woodhouse cleanup_io: 2071580c6efaSLiu Bo /* 2072580c6efaSLiu Bo * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a 2073580c6efaSLiu Bo * valid rbio which is consistent with ondisk content, thus such a 2074580c6efaSLiu Bo * valid rbio can be cached to avoid further disk reads. 
2075580c6efaSLiu Bo */ 2076580c6efaSLiu Bo if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2077580c6efaSLiu Bo rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 207844ac474dSLiu Bo /* 207944ac474dSLiu Bo * - In case of two failures, where rbio->failb != -1: 208044ac474dSLiu Bo * 208144ac474dSLiu Bo * Do not cache this rbio since the above read reconstruction 208244ac474dSLiu Bo * (raid6_datap_recov() or raid6_2data_recov()) may have 208344ac474dSLiu Bo * changed some content of stripes which are not identical to 208444ac474dSLiu Bo * on-disk content any more, otherwise, a later write/recover 208544ac474dSLiu Bo * may steal stripe_pages from this rbio and end up with 208644ac474dSLiu Bo * corruptions or rebuild failures. 208744ac474dSLiu Bo * 208844ac474dSLiu Bo * - In case of single failure, where rbio->failb == -1: 208944ac474dSLiu Bo * 209044ac474dSLiu Bo * Cache this rbio iff the above read reconstruction is 209152042d8eSAndrea Gelmini * executed without problems. 209244ac474dSLiu Bo */ 209344ac474dSLiu Bo if (err == BLK_STS_OK && rbio->failb < 0) 20944ae10b3aSChris Mason cache_rbio_pages(rbio); 20954ae10b3aSChris Mason else 20964ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 20974ae10b3aSChris Mason 20984246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, err); 209958efbc9fSOmar Sandoval } else if (err == BLK_STS_OK) { 210053b381b3SDavid Woodhouse rbio->faila = -1; 210153b381b3SDavid Woodhouse rbio->failb = -1; 21025a6ac9eaSMiao Xie 21035a6ac9eaSMiao Xie if (rbio->operation == BTRFS_RBIO_WRITE) 210453b381b3SDavid Woodhouse finish_rmw(rbio); 21055a6ac9eaSMiao Xie else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 21065a6ac9eaSMiao Xie finish_parity_scrub(rbio, 0); 21075a6ac9eaSMiao Xie else 21085a6ac9eaSMiao Xie BUG(); 210953b381b3SDavid Woodhouse } else { 21104246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, err); 211153b381b3SDavid Woodhouse } 211253b381b3SDavid Woodhouse } 211353b381b3SDavid Woodhouse 211453b381b3SDavid Woodhouse /* 211553b381b3SDavid Woodhouse * This is called only for stripes we've read from disk to 211653b381b3SDavid Woodhouse * reconstruct the parity. 211753b381b3SDavid Woodhouse */ 21184246a0b6SChristoph Hellwig static void raid_recover_end_io(struct bio *bio) 211953b381b3SDavid Woodhouse { 212053b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 212153b381b3SDavid Woodhouse 212253b381b3SDavid Woodhouse /* 212353b381b3SDavid Woodhouse * we only read stripe pages off the disk, set them 212453b381b3SDavid Woodhouse * up to date if there were no errors 212553b381b3SDavid Woodhouse */ 21264e4cbee9SChristoph Hellwig if (bio->bi_status) 212753b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 212853b381b3SDavid Woodhouse else 21295fdb7afcSQu Wenruo set_bio_pages_uptodate(rbio, bio); 213053b381b3SDavid Woodhouse bio_put(bio); 213153b381b3SDavid Woodhouse 2132b89e1b01SMiao Xie if (!atomic_dec_and_test(&rbio->stripes_pending)) 213353b381b3SDavid Woodhouse return; 213453b381b3SDavid Woodhouse 21354c664611SQu Wenruo if (atomic_read(&rbio->error) > rbio->bioc->max_errors) 213658efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 213753b381b3SDavid Woodhouse else 213853b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 213953b381b3SDavid Woodhouse } 214053b381b3SDavid Woodhouse 214153b381b3SDavid Woodhouse /* 214253b381b3SDavid Woodhouse * reads everything we need off the disk to reconstruct 214353b381b3SDavid Woodhouse * the parity. endio handlers trigger final reconstruction 214453b381b3SDavid Woodhouse * when the IO is done. 
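 *
 * Stripes already known bad (faila/failb) are skipped and counted as
 * errors up front; every other sector is read unless the stripe cache
 * has already marked it uptodate.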
214553b381b3SDavid Woodhouse * 214653b381b3SDavid Woodhouse * This is used both for reads from the higher layers and for 214753b381b3SDavid Woodhouse * parity construction required to finish a rmw cycle. 214853b381b3SDavid Woodhouse */ 214953b381b3SDavid Woodhouse static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 215053b381b3SDavid Woodhouse { 215153b381b3SDavid Woodhouse int bios_to_read = 0; 215253b381b3SDavid Woodhouse struct bio_list bio_list; 215353b381b3SDavid Woodhouse int ret; 21543e77605dSQu Wenruo int sectornr; 215553b381b3SDavid Woodhouse int stripe; 215653b381b3SDavid Woodhouse struct bio *bio; 215753b381b3SDavid Woodhouse 215853b381b3SDavid Woodhouse bio_list_init(&bio_list); 215953b381b3SDavid Woodhouse 216053b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 216153b381b3SDavid Woodhouse if (ret) 216253b381b3SDavid Woodhouse goto cleanup; 216353b381b3SDavid Woodhouse 2164b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 216553b381b3SDavid Woodhouse 216653b381b3SDavid Woodhouse /* 21674ae10b3aSChris Mason * read everything that hasn't failed. Thanks to the 21684ae10b3aSChris Mason * stripe cache, it is possible that some or all of these 21694ae10b3aSChris Mason * pages are going to be uptodate. 217053b381b3SDavid Woodhouse */ 21712c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 21725588383eSLiu Bo if (rbio->faila == stripe || rbio->failb == stripe) { 2173b89e1b01SMiao Xie atomic_inc(&rbio->error); 217453b381b3SDavid Woodhouse continue; 21755588383eSLiu Bo } 217653b381b3SDavid Woodhouse 21773e77605dSQu Wenruo for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 21783e77605dSQu Wenruo struct sector_ptr *sector; 217953b381b3SDavid Woodhouse 218053b381b3SDavid Woodhouse /* 218153b381b3SDavid Woodhouse * the rmw code may have already read this 218253b381b3SDavid Woodhouse * page in 218353b381b3SDavid Woodhouse */ 21843e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 21853e77605dSQu Wenruo if (sector->uptodate) 218653b381b3SDavid Woodhouse continue; 218753b381b3SDavid Woodhouse 21883e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 21893e77605dSQu Wenruo stripe, sectornr, rbio->stripe_len, 2190e01bf588SChristoph Hellwig REQ_OP_READ); 219153b381b3SDavid Woodhouse if (ret < 0) 219253b381b3SDavid Woodhouse goto cleanup; 219353b381b3SDavid Woodhouse } 219453b381b3SDavid Woodhouse } 219553b381b3SDavid Woodhouse 219653b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 219753b381b3SDavid Woodhouse if (!bios_to_read) { 219853b381b3SDavid Woodhouse /* 219953b381b3SDavid Woodhouse * we might have no bios to read just because the pages 220053b381b3SDavid Woodhouse * were up to date, or we might have no bios to read because 220153b381b3SDavid Woodhouse * the devices were gone. 220253b381b3SDavid Woodhouse */ 22034c664611SQu Wenruo if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { 220453b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 2205813f8a0eSNikolay Borisov return 0; 220653b381b3SDavid Woodhouse } else { 220753b381b3SDavid Woodhouse goto cleanup; 220853b381b3SDavid Woodhouse } 220953b381b3SDavid Woodhouse } 221053b381b3SDavid Woodhouse 221153b381b3SDavid Woodhouse /* 22124c664611SQu Wenruo * The bioc may be freed once we submit the last bio. Make sure not to 22134c664611SQu Wenruo * touch it after that. 
221453b381b3SDavid Woodhouse */ 2215b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 2216bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 221753b381b3SDavid Woodhouse bio->bi_end_io = raid_recover_end_io; 221853b381b3SDavid Woodhouse 22196a258d72SQu Wenruo btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 222053b381b3SDavid Woodhouse 22214e49ea4aSMike Christie submit_bio(bio); 222253b381b3SDavid Woodhouse } 2223813f8a0eSNikolay Borisov 222453b381b3SDavid Woodhouse return 0; 222553b381b3SDavid Woodhouse 222653b381b3SDavid Woodhouse cleanup: 2227b4ee1782SOmar Sandoval if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2228b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 222958efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 2230785884fcSLiu Bo 2231785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 2232785884fcSLiu Bo bio_put(bio); 2233785884fcSLiu Bo 223453b381b3SDavid Woodhouse return -EIO; 223553b381b3SDavid Woodhouse } 223653b381b3SDavid Woodhouse 223753b381b3SDavid Woodhouse /* 223853b381b3SDavid Woodhouse * the main entry point for reads from the higher layers. This 223953b381b3SDavid Woodhouse * is really only called when the normal read path had a failure, 224053b381b3SDavid Woodhouse * so we assume the bio they send down corresponds to a failed part 224153b381b3SDavid Woodhouse * of the drive. 224253b381b3SDavid Woodhouse */ 22436a258d72SQu Wenruo int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, 2244cc353a8bSQu Wenruo u32 stripe_len, int mirror_num, int generic_io) 224553b381b3SDavid Woodhouse { 22466a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 224753b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 224853b381b3SDavid Woodhouse int ret; 224953b381b3SDavid Woodhouse 2250abad60c6SLiu Bo if (generic_io) { 22514c664611SQu Wenruo ASSERT(bioc->mirror_num == mirror_num); 2252c3a3b19bSQu Wenruo btrfs_bio(bio)->mirror_num = mirror_num; 2253abad60c6SLiu Bo } 2254abad60c6SLiu Bo 22554c664611SQu Wenruo rbio = alloc_rbio(fs_info, bioc, stripe_len); 2256af8e2d1dSMiao Xie if (IS_ERR(rbio)) { 22576e9606d2SZhao Lei if (generic_io) 22584c664611SQu Wenruo btrfs_put_bioc(bioc); 225953b381b3SDavid Woodhouse return PTR_ERR(rbio); 2260af8e2d1dSMiao Xie } 226153b381b3SDavid Woodhouse 22621b94b556SMiao Xie rbio->operation = BTRFS_RBIO_READ_REBUILD; 226353b381b3SDavid Woodhouse bio_list_add(&rbio->bio_list, bio); 22644f024f37SKent Overstreet rbio->bio_list_bytes = bio->bi_iter.bi_size; 226553b381b3SDavid Woodhouse 226653b381b3SDavid Woodhouse rbio->faila = find_logical_bio_stripe(rbio, bio); 226753b381b3SDavid Woodhouse if (rbio->faila == -1) { 22680b246afaSJeff Mahoney btrfs_warn(fs_info, 22694c664611SQu Wenruo "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", 22701201b58bSDavid Sterba __func__, bio->bi_iter.bi_sector << 9, 22714c664611SQu Wenruo (u64)bio->bi_iter.bi_size, bioc->map_type); 22726e9606d2SZhao Lei if (generic_io) 22734c664611SQu Wenruo btrfs_put_bioc(bioc); 227453b381b3SDavid Woodhouse kfree(rbio); 227553b381b3SDavid Woodhouse return -EIO; 227653b381b3SDavid Woodhouse } 227753b381b3SDavid Woodhouse 22784245215dSMiao Xie if (generic_io) { 22790b246afaSJeff Mahoney btrfs_bio_counter_inc_noblocked(fs_info); 22804245215dSMiao Xie rbio->generic_bio_cnt = 1; 22814245215dSMiao Xie } else { 22824c664611SQu Wenruo btrfs_get_bioc(bioc); 22834245215dSMiao Xie } 22844245215dSMiao Xie 
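/*
 * Worked example of the retry mapping below, for a 4-device raid6
 * (real_stripes == 4, data at 0-1, P at 2, Q at 3): mirror_num == 3
 * gives failb = 4 - (3 - 1) = 2, failing P; mirror_num == 4 gives
 * failb = 1, failing a data stripe; and when failb lands on or before
 * faila it is shifted down one so two distinct stripes end up marked
 * as failed.
 */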
228553b381b3SDavid Woodhouse /* 22868810f751SLiu Bo * Loop retry: 22878810f751SLiu Bo * for 'mirror == 2', reconstruct from all other stripes. 22888810f751SLiu Bo * for 'mirror_num > 2', select a stripe to fail on every retry. 228953b381b3SDavid Woodhouse */ 22908810f751SLiu Bo if (mirror_num > 2) { 22918810f751SLiu Bo /* 22928810f751SLiu Bo * 'mirror == 3' is to fail the p stripe and 22938810f751SLiu Bo * reconstruct from the q stripe. 'mirror > 3' is to 22948810f751SLiu Bo * fail a data stripe and reconstruct from p+q stripe. 22958810f751SLiu Bo */ 22968810f751SLiu Bo rbio->failb = rbio->real_stripes - (mirror_num - 1); 22978810f751SLiu Bo ASSERT(rbio->failb > 0); 22988810f751SLiu Bo if (rbio->failb <= rbio->faila) 22998810f751SLiu Bo rbio->failb--; 23008810f751SLiu Bo } 230153b381b3SDavid Woodhouse 230253b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 230353b381b3SDavid Woodhouse 230453b381b3SDavid Woodhouse /* 230553b381b3SDavid Woodhouse * __raid56_parity_recover will end the bio with 230653b381b3SDavid Woodhouse * any errors it hits. We don't want to return 230753b381b3SDavid Woodhouse * its error value up the stack because our caller 230853b381b3SDavid Woodhouse * will end up calling bio_endio with any nonzero 230953b381b3SDavid Woodhouse * return 231053b381b3SDavid Woodhouse */ 231153b381b3SDavid Woodhouse if (ret == 0) 231253b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 231353b381b3SDavid Woodhouse /* 231453b381b3SDavid Woodhouse * our rbio has been added to the list of 231553b381b3SDavid Woodhouse * rbios that will be handled after the 231653b381b3SDavid Woodhouse * current lock owner is done 231753b381b3SDavid Woodhouse */ 231853b381b3SDavid Woodhouse return 0; 231953b381b3SDavid Woodhouse 232053b381b3SDavid Woodhouse } 232153b381b3SDavid Woodhouse 232253b381b3SDavid Woodhouse static void rmw_work(struct btrfs_work *work) 232353b381b3SDavid Woodhouse { 232453b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 232553b381b3SDavid Woodhouse 232653b381b3SDavid Woodhouse rbio = container_of(work, struct btrfs_raid_bio, work); 232753b381b3SDavid Woodhouse raid56_rmw_stripe(rbio); 232853b381b3SDavid Woodhouse } 232953b381b3SDavid Woodhouse 233053b381b3SDavid Woodhouse static void read_rebuild_work(struct btrfs_work *work) 233153b381b3SDavid Woodhouse { 233253b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 233353b381b3SDavid Woodhouse 233453b381b3SDavid Woodhouse rbio = container_of(work, struct btrfs_raid_bio, work); 233553b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 233653b381b3SDavid Woodhouse } 23375a6ac9eaSMiao Xie 23385a6ac9eaSMiao Xie /* 23395a6ac9eaSMiao Xie * The following code is used to scrub/replace the parity stripe 23405a6ac9eaSMiao Xie * 23414c664611SQu Wenruo * Caller must have already increased bio_counter for getting @bioc. 2342ae6529c3SQu Wenruo * 23435a6ac9eaSMiao Xie * Note: We need to make sure that all the pages added into the scrub/replace 23445a6ac9eaSMiao Xie * raid bio are correct and will not be changed during the scrub/replace. That 23455a6ac9eaSMiao Xie * is, those pages hold only metadata or file data with checksum.
23385a6ac9eaSMiao Xie /* 23395a6ac9eaSMiao Xie * The following code is used to scrub/replace the parity stripe. 23405a6ac9eaSMiao Xie * 23414c664611SQu Wenruo * Caller must have already increased bio_counter for getting @bioc. 2342ae6529c3SQu Wenruo * 23435a6ac9eaSMiao Xie * Note: We need to make sure all the pages that are added into the scrub/replace 23445a6ac9eaSMiao Xie * raid bio are correct and will not be changed during the scrub/replace. That 23455a6ac9eaSMiao Xie * is, those pages hold only metadata or file data with a checksum. 23465a6ac9eaSMiao Xie */ 23475a6ac9eaSMiao Xie 23486a258d72SQu Wenruo struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, 23496a258d72SQu Wenruo struct btrfs_io_context *bioc, 2350cc353a8bSQu Wenruo u32 stripe_len, struct btrfs_device *scrub_dev, 23515a6ac9eaSMiao Xie unsigned long *dbitmap, int stripe_nsectors) 23525a6ac9eaSMiao Xie { 23536a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 23545a6ac9eaSMiao Xie struct btrfs_raid_bio *rbio; 23555a6ac9eaSMiao Xie int i; 23565a6ac9eaSMiao Xie 23574c664611SQu Wenruo rbio = alloc_rbio(fs_info, bioc, stripe_len); 23585a6ac9eaSMiao Xie if (IS_ERR(rbio)) 23595a6ac9eaSMiao Xie return NULL; 23605a6ac9eaSMiao Xie bio_list_add(&rbio->bio_list, bio); 23615a6ac9eaSMiao Xie /* 23625a6ac9eaSMiao Xie * This is a special bio which is used to hold the completion handler 23635a6ac9eaSMiao Xie * and make the scrub rbio similar to the other types. 23645a6ac9eaSMiao Xie */ 23655a6ac9eaSMiao Xie ASSERT(!bio->bi_iter.bi_size); 23665a6ac9eaSMiao Xie rbio->operation = BTRFS_RBIO_PARITY_SCRUB; 23675a6ac9eaSMiao Xie 23689cd3a7ebSLiu Bo /* 23694c664611SQu Wenruo * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted 23709cd3a7ebSLiu Bo * to the end position, so this search can start from the first parity 23719cd3a7ebSLiu Bo * stripe. 23729cd3a7ebSLiu Bo */ 23739cd3a7ebSLiu Bo for (i = rbio->nr_data; i < rbio->real_stripes; i++) { 23744c664611SQu Wenruo if (bioc->stripes[i].dev == scrub_dev) { 23755a6ac9eaSMiao Xie rbio->scrubp = i; 23765a6ac9eaSMiao Xie break; 23775a6ac9eaSMiao Xie } 23785a6ac9eaSMiao Xie } 23799cd3a7ebSLiu Bo ASSERT(i < rbio->real_stripes); 23805a6ac9eaSMiao Xie 23815a6ac9eaSMiao Xie /* For now we only support the case where sectorsize equals page size */ 23820b246afaSJeff Mahoney ASSERT(fs_info->sectorsize == PAGE_SIZE); 23835a6ac9eaSMiao Xie ASSERT(rbio->stripe_npages == stripe_nsectors); 23845a6ac9eaSMiao Xie bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); 23855a6ac9eaSMiao Xie 2386ae6529c3SQu Wenruo /* 23874c664611SQu Wenruo * We have already increased bio_counter when getting bioc, record it 2388ae6529c3SQu Wenruo * so we can free it at rbio_orig_end_io(). 2389ae6529c3SQu Wenruo */ 2390ae6529c3SQu Wenruo rbio->generic_bio_cnt = 1; 2391ae6529c3SQu Wenruo 23925a6ac9eaSMiao Xie return rbio; 23935a6ac9eaSMiao Xie } 23945a6ac9eaSMiao Xie 2395b4ee1782SOmar Sandoval /* Used for both parity scrub and missing. */ 2396b4ee1782SOmar Sandoval void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, 23976346f6bfSQu Wenruo unsigned int pgoff, u64 logical) 23985a6ac9eaSMiao Xie { 23996346f6bfSQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 24005a6ac9eaSMiao Xie int stripe_offset; 24015a6ac9eaSMiao Xie int index; 24025a6ac9eaSMiao Xie 24034c664611SQu Wenruo ASSERT(logical >= rbio->bioc->raid_map[0]); 24046346f6bfSQu Wenruo ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + 24055a6ac9eaSMiao Xie rbio->stripe_len * rbio->nr_data); 24064c664611SQu Wenruo stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); 24076346f6bfSQu Wenruo index = stripe_offset / sectorsize; 24086346f6bfSQu Wenruo rbio->bio_sectors[index].page = page; 24096346f6bfSQu Wenruo rbio->bio_sectors[index].pgoff = pgoff; 24105a6ac9eaSMiao Xie } 24115a6ac9eaSMiao Xie
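/*
 * Sketch of the index math in raid56_add_scrub_pages() above
 * (hypothetical standalone form): logical addresses are linear across
 * the data stripes starting at raid_map[0], so the byte offset divided
 * by sectorsize picks the slot in bio_sectors[].
 */
static int __maybe_unused demo_scrub_sector_index(u64 logical, u64 raid_map0,
						  u32 sectorsize)
{
	/* e.g. logical = raid_map0 + 128K, sectorsize = 4K -> index 32 */
	return (int)(logical - raid_map0) / sectorsize;
}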
24125a6ac9eaSMiao Xie /* 24135a6ac9eaSMiao Xie * We only scrub the parity for which we have correct data on the same 24145a6ac9eaSMiao Xie * horizontal stripe, so we don't need to allocate pages for all the stripes. 24155a6ac9eaSMiao Xie */ 24165a6ac9eaSMiao Xie static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 24175a6ac9eaSMiao Xie { 2418*3907ce29SQu Wenruo const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 2419*3907ce29SQu Wenruo int stripe; 2420*3907ce29SQu Wenruo int sectornr; 24215a6ac9eaSMiao Xie 2422*3907ce29SQu Wenruo for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { 2423*3907ce29SQu Wenruo for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2424*3907ce29SQu Wenruo struct page *page; 2425*3907ce29SQu Wenruo int index = (stripe * rbio->stripe_nsectors + sectornr) * 2426*3907ce29SQu Wenruo sectorsize >> PAGE_SHIFT; 2427*3907ce29SQu Wenruo 24285a6ac9eaSMiao Xie if (rbio->stripe_pages[index]) 24295a6ac9eaSMiao Xie continue; 24305a6ac9eaSMiao Xie 2431b0ee5e1eSDavid Sterba page = alloc_page(GFP_NOFS); 24325a6ac9eaSMiao Xie if (!page) 24335a6ac9eaSMiao Xie return -ENOMEM; 24345a6ac9eaSMiao Xie rbio->stripe_pages[index] = page; 24355a6ac9eaSMiao Xie } 24365a6ac9eaSMiao Xie } 2437eb357060SQu Wenruo index_stripe_sectors(rbio); 24385a6ac9eaSMiao Xie return 0; 24395a6ac9eaSMiao Xie } 24405a6ac9eaSMiao Xie
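/*
 * Sketch of the stripe_pages[] index used by alloc_rbio_essential_pages()
 * above (hypothetical standalone form): sectors are numbered per stripe,
 * then the sector number is scaled down to page granularity.
 */
static int __maybe_unused demo_stripe_page_index(int stripe,
						 int stripe_nsectors,
						 int sectornr, u32 sectorsize)
{
	/* With 4K sectors on 4K pages this is stripe * nsectors + sectornr. */
	return (stripe * stripe_nsectors + sectornr) * sectorsize >> PAGE_SHIFT;
}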
24415a6ac9eaSMiao Xie static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 24425a6ac9eaSMiao Xie int need_check) 24435a6ac9eaSMiao Xie { 24444c664611SQu Wenruo struct btrfs_io_context *bioc = rbio->bioc; 244546900662SQu Wenruo const u32 sectorsize = bioc->fs_info->sectorsize; 24461389053eSKees Cook void **pointers = rbio->finish_pointers; 24471389053eSKees Cook unsigned long *pbitmap = rbio->finish_pbitmap; 24485a6ac9eaSMiao Xie int nr_data = rbio->nr_data; 24495a6ac9eaSMiao Xie int stripe; 24503e77605dSQu Wenruo int sectornr; 2451c17af965SDavid Sterba bool has_qstripe; 245246900662SQu Wenruo struct sector_ptr p_sector = { 0 }; 245346900662SQu Wenruo struct sector_ptr q_sector = { 0 }; 24545a6ac9eaSMiao Xie struct bio_list bio_list; 24555a6ac9eaSMiao Xie struct bio *bio; 245676035976SMiao Xie int is_replace = 0; 24575a6ac9eaSMiao Xie int ret; 24585a6ac9eaSMiao Xie 24595a6ac9eaSMiao Xie bio_list_init(&bio_list); 24605a6ac9eaSMiao Xie 2461c17af965SDavid Sterba if (rbio->real_stripes - rbio->nr_data == 1) 2462c17af965SDavid Sterba has_qstripe = false; 2463c17af965SDavid Sterba else if (rbio->real_stripes - rbio->nr_data == 2) 2464c17af965SDavid Sterba has_qstripe = true; 2465c17af965SDavid Sterba else 24665a6ac9eaSMiao Xie BUG(); 24675a6ac9eaSMiao Xie 24684c664611SQu Wenruo if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { 246976035976SMiao Xie is_replace = 1; 24703e77605dSQu Wenruo bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors); 247176035976SMiao Xie } 247276035976SMiao Xie 24735a6ac9eaSMiao Xie /* 24745a6ac9eaSMiao Xie * The higher layers (the scrubber) are unlikely to use this 24755a6ac9eaSMiao Xie * area of the disk again soon, so don't 24765a6ac9eaSMiao Xie * cache it. 24775a6ac9eaSMiao Xie */ 24785a6ac9eaSMiao Xie clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 24795a6ac9eaSMiao Xie 24805a6ac9eaSMiao Xie if (!need_check) 24815a6ac9eaSMiao Xie goto writeback; 24825a6ac9eaSMiao Xie 248346900662SQu Wenruo p_sector.page = alloc_page(GFP_NOFS); 248446900662SQu Wenruo if (!p_sector.page) 24855a6ac9eaSMiao Xie goto cleanup; 248646900662SQu Wenruo p_sector.pgoff = 0; 248746900662SQu Wenruo p_sector.uptodate = 1; 24885a6ac9eaSMiao Xie 2489c17af965SDavid Sterba if (has_qstripe) { 2490d70cef0dSIra Weiny /* RAID6, allocate and map temp space for the Q stripe */ 249146900662SQu Wenruo q_sector.page = alloc_page(GFP_NOFS); 249246900662SQu Wenruo if (!q_sector.page) { 249346900662SQu Wenruo __free_page(p_sector.page); 249446900662SQu Wenruo p_sector.page = NULL; 24955a6ac9eaSMiao Xie goto cleanup; 24965a6ac9eaSMiao Xie } 249746900662SQu Wenruo q_sector.pgoff = 0; 249846900662SQu Wenruo q_sector.uptodate = 1; 249946900662SQu Wenruo pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); 25005a6ac9eaSMiao Xie } 25015a6ac9eaSMiao Xie 25025a6ac9eaSMiao Xie atomic_set(&rbio->error, 0); 25035a6ac9eaSMiao Xie 2504d70cef0dSIra Weiny /* Map the parity stripe just once */ 250546900662SQu Wenruo pointers[nr_data] = kmap_local_page(p_sector.page); 2506d70cef0dSIra Weiny 25073e77605dSQu Wenruo for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { 250846900662SQu Wenruo struct sector_ptr *sector; 25095a6ac9eaSMiao Xie void *parity; 251046900662SQu Wenruo 25115a6ac9eaSMiao Xie /* First collect one sector from each data stripe */ 25125a6ac9eaSMiao Xie for (stripe = 0; stripe < nr_data; stripe++) { 251346900662SQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 0); 251446900662SQu Wenruo pointers[stripe] = kmap_local_page(sector->page) + 251546900662SQu Wenruo sector->pgoff; 25165a6ac9eaSMiao Xie } 25175a6ac9eaSMiao Xie 2518c17af965SDavid Sterba if (has_qstripe) { 2519d70cef0dSIra Weiny /* RAID6, call the library function to fill in our P/Q */ 252046900662SQu Wenruo raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 25215a6ac9eaSMiao Xie pointers); 25225a6ac9eaSMiao Xie } else { 25235a6ac9eaSMiao Xie /* raid5 */ 252446900662SQu Wenruo memcpy(pointers[nr_data], pointers[0], sectorsize); 252546900662SQu Wenruo run_xor(pointers + 1, nr_data - 1, sectorsize); 25265a6ac9eaSMiao Xie } 25275a6ac9eaSMiao Xie 252801327610SNicholas D Steeves /* Check scrubbing parity and repair it */ 252946900662SQu Wenruo sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 253046900662SQu Wenruo parity = kmap_local_page(sector->page) + sector->pgoff; 253146900662SQu Wenruo if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) 253246900662SQu Wenruo memcpy(parity, pointers[rbio->scrubp], sectorsize); 25335a6ac9eaSMiao Xie else 25345a6ac9eaSMiao Xie /* Parity is right, no need to write it back */ 25353e77605dSQu Wenruo bitmap_clear(rbio->dbitmap, sectornr, 1); 253658c1a35cSIra Weiny kunmap_local(parity); 25375a6ac9eaSMiao Xie 253894a0b58dSIra Weiny for (stripe = nr_data - 1; stripe >= 0; stripe--) 253994a0b58dSIra Weiny kunmap_local(pointers[stripe]); 25405a6ac9eaSMiao Xie } 25415a6ac9eaSMiao Xie 254294a0b58dSIra Weiny kunmap_local(pointers[nr_data]); 254346900662SQu Wenruo __free_page(p_sector.page); 254446900662SQu Wenruo p_sector.page = NULL; 254546900662SQu Wenruo if (q_sector.page) { 254694a0b58dSIra Weiny kunmap_local(pointers[rbio->real_stripes - 1]); 254746900662SQu Wenruo __free_page(q_sector.page); 254846900662SQu Wenruo q_sector.page = NULL; 2549d70cef0dSIra Weiny }
25505a6ac9eaSMiao Xie 25515a6ac9eaSMiao Xie writeback: 25525a6ac9eaSMiao Xie /* 25535a6ac9eaSMiao Xie * Time to start writing. Make bios for everything from the 25545a6ac9eaSMiao Xie * higher layers (the bio_list in our rbio) and our P/Q. Ignore 25555a6ac9eaSMiao Xie * everything else. 25565a6ac9eaSMiao Xie */ 25573e77605dSQu Wenruo for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { 25583e77605dSQu Wenruo struct sector_ptr *sector; 25595a6ac9eaSMiao Xie 25603e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 25613e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, 25623e77605dSQu Wenruo sectornr, rbio->stripe_len, REQ_OP_WRITE); 25635a6ac9eaSMiao Xie if (ret) 25645a6ac9eaSMiao Xie goto cleanup; 25655a6ac9eaSMiao Xie } 25665a6ac9eaSMiao Xie 256776035976SMiao Xie if (!is_replace) 256876035976SMiao Xie goto submit_write; 256976035976SMiao Xie 25703e77605dSQu Wenruo for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { 25713e77605dSQu Wenruo struct sector_ptr *sector; 257276035976SMiao Xie 25733e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 25743e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 25754c664611SQu Wenruo bioc->tgtdev_map[rbio->scrubp], 25763e77605dSQu Wenruo sectornr, rbio->stripe_len, REQ_OP_WRITE); 257776035976SMiao Xie if (ret) 257876035976SMiao Xie goto cleanup; 257976035976SMiao Xie } 258076035976SMiao Xie 258176035976SMiao Xie submit_write: 25825a6ac9eaSMiao Xie nr_data = bio_list_size(&bio_list); 25835a6ac9eaSMiao Xie if (!nr_data) { 25845a6ac9eaSMiao Xie /* Every parity is right */ 258558efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_OK); 25865a6ac9eaSMiao Xie return; 25875a6ac9eaSMiao Xie } 25885a6ac9eaSMiao Xie 25895a6ac9eaSMiao Xie atomic_set(&rbio->stripes_pending, nr_data); 25905a6ac9eaSMiao Xie 2591bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 2592a6111d11SZhao Lei bio->bi_end_io = raid_write_end_io; 25934e49ea4aSMike Christie 25944e49ea4aSMike Christie submit_bio(bio); 25955a6ac9eaSMiao Xie } 25965a6ac9eaSMiao Xie return; 25975a6ac9eaSMiao Xie 25985a6ac9eaSMiao Xie cleanup: 259958efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 2600785884fcSLiu Bo 2601785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 2602785884fcSLiu Bo bio_put(bio); 26035a6ac9eaSMiao Xie } 26045a6ac9eaSMiao Xie 26055a6ac9eaSMiao Xie static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) 26065a6ac9eaSMiao Xie { 26075a6ac9eaSMiao Xie if (stripe >= 0 && stripe < rbio->nr_data) 26085a6ac9eaSMiao Xie return 1; 26095a6ac9eaSMiao Xie return 0; 26105a6ac9eaSMiao Xie } 26115a6ac9eaSMiao Xie
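/*
 * A compact sketch (hypothetical helper, userspace-style buffer, not
 * part of this file) of the RAID5 branch of the check above: recompute
 * parity as the XOR of all data sectors, compare with what is on disk,
 * and rewrite it only when it differs.
 */
static int __maybe_unused demo_scrub_raid5_sector(u8 **data, int nr_data,
						  u8 *parity, u32 sectorsize)
{
	u8 expected[4096];	/* assumes sectorsize <= 4K for the demo */
	u32 i;
	int d;

	memcpy(expected, data[0], sectorsize);
	for (d = 1; d < nr_data; d++)
		for (i = 0; i < sectorsize; i++)
			expected[i] ^= data[d][i];

	if (memcmp(parity, expected, sectorsize) != 0) {
		memcpy(parity, expected, sectorsize);
		return 1;	/* parity was wrong and has been repaired */
	}
	return 0;		/* parity already matched, nothing to write */
}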
26125a6ac9eaSMiao Xie /* 26135a6ac9eaSMiao Xie * While we're doing the parity check and repair, we could have errors 26145a6ac9eaSMiao Xie * in reading pages off the disk. This checks for errors, and if we're 26155a6ac9eaSMiao Xie * not able to read a page it'll trigger parity reconstruction. The 26165a6ac9eaSMiao Xie * parity scrub will be finished after we've reconstructed the failed 26175a6ac9eaSMiao Xie * stripes. 26185a6ac9eaSMiao Xie */ 26195a6ac9eaSMiao Xie static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) 26205a6ac9eaSMiao Xie { 26214c664611SQu Wenruo if (atomic_read(&rbio->error) > rbio->bioc->max_errors) 26225a6ac9eaSMiao Xie goto cleanup; 26235a6ac9eaSMiao Xie 26245a6ac9eaSMiao Xie if (rbio->faila >= 0 || rbio->failb >= 0) { 26255a6ac9eaSMiao Xie int dfail = 0, failp = -1; 26265a6ac9eaSMiao Xie 26275a6ac9eaSMiao Xie if (is_data_stripe(rbio, rbio->faila)) 26285a6ac9eaSMiao Xie dfail++; 26295a6ac9eaSMiao Xie else if (is_parity_stripe(rbio->faila)) 26305a6ac9eaSMiao Xie failp = rbio->faila; 26315a6ac9eaSMiao Xie 26325a6ac9eaSMiao Xie if (is_data_stripe(rbio, rbio->failb)) 26335a6ac9eaSMiao Xie dfail++; 26345a6ac9eaSMiao Xie else if (is_parity_stripe(rbio->failb)) 26355a6ac9eaSMiao Xie failp = rbio->failb; 26365a6ac9eaSMiao Xie 26375a6ac9eaSMiao Xie /* 26385a6ac9eaSMiao Xie * Because we cannot use the parity that is being scrubbed to 26395a6ac9eaSMiao Xie * repair the data, our repair capability is reduced. 26405a6ac9eaSMiao Xie * (In the case of RAID5, we cannot repair anything.) 26415a6ac9eaSMiao Xie */ 26424c664611SQu Wenruo if (dfail > rbio->bioc->max_errors - 1) 26435a6ac9eaSMiao Xie goto cleanup; 26445a6ac9eaSMiao Xie 26455a6ac9eaSMiao Xie /* 26465a6ac9eaSMiao Xie * If all the data is good, then only the parity can be wrong, 26475a6ac9eaSMiao Xie * so just repair the parity. 26485a6ac9eaSMiao Xie */ 26495a6ac9eaSMiao Xie if (dfail == 0) { 26505a6ac9eaSMiao Xie finish_parity_scrub(rbio, 0); 26515a6ac9eaSMiao Xie return; 26525a6ac9eaSMiao Xie } 26535a6ac9eaSMiao Xie 26545a6ac9eaSMiao Xie /* 26555a6ac9eaSMiao Xie * Here we have one corrupted data stripe and one corrupted 26565a6ac9eaSMiao Xie * parity stripe on RAID6. If the corrupted parity is the one 265701327610SNicholas D Steeves * being scrubbed, we can luckily use the other parity to repair 26585a6ac9eaSMiao Xie * the data; otherwise we cannot repair the data stripe. 26595a6ac9eaSMiao Xie */ 26605a6ac9eaSMiao Xie if (failp != rbio->scrubp) 26615a6ac9eaSMiao Xie goto cleanup; 26625a6ac9eaSMiao Xie 26635a6ac9eaSMiao Xie __raid_recover_end_io(rbio); 26645a6ac9eaSMiao Xie } else { 26655a6ac9eaSMiao Xie finish_parity_scrub(rbio, 1); 26665a6ac9eaSMiao Xie } 26675a6ac9eaSMiao Xie return; 26685a6ac9eaSMiao Xie 26695a6ac9eaSMiao Xie cleanup: 267058efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 26715a6ac9eaSMiao Xie } 26725a6ac9eaSMiao Xie
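/*
 * Sketch mirroring the failure accounting above (hypothetical helper):
 * dfail counts failed data stripes, failp records a failed parity
 * stripe.  Scrub can only continue when the data loss is within
 * max_errors - 1 and any broken parity is the one being scrubbed anyway.
 */
static int __maybe_unused demo_scrub_can_continue(int dfail, int failp,
						  int scrubp, int max_errors)
{
	if (dfail > max_errors - 1)
		return 0;		/* too much data lost to rebuild */
	if (dfail == 0)
		return 1;		/* data is fine, just rewrite parity */
	return failp == scrubp;		/* bad parity must be the scrubbed one */
}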
26735a6ac9eaSMiao Xie /* 26745a6ac9eaSMiao Xie * End io for the read phase of the scrub cycle. All the bios here are physical 26755a6ac9eaSMiao Xie * stripe bios we've read from the disk so we can recalculate the parity of the 26765a6ac9eaSMiao Xie * stripe. 26775a6ac9eaSMiao Xie * 26785a6ac9eaSMiao Xie * This will usually kick off finish_parity_scrub once all the bios are read in, 26795a6ac9eaSMiao Xie * but it may trigger parity reconstruction if we had any errors along the way. 26805a6ac9eaSMiao Xie */ 26814246a0b6SChristoph Hellwig static void raid56_parity_scrub_end_io(struct bio *bio) 26825a6ac9eaSMiao Xie { 26835a6ac9eaSMiao Xie struct btrfs_raid_bio *rbio = bio->bi_private; 26845a6ac9eaSMiao Xie 26854e4cbee9SChristoph Hellwig if (bio->bi_status) 26865a6ac9eaSMiao Xie fail_bio_stripe(rbio, bio); 26875a6ac9eaSMiao Xie else 26885fdb7afcSQu Wenruo set_bio_pages_uptodate(rbio, bio); 26895a6ac9eaSMiao Xie 26905a6ac9eaSMiao Xie bio_put(bio); 26915a6ac9eaSMiao Xie 26925a6ac9eaSMiao Xie if (!atomic_dec_and_test(&rbio->stripes_pending)) 26935a6ac9eaSMiao Xie return; 26945a6ac9eaSMiao Xie 26955a6ac9eaSMiao Xie /* 26965a6ac9eaSMiao Xie * This will normally call finish_parity_scrub to start our write, 26975a6ac9eaSMiao Xie * but if there are any failed stripes we'll reconstruct 26985a6ac9eaSMiao Xie * from parity first. 26995a6ac9eaSMiao Xie */ 27005a6ac9eaSMiao Xie validate_rbio_for_parity_scrub(rbio); 27015a6ac9eaSMiao Xie } 27025a6ac9eaSMiao Xie 27035a6ac9eaSMiao Xie static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) 27045a6ac9eaSMiao Xie { 27055a6ac9eaSMiao Xie int bios_to_read = 0; 27065a6ac9eaSMiao Xie struct bio_list bio_list; 27075a6ac9eaSMiao Xie int ret; 27083e77605dSQu Wenruo int sectornr; 27095a6ac9eaSMiao Xie int stripe; 27105a6ac9eaSMiao Xie struct bio *bio; 27115a6ac9eaSMiao Xie 2712785884fcSLiu Bo bio_list_init(&bio_list); 2713785884fcSLiu Bo 27145a6ac9eaSMiao Xie ret = alloc_rbio_essential_pages(rbio); 27155a6ac9eaSMiao Xie if (ret) 27165a6ac9eaSMiao Xie goto cleanup; 27175a6ac9eaSMiao Xie 27185a6ac9eaSMiao Xie atomic_set(&rbio->error, 0); 27195a6ac9eaSMiao Xie /* 27205a6ac9eaSMiao Xie * Build a list of bios to read all the missing parts of this 27215a6ac9eaSMiao Xie * stripe. 27225a6ac9eaSMiao Xie */ 27232c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 27243e77605dSQu Wenruo for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { 27253e77605dSQu Wenruo struct sector_ptr *sector; 27265a6ac9eaSMiao Xie /* 27273e77605dSQu Wenruo * We want to find all the sectors missing from the 27283e77605dSQu Wenruo * rbio and read them from the disk. If 27293e77605dSQu Wenruo * sector_in_rbio() finds a sector in the bio list we 27303e77605dSQu Wenruo * don't need to read it off the stripe. 27315a6ac9eaSMiao Xie */ 27323e77605dSQu Wenruo sector = sector_in_rbio(rbio, stripe, sectornr, 1); 27333e77605dSQu Wenruo if (sector) 27345a6ac9eaSMiao Xie continue; 27355a6ac9eaSMiao Xie 27363e77605dSQu Wenruo sector = rbio_stripe_sector(rbio, stripe, sectornr); 27375a6ac9eaSMiao Xie /* 27383e77605dSQu Wenruo * The bio cache may have handed us an uptodate sector. 27393e77605dSQu Wenruo * If so, be happy and use it.
27405a6ac9eaSMiao Xie */ 27413e77605dSQu Wenruo if (sector->uptodate) 27425a6ac9eaSMiao Xie continue; 27435a6ac9eaSMiao Xie 27443e77605dSQu Wenruo ret = rbio_add_io_sector(rbio, &bio_list, sector, 27453e77605dSQu Wenruo stripe, sectornr, rbio->stripe_len, 27463e77605dSQu Wenruo REQ_OP_READ); 27475a6ac9eaSMiao Xie if (ret) 27485a6ac9eaSMiao Xie goto cleanup; 27495a6ac9eaSMiao Xie } 27505a6ac9eaSMiao Xie } 27515a6ac9eaSMiao Xie 27525a6ac9eaSMiao Xie bios_to_read = bio_list_size(&bio_list); 27535a6ac9eaSMiao Xie if (!bios_to_read) { 27545a6ac9eaSMiao Xie /* 27555a6ac9eaSMiao Xie * This can happen if others have merged with us; it 27565a6ac9eaSMiao Xie * means there is nothing left to read. But if there 27575a6ac9eaSMiao Xie * are missing devices it may not be 27585a6ac9eaSMiao Xie * safe to do the full stripe write yet. 27595a6ac9eaSMiao Xie */ 27605a6ac9eaSMiao Xie goto finish; 27615a6ac9eaSMiao Xie } 27625a6ac9eaSMiao Xie 27635a6ac9eaSMiao Xie /* 27644c664611SQu Wenruo * The bioc may be freed once we submit the last bio. Make sure not to 27654c664611SQu Wenruo * touch it after that. 27665a6ac9eaSMiao Xie */ 27675a6ac9eaSMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 2768bf28a605SNikolay Borisov while ((bio = bio_list_pop(&bio_list))) { 27695a6ac9eaSMiao Xie bio->bi_end_io = raid56_parity_scrub_end_io; 27705a6ac9eaSMiao Xie 27716a258d72SQu Wenruo btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 27725a6ac9eaSMiao Xie 27734e49ea4aSMike Christie submit_bio(bio); 27745a6ac9eaSMiao Xie } 27755a6ac9eaSMiao Xie /* The actual write will happen once the reads are done. */ 27765a6ac9eaSMiao Xie return; 27775a6ac9eaSMiao Xie 27785a6ac9eaSMiao Xie cleanup: 277958efbc9fSOmar Sandoval rbio_orig_end_io(rbio, BLK_STS_IOERR); 2780785884fcSLiu Bo 2781785884fcSLiu Bo while ((bio = bio_list_pop(&bio_list))) 2782785884fcSLiu Bo bio_put(bio); 2783785884fcSLiu Bo 27845a6ac9eaSMiao Xie return; 27855a6ac9eaSMiao Xie 27865a6ac9eaSMiao Xie finish: 27875a6ac9eaSMiao Xie validate_rbio_for_parity_scrub(rbio); 27885a6ac9eaSMiao Xie } 27895a6ac9eaSMiao Xie 27905a6ac9eaSMiao Xie static void scrub_parity_work(struct btrfs_work *work) 27915a6ac9eaSMiao Xie { 27925a6ac9eaSMiao Xie struct btrfs_raid_bio *rbio; 27935a6ac9eaSMiao Xie 27945a6ac9eaSMiao Xie rbio = container_of(work, struct btrfs_raid_bio, work); 27955a6ac9eaSMiao Xie raid56_parity_scrub_stripe(rbio); 27965a6ac9eaSMiao Xie } 27975a6ac9eaSMiao Xie 27985a6ac9eaSMiao Xie void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 27995a6ac9eaSMiao Xie { 28005a6ac9eaSMiao Xie if (!lock_stripe_add(rbio)) 2801a81b747dSDavid Sterba start_async_work(rbio, scrub_parity_work); 28025a6ac9eaSMiao Xie } 2803b4ee1782SOmar Sandoval
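/*
 * Illustrative call sequence for the scrub entry points above (a
 * sketch, not a real caller; allocation failure handling elided):
 *
 *	rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, stripe_len,
 *					      scrub_dev, dbitmap, nsectors);
 *	for each sector covered by dbitmap:
 *		raid56_add_scrub_pages(rbio, page, pgoff, logical);
 *	raid56_parity_submit_scrub_rbio(rbio);
 *
 * Completion is reported through the bio's end_io, just like the other
 * rbio types.
 */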
2804b4ee1782SOmar Sandoval /* The following code is used for dev replace of a missing RAID 5/6 device. */ 2805b4ee1782SOmar Sandoval 2806b4ee1782SOmar Sandoval struct btrfs_raid_bio * 28076a258d72SQu Wenruo raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, 28086a258d72SQu Wenruo u64 length) 2809b4ee1782SOmar Sandoval { 28106a258d72SQu Wenruo struct btrfs_fs_info *fs_info = bioc->fs_info; 2811b4ee1782SOmar Sandoval struct btrfs_raid_bio *rbio; 2812b4ee1782SOmar Sandoval 28134c664611SQu Wenruo rbio = alloc_rbio(fs_info, bioc, length); 2814b4ee1782SOmar Sandoval if (IS_ERR(rbio)) 2815b4ee1782SOmar Sandoval return NULL; 2816b4ee1782SOmar Sandoval 2817b4ee1782SOmar Sandoval rbio->operation = BTRFS_RBIO_REBUILD_MISSING; 2818b4ee1782SOmar Sandoval bio_list_add(&rbio->bio_list, bio); 2819b4ee1782SOmar Sandoval /* 2820b4ee1782SOmar Sandoval * This is a special bio which is used to hold the completion handler 2821b4ee1782SOmar Sandoval * and make the rbio similar to the other types. 2822b4ee1782SOmar Sandoval */ 2823b4ee1782SOmar Sandoval ASSERT(!bio->bi_iter.bi_size); 2824b4ee1782SOmar Sandoval 2825b4ee1782SOmar Sandoval rbio->faila = find_logical_bio_stripe(rbio, bio); 2826b4ee1782SOmar Sandoval if (rbio->faila == -1) { 2827b4ee1782SOmar Sandoval BUG(); 2828b4ee1782SOmar Sandoval kfree(rbio); 2829b4ee1782SOmar Sandoval return NULL; 2830b4ee1782SOmar Sandoval } 2831b4ee1782SOmar Sandoval 2832ae6529c3SQu Wenruo /* 28334c664611SQu Wenruo * When we get bioc, we have already increased bio_counter; record it 2834ae6529c3SQu Wenruo * so we can free it at rbio_orig_end_io(). 2835ae6529c3SQu Wenruo */ 2836ae6529c3SQu Wenruo rbio->generic_bio_cnt = 1; 2837ae6529c3SQu Wenruo 2838b4ee1782SOmar Sandoval return rbio; 2839b4ee1782SOmar Sandoval } 2840b4ee1782SOmar Sandoval 2841b4ee1782SOmar Sandoval void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) 2842b4ee1782SOmar Sandoval { 2843b4ee1782SOmar Sandoval if (!lock_stripe_add(rbio)) 2844e66d8d5aSDavid Sterba start_async_work(rbio, read_rebuild_work); 2845b4ee1782SOmar Sandoval } 2846
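/*
 * Illustrative caller contract for the missing-device path above (a
 * sketch, not a real caller; names outside this file are assumptions):
 * the bioc reference and the bio_counter bump are taken by the caller
 * and handed over to the rbio, which releases them from
 * rbio_orig_end_io():
 *
 *	btrfs_bio_counter_inc_blocked(fs_info);
 *	bioc = ...;				(mapping lookup elided)
 *	rbio = raid56_alloc_missing_rbio(bio, bioc, length);
 *	if (!rbio)
 *		...;				(drop counter and bioc)
 *	raid56_submit_missing_rbio(rbio);
 */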