/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/vmalloc.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged.  The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	int real_stripes;

	int stripe_npages;
	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	int scrubp;
	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	atomic_t refs;

	atomic_t stripes_pending;

	atomic_t error;
	/*
	 * these are two arrays of pointers.  We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;

	/*
	 * bitmap to record which horizontal stripe has data
	 */
	unsigned long *dbitmap;
};
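
/*
 * Note: stripe_pages, bio_pages and dbitmap above are not separate
 * allocations.  alloc_rbio() sizes a single allocation to hold the
 * struct plus both page pointer arrays plus the bitmap, and points
 * these fields at the memory that follows the struct.
 */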

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void async_scrub_parity(struct btrfs_raid_bio *rbio);

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;
	int table_size;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table_size = sizeof(*table) + sizeof(*h) * num_entries;
	table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!table) {
		table = vzalloc(table_size);
		if (!table)
			return -ENOMEM;
	}

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
		init_waitqueue_head(&cur->wait);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	if (x)
		kvfree(x);
	return 0;
}
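
/*
 * If another thread won the cmpxchg above, our freshly built table is
 * released again; kvfree() handles both the kzalloc and vzalloc cases.
 */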

/*
 * caching an rbio means to copy anything from the
 * bio_pages array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	char *s;
	char *d;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		s = kmap(rbio->bio_pages[i]);
		d = kmap(rbio->stripe_pages[i]);

		memcpy(d, s, PAGE_CACHE_SIZE);

		kunmap(rbio->bio_pages[i]);
		kunmap(rbio->stripe_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bbio->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s)) {
			continue;
		}

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}
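
/*
 * Note that steal_rbio() transfers page ownership rather than copying:
 * the destination takes the uptodate pages and the source slots are
 * cleared so the pages are not freed twice.
 */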

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				atomic_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		atomic_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
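
/*
 * run_xor() expects the destination buffer as the final element of the
 * pages array (pages[src_cnt]), so callers pass src_cnt source pointers
 * followed by the destination pointer.
 */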

/*
 * returns true if the bio list inside this rbio
 * covers an entire stripe (no rmw required).
 * Must be called with the bio list lock held, or
 * at a time when you know it is impossible to add
 * new bios into the list
 */
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;

	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	return ret;
}

static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	ret = __rbio_is_full(rbio);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbio's though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bbio->raid_map[0] !=
	    cur->bbio->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * A parity scrub has to read the full stripe from the drive,
	 * then check and repair the parity and write back the results.
	 *
	 * We're not allowed to add any new bios to its bio list,
	 * anyone else that wants to change this stripe needs to do
	 * their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
	    cur->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
	    cur->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	return 1;
}

static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				  int index)
{
	return stripe * rbio->stripe_npages + index;
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
				     int index)
{
	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	return rbio_stripe_page(rbio, rbio->nr_data, index);
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}
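
/*
 * The helpers above treat the stripe_pages array as stripe-major:
 * stripe s, page p lives at s * stripe_npages + p, and the parity
 * stripe starts at nr_data * stripe_npages.  For example, with the
 * usual 64K stripe_len and 4K pages, stripe_npages is 16 and the P
 * stripe of a two data stripe rbio starts at index 32.
 */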

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	DEFINE_WAIT(wait);
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;
	int walk = 0;

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		walk++;
		if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
			spin_lock(&cur->bio_list_lock);

			/* can we steal this cached rbio's pages? */
			if (bio_list_empty(&cur->bio_list) &&
			    list_empty(&cur->plug_list) &&
			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				list_del_init(&cur->hash_list);
				atomic_dec(&cur->refs);

				steal_rbio(cur, rbio);
				cache_drop = cur;
				spin_unlock(&cur->bio_list_lock);

				goto lockit;
			}

			/* can we merge into the lock owner? */
			if (rbio_can_merge(cur, rbio)) {
				merge_rbio(cur, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}

			/*
			 * we couldn't merge with the running
			 * rbio, see if we can merge with the
			 * pending ones.  We don't have to
			 * check for rmw_locked because there
			 * is no way they are inside finish_rmw
			 * right now
			 */
			list_for_each_entry(pending, &cur->plug_list,
					    plug_list) {
				if (rbio_can_merge(pending, rbio)) {
					merge_rbio(pending, rbio);
					spin_unlock(&cur->bio_list_lock);
					freeit = rbio;
					ret = 1;
					goto out;
				}
			}

			/* no merging, put us on the tail of the plug list,
			 * our rbio will be started when the currently
			 * running rbio unlocks
			 */
			list_add_tail(&rbio->plug_list, &cur->plug_list);
			spin_unlock(&cur->bio_list_lock);
			ret = 1;
			goto out;
		}
	}
lockit:
	atomic_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}
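
/*
 * Lock ordering in lock_stripe_add() and unlock_stripe(): the hash
 * bucket lock (h->lock) is taken first and the rbio's bio_list_lock
 * nests inside it.
 */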

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		atomic_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			atomic_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				async_read_rebuild(next);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				async_read_rebuild(next);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				async_rmw_stripe(next);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				async_scrub_parity(next);
			}

			goto done_nolock;
			/*
			 * The barrier for this waitqueue_active is not needed,
			 * we're protected by h->lock and can't miss a wakeup.
			 */
		} else if (waitqueue_active(&h->wait)) {
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);
			wake_up(&h->wait);
			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	WARN_ON(atomic_read(&rbio->refs) < 0);
	if (!atomic_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bbio(rbio->bbio);
	kfree(rbio);
}
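
/*
 * free_raid_bio() below drops the stripe lock (possibly handing it to
 * the next queued rbio) before dropping this rbio's reference.
 */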

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	unlock_stripe(rbio);
	__free_raid_bio(rbio);
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *next;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);

	free_raid_bio(rbio);

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_error = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	int err = bio->bi_error;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;

	/* OK, we have written all the stripes we need to. */
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		err = -EIO;

	rbio_orig_end_io(rbio, err);
}
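
/*
 * Error accounting for the write path: each failed bio marks the
 * offending stripe via fail_bio_stripe(), which bumps rbio->error.
 * The whole write is only failed once more stripes have failed than
 * bbio->max_errors allows.
 */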

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes;
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
					 struct btrfs_bio *bbio, u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) *
		       sizeof(long), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->fs_info = root->fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	atomic_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages and bio_pages array point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
	rbio->stripe_pages = p;
	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
	rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;

	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
		ClearPageUptodate(page);
	}
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}
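
/*
 * alloc_rbio_parity_pages() deliberately skips the data stripes: for a
 * full stripe write the data pages can come straight from the bio_list,
 * so only the p/q stripes need pages allocated here.
 */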

/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	u64 last_end = 0;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		last_end = (u64)last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && stripe->dev->bdev &&
		    !last->bi_error &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
			if (ret == PAGE_CACHE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_size = 0;
	bio->bi_bdev = stripe->dev->bdev;
	bio->bi_iter.bi_sector = disk_start >> 9;

	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;
	struct page *p;
	int i;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		start = (u64)bio->bi_iter.bi_sector << 9;
		stripe_offset = start - rbio->bbio->raid_map[0];
		page_index = stripe_offset >> PAGE_CACHE_SHIFT;

		for (i = 0; i < bio->bi_vcnt; i++) {
			p = bio->bi_io_vec[i].bv_page;
			rbio->bio_pages[page_index + i] = p;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}
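
/*
 * index_rbio_pages() relies on the bios being page aligned within the
 * stripe: the page index is simply the byte offset of the bio from
 * raid_map[0] shifted down by the page size.
 */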
117253b381b3SDavid Woodhouse */ 117353b381b3SDavid Woodhouse static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 117453b381b3SDavid Woodhouse { 117553b381b3SDavid Woodhouse struct btrfs_bio *bbio = rbio->bbio; 11762c8cdd6eSMiao Xie void *pointers[rbio->real_stripes]; 117753b381b3SDavid Woodhouse int stripe_len = rbio->stripe_len; 117853b381b3SDavid Woodhouse int nr_data = rbio->nr_data; 117953b381b3SDavid Woodhouse int stripe; 118053b381b3SDavid Woodhouse int pagenr; 118153b381b3SDavid Woodhouse int p_stripe = -1; 118253b381b3SDavid Woodhouse int q_stripe = -1; 118353b381b3SDavid Woodhouse struct bio_list bio_list; 118453b381b3SDavid Woodhouse struct bio *bio; 118553b381b3SDavid Woodhouse int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; 118653b381b3SDavid Woodhouse int ret; 118753b381b3SDavid Woodhouse 118853b381b3SDavid Woodhouse bio_list_init(&bio_list); 118953b381b3SDavid Woodhouse 11902c8cdd6eSMiao Xie if (rbio->real_stripes - rbio->nr_data == 1) { 11912c8cdd6eSMiao Xie p_stripe = rbio->real_stripes - 1; 11922c8cdd6eSMiao Xie } else if (rbio->real_stripes - rbio->nr_data == 2) { 11932c8cdd6eSMiao Xie p_stripe = rbio->real_stripes - 2; 11942c8cdd6eSMiao Xie q_stripe = rbio->real_stripes - 1; 119553b381b3SDavid Woodhouse } else { 119653b381b3SDavid Woodhouse BUG(); 119753b381b3SDavid Woodhouse } 119853b381b3SDavid Woodhouse 119953b381b3SDavid Woodhouse /* at this point we either have a full stripe, 120053b381b3SDavid Woodhouse * or we've read the full stripe from the drive. 120153b381b3SDavid Woodhouse * recalculate the parity and write the new results. 120253b381b3SDavid Woodhouse * 120353b381b3SDavid Woodhouse * We're not allowed to add any new bios to the 120453b381b3SDavid Woodhouse * bio list here, anyone else that wants to 120553b381b3SDavid Woodhouse * change this stripe needs to do their own rmw. 120653b381b3SDavid Woodhouse */ 120753b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 120853b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 120953b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 121053b381b3SDavid Woodhouse 1211b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 121253b381b3SDavid Woodhouse 121353b381b3SDavid Woodhouse /* 121453b381b3SDavid Woodhouse * now that we've set rmw_locked, run through the 121553b381b3SDavid Woodhouse * bio list one last time and map the page pointers 12164ae10b3aSChris Mason * 12174ae10b3aSChris Mason * We don't cache full rbios because we're assuming 12184ae10b3aSChris Mason * the higher layers are unlikely to use this area of 12194ae10b3aSChris Mason * the disk again soon. If they do use it again, 12204ae10b3aSChris Mason * hopefully they will send another full bio. 
122153b381b3SDavid Woodhouse */ 122253b381b3SDavid Woodhouse index_rbio_pages(rbio); 12234ae10b3aSChris Mason if (!rbio_is_full(rbio)) 12244ae10b3aSChris Mason cache_rbio_pages(rbio); 12254ae10b3aSChris Mason else 12264ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 122753b381b3SDavid Woodhouse 122853b381b3SDavid Woodhouse for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 122953b381b3SDavid Woodhouse struct page *p; 123053b381b3SDavid Woodhouse /* first collect one page from each data stripe */ 123153b381b3SDavid Woodhouse for (stripe = 0; stripe < nr_data; stripe++) { 123253b381b3SDavid Woodhouse p = page_in_rbio(rbio, stripe, pagenr, 0); 123353b381b3SDavid Woodhouse pointers[stripe] = kmap(p); 123453b381b3SDavid Woodhouse } 123553b381b3SDavid Woodhouse 123653b381b3SDavid Woodhouse /* then add the parity stripe */ 123753b381b3SDavid Woodhouse p = rbio_pstripe_page(rbio, pagenr); 123853b381b3SDavid Woodhouse SetPageUptodate(p); 123953b381b3SDavid Woodhouse pointers[stripe++] = kmap(p); 124053b381b3SDavid Woodhouse 124153b381b3SDavid Woodhouse if (q_stripe != -1) { 124253b381b3SDavid Woodhouse 124353b381b3SDavid Woodhouse /* 124453b381b3SDavid Woodhouse * raid6, add the qstripe and call the 124553b381b3SDavid Woodhouse * library function to fill in our p/q 124653b381b3SDavid Woodhouse */ 124753b381b3SDavid Woodhouse p = rbio_qstripe_page(rbio, pagenr); 124853b381b3SDavid Woodhouse SetPageUptodate(p); 124953b381b3SDavid Woodhouse pointers[stripe++] = kmap(p); 125053b381b3SDavid Woodhouse 12512c8cdd6eSMiao Xie raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 125253b381b3SDavid Woodhouse pointers); 125353b381b3SDavid Woodhouse } else { 125453b381b3SDavid Woodhouse /* raid5 */ 125553b381b3SDavid Woodhouse memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 125653b381b3SDavid Woodhouse run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); 125753b381b3SDavid Woodhouse } 125853b381b3SDavid Woodhouse 125953b381b3SDavid Woodhouse 12602c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) 126153b381b3SDavid Woodhouse kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 126253b381b3SDavid Woodhouse } 126353b381b3SDavid Woodhouse 126453b381b3SDavid Woodhouse /* 126553b381b3SDavid Woodhouse * time to start writing. Make bios for everything from the 126653b381b3SDavid Woodhouse * higher layers (the bio_list in our rbio) and our p/q. Ignore 126753b381b3SDavid Woodhouse * everything else. 
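 *
 * (e.g. on a two-data-stripe raid5 layout where only stripe 0 was
 * dirtied, page_in_rbio() hands back stripe 0's pages from the bio
 * list, returns NULL for the untouched pages of stripe 1 so they are
 * skipped, and the freshly computed parity pages are always written.)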
126853b381b3SDavid Woodhouse */ 12692c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 127053b381b3SDavid Woodhouse for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 127153b381b3SDavid Woodhouse struct page *page; 127253b381b3SDavid Woodhouse if (stripe < rbio->nr_data) { 127353b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 1); 127453b381b3SDavid Woodhouse if (!page) 127553b381b3SDavid Woodhouse continue; 127653b381b3SDavid Woodhouse } else { 127753b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 127853b381b3SDavid Woodhouse } 127953b381b3SDavid Woodhouse 128053b381b3SDavid Woodhouse ret = rbio_add_io_page(rbio, &bio_list, 128153b381b3SDavid Woodhouse page, stripe, pagenr, rbio->stripe_len); 128253b381b3SDavid Woodhouse if (ret) 128353b381b3SDavid Woodhouse goto cleanup; 128453b381b3SDavid Woodhouse } 128553b381b3SDavid Woodhouse } 128653b381b3SDavid Woodhouse 12872c8cdd6eSMiao Xie if (likely(!bbio->num_tgtdevs)) 12882c8cdd6eSMiao Xie goto write_data; 12892c8cdd6eSMiao Xie 12902c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 12912c8cdd6eSMiao Xie if (!bbio->tgtdev_map[stripe]) 12922c8cdd6eSMiao Xie continue; 12932c8cdd6eSMiao Xie 12942c8cdd6eSMiao Xie for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 12952c8cdd6eSMiao Xie struct page *page; 12962c8cdd6eSMiao Xie if (stripe < rbio->nr_data) { 12972c8cdd6eSMiao Xie page = page_in_rbio(rbio, stripe, pagenr, 1); 12982c8cdd6eSMiao Xie if (!page) 12992c8cdd6eSMiao Xie continue; 13002c8cdd6eSMiao Xie } else { 13012c8cdd6eSMiao Xie page = rbio_stripe_page(rbio, stripe, pagenr); 13022c8cdd6eSMiao Xie } 13032c8cdd6eSMiao Xie 13042c8cdd6eSMiao Xie ret = rbio_add_io_page(rbio, &bio_list, page, 13052c8cdd6eSMiao Xie rbio->bbio->tgtdev_map[stripe], 13062c8cdd6eSMiao Xie pagenr, rbio->stripe_len); 13072c8cdd6eSMiao Xie if (ret) 13082c8cdd6eSMiao Xie goto cleanup; 13092c8cdd6eSMiao Xie } 13102c8cdd6eSMiao Xie } 13112c8cdd6eSMiao Xie 13122c8cdd6eSMiao Xie write_data: 1313b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1314b89e1b01SMiao Xie BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 131553b381b3SDavid Woodhouse 131653b381b3SDavid Woodhouse while (1) { 131753b381b3SDavid Woodhouse bio = bio_list_pop(&bio_list); 131853b381b3SDavid Woodhouse if (!bio) 131953b381b3SDavid Woodhouse break; 132053b381b3SDavid Woodhouse 132153b381b3SDavid Woodhouse bio->bi_private = rbio; 132253b381b3SDavid Woodhouse bio->bi_end_io = raid_write_end_io; 132353b381b3SDavid Woodhouse submit_bio(WRITE, bio); 132453b381b3SDavid Woodhouse } 132553b381b3SDavid Woodhouse return; 132653b381b3SDavid Woodhouse 132753b381b3SDavid Woodhouse cleanup: 13284246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, -EIO); 132953b381b3SDavid Woodhouse } 133053b381b3SDavid Woodhouse 133153b381b3SDavid Woodhouse /* 133253b381b3SDavid Woodhouse * helper to find the stripe number for a given bio. Used to figure out which 133353b381b3SDavid Woodhouse * stripe has failed. This expects the bio to correspond to a physical disk, 133453b381b3SDavid Woodhouse * so it looks up based on physical sector numbers. 
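 *
 * Roughly: a bio with bi_sector S on device D belongs to stripe i when
 *
 *   stripes[i].physical <= (S << 9) < stripes[i].physical + stripe_len
 *
 * and stripes[i].dev->bdev == D.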
133553b381b3SDavid Woodhouse */ 133653b381b3SDavid Woodhouse static int find_bio_stripe(struct btrfs_raid_bio *rbio, 133753b381b3SDavid Woodhouse struct bio *bio) 133853b381b3SDavid Woodhouse { 13394f024f37SKent Overstreet u64 physical = bio->bi_iter.bi_sector; 134053b381b3SDavid Woodhouse u64 stripe_start; 134153b381b3SDavid Woodhouse int i; 134253b381b3SDavid Woodhouse struct btrfs_bio_stripe *stripe; 134353b381b3SDavid Woodhouse 134453b381b3SDavid Woodhouse physical <<= 9; 134553b381b3SDavid Woodhouse 134653b381b3SDavid Woodhouse for (i = 0; i < rbio->bbio->num_stripes; i++) { 134753b381b3SDavid Woodhouse stripe = &rbio->bbio->stripes[i]; 134853b381b3SDavid Woodhouse stripe_start = stripe->physical; 134953b381b3SDavid Woodhouse if (physical >= stripe_start && 13502c8cdd6eSMiao Xie physical < stripe_start + rbio->stripe_len && 13512c8cdd6eSMiao Xie bio->bi_bdev == stripe->dev->bdev) { 135253b381b3SDavid Woodhouse return i; 135353b381b3SDavid Woodhouse } 135453b381b3SDavid Woodhouse } 135553b381b3SDavid Woodhouse return -1; 135653b381b3SDavid Woodhouse } 135753b381b3SDavid Woodhouse 135853b381b3SDavid Woodhouse /* 135953b381b3SDavid Woodhouse * helper to find the stripe number for a given 136053b381b3SDavid Woodhouse * bio (before mapping). Used to figure out which stripe has 136153b381b3SDavid Woodhouse * failed. This looks up based on logical block numbers. 136253b381b3SDavid Woodhouse */ 136353b381b3SDavid Woodhouse static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 136453b381b3SDavid Woodhouse struct bio *bio) 136553b381b3SDavid Woodhouse { 13664f024f37SKent Overstreet u64 logical = bio->bi_iter.bi_sector; 136753b381b3SDavid Woodhouse u64 stripe_start; 136853b381b3SDavid Woodhouse int i; 136953b381b3SDavid Woodhouse 137053b381b3SDavid Woodhouse logical <<= 9; 137153b381b3SDavid Woodhouse 137253b381b3SDavid Woodhouse for (i = 0; i < rbio->nr_data; i++) { 13738e5cfb55SZhao Lei stripe_start = rbio->bbio->raid_map[i]; 137453b381b3SDavid Woodhouse if (logical >= stripe_start && 137553b381b3SDavid Woodhouse logical < stripe_start + rbio->stripe_len) { 137653b381b3SDavid Woodhouse return i; 137753b381b3SDavid Woodhouse } 137853b381b3SDavid Woodhouse } 137953b381b3SDavid Woodhouse return -1; 138053b381b3SDavid Woodhouse } 138153b381b3SDavid Woodhouse 138253b381b3SDavid Woodhouse /* 138353b381b3SDavid Woodhouse * returns -EIO if we had too many failures 138453b381b3SDavid Woodhouse */ 138553b381b3SDavid Woodhouse static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 138653b381b3SDavid Woodhouse { 138753b381b3SDavid Woodhouse unsigned long flags; 138853b381b3SDavid Woodhouse int ret = 0; 138953b381b3SDavid Woodhouse 139053b381b3SDavid Woodhouse spin_lock_irqsave(&rbio->bio_list_lock, flags); 139153b381b3SDavid Woodhouse 139253b381b3SDavid Woodhouse /* we already know this stripe is bad, move on */ 139353b381b3SDavid Woodhouse if (rbio->faila == failed || rbio->failb == failed) 139453b381b3SDavid Woodhouse goto out; 139553b381b3SDavid Woodhouse 139653b381b3SDavid Woodhouse if (rbio->faila == -1) { 139753b381b3SDavid Woodhouse /* first failure on this rbio */ 139853b381b3SDavid Woodhouse rbio->faila = failed; 1399b89e1b01SMiao Xie atomic_inc(&rbio->error); 140053b381b3SDavid Woodhouse } else if (rbio->failb == -1) { 140153b381b3SDavid Woodhouse /* second failure on this rbio */ 140253b381b3SDavid Woodhouse rbio->failb = failed; 1403b89e1b01SMiao Xie atomic_inc(&rbio->error); 140453b381b3SDavid Woodhouse } else { 140553b381b3SDavid Woodhouse ret = -EIO; 
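		/*
		 * a third distinct stripe failure: that is more than
		 * even raid6 (P + Q) can tolerate, so give up on this
		 * rbio
		 */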
140653b381b3SDavid Woodhouse } 140753b381b3SDavid Woodhouse out: 140853b381b3SDavid Woodhouse spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 140953b381b3SDavid Woodhouse 141053b381b3SDavid Woodhouse return ret; 141153b381b3SDavid Woodhouse } 141253b381b3SDavid Woodhouse 141353b381b3SDavid Woodhouse /* 141453b381b3SDavid Woodhouse * helper to fail a stripe based on a physical disk 141553b381b3SDavid Woodhouse * bio. 141653b381b3SDavid Woodhouse */ 141753b381b3SDavid Woodhouse static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 141853b381b3SDavid Woodhouse struct bio *bio) 141953b381b3SDavid Woodhouse { 142053b381b3SDavid Woodhouse int failed = find_bio_stripe(rbio, bio); 142153b381b3SDavid Woodhouse 142253b381b3SDavid Woodhouse if (failed < 0) 142353b381b3SDavid Woodhouse return -EIO; 142453b381b3SDavid Woodhouse 142553b381b3SDavid Woodhouse return fail_rbio_index(rbio, failed); 142653b381b3SDavid Woodhouse } 142753b381b3SDavid Woodhouse 142853b381b3SDavid Woodhouse /* 142953b381b3SDavid Woodhouse * this sets each page in the bio uptodate. It should only be used on private 143053b381b3SDavid Woodhouse * rbio pages, nothing that comes in from the higher layers 143153b381b3SDavid Woodhouse */ 143253b381b3SDavid Woodhouse static void set_bio_pages_uptodate(struct bio *bio) 143353b381b3SDavid Woodhouse { 143453b381b3SDavid Woodhouse int i; 143553b381b3SDavid Woodhouse struct page *p; 143653b381b3SDavid Woodhouse 143753b381b3SDavid Woodhouse for (i = 0; i < bio->bi_vcnt; i++) { 143853b381b3SDavid Woodhouse p = bio->bi_io_vec[i].bv_page; 143953b381b3SDavid Woodhouse SetPageUptodate(p); 144053b381b3SDavid Woodhouse } 144153b381b3SDavid Woodhouse } 144253b381b3SDavid Woodhouse 144353b381b3SDavid Woodhouse /* 144453b381b3SDavid Woodhouse * end io for the read phase of the rmw cycle. All the bios here are physical 144553b381b3SDavid Woodhouse * stripe bios we've read from the disk so we can recalculate the parity of the 144653b381b3SDavid Woodhouse * stripe. 
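 * (The last read to complete is detected with atomic_dec_and_test() on
 * stripes_pending, so the checks below run exactly once per rbio.)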
144753b381b3SDavid Woodhouse * 144853b381b3SDavid Woodhouse * This will usually kick off finish_rmw once all the bios are read in, but it 144953b381b3SDavid Woodhouse * may trigger parity reconstruction if we had any errors along the way 145053b381b3SDavid Woodhouse */ 14514246a0b6SChristoph Hellwig static void raid_rmw_end_io(struct bio *bio) 145253b381b3SDavid Woodhouse { 145353b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 145453b381b3SDavid Woodhouse 14554246a0b6SChristoph Hellwig if (bio->bi_error) 145653b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 145753b381b3SDavid Woodhouse else 145853b381b3SDavid Woodhouse set_bio_pages_uptodate(bio); 145953b381b3SDavid Woodhouse 146053b381b3SDavid Woodhouse bio_put(bio); 146153b381b3SDavid Woodhouse 1462b89e1b01SMiao Xie if (!atomic_dec_and_test(&rbio->stripes_pending)) 146353b381b3SDavid Woodhouse return; 146453b381b3SDavid Woodhouse 1465b89e1b01SMiao Xie if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 146653b381b3SDavid Woodhouse goto cleanup; 146753b381b3SDavid Woodhouse 146853b381b3SDavid Woodhouse /* 146953b381b3SDavid Woodhouse * this will normally call finish_rmw to start our write 147053b381b3SDavid Woodhouse * but if there are any failed stripes we'll reconstruct 147153b381b3SDavid Woodhouse * from parity first 147253b381b3SDavid Woodhouse */ 147353b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 147453b381b3SDavid Woodhouse return; 147553b381b3SDavid Woodhouse 147653b381b3SDavid Woodhouse cleanup: 147753b381b3SDavid Woodhouse 14784246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, -EIO); 147953b381b3SDavid Woodhouse } 148053b381b3SDavid Woodhouse 148153b381b3SDavid Woodhouse static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 148253b381b3SDavid Woodhouse { 14839e0af237SLiu Bo btrfs_init_work(&rbio->work, btrfs_rmw_helper, 14849e0af237SLiu Bo rmw_work, NULL, NULL); 148553b381b3SDavid Woodhouse 1486d05a33acSQu Wenruo btrfs_queue_work(rbio->fs_info->rmw_workers, 148753b381b3SDavid Woodhouse &rbio->work); 148853b381b3SDavid Woodhouse } 148953b381b3SDavid Woodhouse 149053b381b3SDavid Woodhouse static void async_read_rebuild(struct btrfs_raid_bio *rbio) 149153b381b3SDavid Woodhouse { 14929e0af237SLiu Bo btrfs_init_work(&rbio->work, btrfs_rmw_helper, 14939e0af237SLiu Bo read_rebuild_work, NULL, NULL); 149453b381b3SDavid Woodhouse 1495d05a33acSQu Wenruo btrfs_queue_work(rbio->fs_info->rmw_workers, 149653b381b3SDavid Woodhouse &rbio->work); 149753b381b3SDavid Woodhouse } 149853b381b3SDavid Woodhouse 149953b381b3SDavid Woodhouse /* 150053b381b3SDavid Woodhouse * the stripe must be locked by the caller. 
It will 150153b381b3SDavid Woodhouse * unlock after all the writes are done 150253b381b3SDavid Woodhouse */ 150353b381b3SDavid Woodhouse static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 150453b381b3SDavid Woodhouse { 150553b381b3SDavid Woodhouse int bios_to_read = 0; 150653b381b3SDavid Woodhouse struct bio_list bio_list; 150753b381b3SDavid Woodhouse int ret; 1508ed6078f7SDavid Sterba int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 150953b381b3SDavid Woodhouse int pagenr; 151053b381b3SDavid Woodhouse int stripe; 151153b381b3SDavid Woodhouse struct bio *bio; 151253b381b3SDavid Woodhouse 151353b381b3SDavid Woodhouse bio_list_init(&bio_list); 151453b381b3SDavid Woodhouse 151553b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 151653b381b3SDavid Woodhouse if (ret) 151753b381b3SDavid Woodhouse goto cleanup; 151853b381b3SDavid Woodhouse 151953b381b3SDavid Woodhouse index_rbio_pages(rbio); 152053b381b3SDavid Woodhouse 1521b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 152253b381b3SDavid Woodhouse /* 152353b381b3SDavid Woodhouse * build a list of bios to read all the missing parts of this 152453b381b3SDavid Woodhouse * stripe 152553b381b3SDavid Woodhouse */ 152653b381b3SDavid Woodhouse for (stripe = 0; stripe < rbio->nr_data; stripe++) { 152753b381b3SDavid Woodhouse for (pagenr = 0; pagenr < nr_pages; pagenr++) { 152853b381b3SDavid Woodhouse struct page *page; 152953b381b3SDavid Woodhouse /* 153053b381b3SDavid Woodhouse * we want to find all the pages missing from 153153b381b3SDavid Woodhouse * the rbio and read them from the disk. If 153253b381b3SDavid Woodhouse * page_in_rbio finds a page in the bio list 153353b381b3SDavid Woodhouse * we don't need to read it off the stripe. 153453b381b3SDavid Woodhouse */ 153553b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 1); 153653b381b3SDavid Woodhouse if (page) 153753b381b3SDavid Woodhouse continue; 153853b381b3SDavid Woodhouse 153953b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 15404ae10b3aSChris Mason /* 15414ae10b3aSChris Mason * the bio cache may have handed us an uptodate 15424ae10b3aSChris Mason * page. If so, be happy and use it 15434ae10b3aSChris Mason */ 15444ae10b3aSChris Mason if (PageUptodate(page)) 15454ae10b3aSChris Mason continue; 15464ae10b3aSChris Mason 154753b381b3SDavid Woodhouse ret = rbio_add_io_page(rbio, &bio_list, page, 154853b381b3SDavid Woodhouse stripe, pagenr, rbio->stripe_len); 154953b381b3SDavid Woodhouse if (ret) 155053b381b3SDavid Woodhouse goto cleanup; 155153b381b3SDavid Woodhouse } 155253b381b3SDavid Woodhouse } 155353b381b3SDavid Woodhouse 155453b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 155553b381b3SDavid Woodhouse if (!bios_to_read) { 155653b381b3SDavid Woodhouse /* 155753b381b3SDavid Woodhouse * this can happen if others have merged with 155853b381b3SDavid Woodhouse * us, it means there is nothing left to read. 155953b381b3SDavid Woodhouse * But if there are missing devices it may not be 156053b381b3SDavid Woodhouse * safe to do the full stripe write yet. 156153b381b3SDavid Woodhouse */ 156253b381b3SDavid Woodhouse goto finish; 156353b381b3SDavid Woodhouse } 156453b381b3SDavid Woodhouse 156553b381b3SDavid Woodhouse /* 156653b381b3SDavid Woodhouse * the bbio may be freed once we submit the last bio. 
Make sure 156753b381b3SDavid Woodhouse * not to touch it after that 156853b381b3SDavid Woodhouse */ 1569b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 157053b381b3SDavid Woodhouse while (1) { 157153b381b3SDavid Woodhouse bio = bio_list_pop(&bio_list); 157253b381b3SDavid Woodhouse if (!bio) 157353b381b3SDavid Woodhouse break; 157453b381b3SDavid Woodhouse 157553b381b3SDavid Woodhouse bio->bi_private = rbio; 157653b381b3SDavid Woodhouse bio->bi_end_io = raid_rmw_end_io; 157753b381b3SDavid Woodhouse 157853b381b3SDavid Woodhouse btrfs_bio_wq_end_io(rbio->fs_info, bio, 157953b381b3SDavid Woodhouse BTRFS_WQ_ENDIO_RAID56); 158053b381b3SDavid Woodhouse 158153b381b3SDavid Woodhouse submit_bio(READ, bio); 158253b381b3SDavid Woodhouse } 158353b381b3SDavid Woodhouse /* the actual write will happen once the reads are done */ 158453b381b3SDavid Woodhouse return 0; 158553b381b3SDavid Woodhouse 158653b381b3SDavid Woodhouse cleanup: 15874246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, -EIO); 158853b381b3SDavid Woodhouse return -EIO; 158953b381b3SDavid Woodhouse 159053b381b3SDavid Woodhouse finish: 159153b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 159253b381b3SDavid Woodhouse return 0; 159353b381b3SDavid Woodhouse } 159453b381b3SDavid Woodhouse 159553b381b3SDavid Woodhouse /* 159653b381b3SDavid Woodhouse * if the upper layers pass in a full stripe, we thank them by only allocating 159753b381b3SDavid Woodhouse * enough pages to hold the parity, and sending it all down quickly. 159853b381b3SDavid Woodhouse */ 159953b381b3SDavid Woodhouse static int full_stripe_write(struct btrfs_raid_bio *rbio) 160053b381b3SDavid Woodhouse { 160153b381b3SDavid Woodhouse int ret; 160253b381b3SDavid Woodhouse 160353b381b3SDavid Woodhouse ret = alloc_rbio_parity_pages(rbio); 16043cd846d1SMiao Xie if (ret) { 16053cd846d1SMiao Xie __free_raid_bio(rbio); 160653b381b3SDavid Woodhouse return ret; 16073cd846d1SMiao Xie } 160853b381b3SDavid Woodhouse 160953b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 161053b381b3SDavid Woodhouse if (ret == 0) 161153b381b3SDavid Woodhouse finish_rmw(rbio); 161253b381b3SDavid Woodhouse return 0; 161353b381b3SDavid Woodhouse } 161453b381b3SDavid Woodhouse 161553b381b3SDavid Woodhouse /* 161653b381b3SDavid Woodhouse * partial stripe writes get handed over to async helpers. 161753b381b3SDavid Woodhouse * We're really hoping to merge a few more writes into this 161853b381b3SDavid Woodhouse * rbio before calculating new parity 161953b381b3SDavid Woodhouse */ 162053b381b3SDavid Woodhouse static int partial_stripe_write(struct btrfs_raid_bio *rbio) 162153b381b3SDavid Woodhouse { 162253b381b3SDavid Woodhouse int ret; 162353b381b3SDavid Woodhouse 162453b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 162553b381b3SDavid Woodhouse if (ret == 0) 162653b381b3SDavid Woodhouse async_rmw_stripe(rbio); 162753b381b3SDavid Woodhouse return 0; 162853b381b3SDavid Woodhouse } 162953b381b3SDavid Woodhouse 163053b381b3SDavid Woodhouse /* 163153b381b3SDavid Woodhouse * sometimes while we were reading from the drive to 163253b381b3SDavid Woodhouse * recalculate parity, enough new bios come into create 163353b381b3SDavid Woodhouse * a full stripe. 
So we do a check here to see if we can 163453b381b3SDavid Woodhouse * go directly to finish_rmw 163553b381b3SDavid Woodhouse */ 163653b381b3SDavid Woodhouse static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 163753b381b3SDavid Woodhouse { 163853b381b3SDavid Woodhouse /* head off into rmw land if we don't have a full stripe */ 163953b381b3SDavid Woodhouse if (!rbio_is_full(rbio)) 164053b381b3SDavid Woodhouse return partial_stripe_write(rbio); 164153b381b3SDavid Woodhouse return full_stripe_write(rbio); 164253b381b3SDavid Woodhouse } 164353b381b3SDavid Woodhouse 164453b381b3SDavid Woodhouse /* 16456ac0f488SChris Mason * We use plugging call backs to collect full stripes. 16466ac0f488SChris Mason * Any time we get a partial stripe write while plugged 16476ac0f488SChris Mason * we collect it into a list. When the unplug comes down, 16486ac0f488SChris Mason * we sort the list by logical block number and merge 16496ac0f488SChris Mason * everything we can into the same rbios 16506ac0f488SChris Mason */ 16516ac0f488SChris Mason struct btrfs_plug_cb { 16526ac0f488SChris Mason struct blk_plug_cb cb; 16536ac0f488SChris Mason struct btrfs_fs_info *info; 16546ac0f488SChris Mason struct list_head rbio_list; 16556ac0f488SChris Mason struct btrfs_work work; 16566ac0f488SChris Mason }; 16576ac0f488SChris Mason 16586ac0f488SChris Mason /* 16596ac0f488SChris Mason * rbios on the plug list are sorted for easier merging. 16606ac0f488SChris Mason */ 16616ac0f488SChris Mason static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) 16626ac0f488SChris Mason { 16636ac0f488SChris Mason struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 16646ac0f488SChris Mason plug_list); 16656ac0f488SChris Mason struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 16666ac0f488SChris Mason plug_list); 16674f024f37SKent Overstreet u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 16684f024f37SKent Overstreet u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 16696ac0f488SChris Mason 16706ac0f488SChris Mason if (a_sector < b_sector) 16716ac0f488SChris Mason return -1; 16726ac0f488SChris Mason if (a_sector > b_sector) 16736ac0f488SChris Mason return 1; 16746ac0f488SChris Mason return 0; 16756ac0f488SChris Mason } 16766ac0f488SChris Mason 16776ac0f488SChris Mason static void run_plug(struct btrfs_plug_cb *plug) 16786ac0f488SChris Mason { 16796ac0f488SChris Mason struct btrfs_raid_bio *cur; 16806ac0f488SChris Mason struct btrfs_raid_bio *last = NULL; 16816ac0f488SChris Mason 16826ac0f488SChris Mason /* 16836ac0f488SChris Mason * sort our plug list then try to merge 16846ac0f488SChris Mason * everything we can in hopes of creating full 16856ac0f488SChris Mason * stripes. 
16866ac0f488SChris Mason */ 16876ac0f488SChris Mason list_sort(NULL, &plug->rbio_list, plug_cmp); 16886ac0f488SChris Mason while (!list_empty(&plug->rbio_list)) { 16896ac0f488SChris Mason cur = list_entry(plug->rbio_list.next, 16906ac0f488SChris Mason struct btrfs_raid_bio, plug_list); 16916ac0f488SChris Mason list_del_init(&cur->plug_list); 16926ac0f488SChris Mason 16936ac0f488SChris Mason if (rbio_is_full(cur)) { 16946ac0f488SChris Mason /* we have a full stripe, send it down */ 16956ac0f488SChris Mason full_stripe_write(cur); 16966ac0f488SChris Mason continue; 16976ac0f488SChris Mason } 16986ac0f488SChris Mason if (last) { 16996ac0f488SChris Mason if (rbio_can_merge(last, cur)) { 17006ac0f488SChris Mason merge_rbio(last, cur); 17016ac0f488SChris Mason __free_raid_bio(cur); 17026ac0f488SChris Mason continue; 17036ac0f488SChris Mason 17046ac0f488SChris Mason } 17056ac0f488SChris Mason __raid56_parity_write(last); 17066ac0f488SChris Mason } 17076ac0f488SChris Mason last = cur; 17086ac0f488SChris Mason } 17096ac0f488SChris Mason if (last) { 17106ac0f488SChris Mason __raid56_parity_write(last); 17116ac0f488SChris Mason } 17126ac0f488SChris Mason kfree(plug); 17136ac0f488SChris Mason } 17146ac0f488SChris Mason 17156ac0f488SChris Mason /* 17166ac0f488SChris Mason * if the unplug comes from schedule, we have to push the 17176ac0f488SChris Mason * work off to a helper thread 17186ac0f488SChris Mason */ 17196ac0f488SChris Mason static void unplug_work(struct btrfs_work *work) 17206ac0f488SChris Mason { 17216ac0f488SChris Mason struct btrfs_plug_cb *plug; 17226ac0f488SChris Mason plug = container_of(work, struct btrfs_plug_cb, work); 17236ac0f488SChris Mason run_plug(plug); 17246ac0f488SChris Mason } 17256ac0f488SChris Mason 17266ac0f488SChris Mason static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 17276ac0f488SChris Mason { 17286ac0f488SChris Mason struct btrfs_plug_cb *plug; 17296ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 17306ac0f488SChris Mason 17316ac0f488SChris Mason if (from_schedule) { 17329e0af237SLiu Bo btrfs_init_work(&plug->work, btrfs_rmw_helper, 17339e0af237SLiu Bo unplug_work, NULL, NULL); 1734d05a33acSQu Wenruo btrfs_queue_work(plug->info->rmw_workers, 17356ac0f488SChris Mason &plug->work); 17366ac0f488SChris Mason return; 17376ac0f488SChris Mason } 17386ac0f488SChris Mason run_plug(plug); 17396ac0f488SChris Mason } 17406ac0f488SChris Mason 17416ac0f488SChris Mason /* 174253b381b3SDavid Woodhouse * our main entry point for writes from the rest of the FS. 
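 *
 * Rough flow (as implemented below): allocate an rbio for the bio and
 * mark it BTRFS_RBIO_WRITE, then
 *   - a full-stripe rbio is sent straight down via full_stripe_write(),
 *   - otherwise, if the task has a block plug, the rbio is parked on
 *     the plug list so later writes can be merged at unplug time,
 *   - otherwise __raid56_parity_write() handles it right away.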
174353b381b3SDavid Woodhouse */ 174453b381b3SDavid Woodhouse int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 17458e5cfb55SZhao Lei struct btrfs_bio *bbio, u64 stripe_len) 174653b381b3SDavid Woodhouse { 174753b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 17486ac0f488SChris Mason struct btrfs_plug_cb *plug = NULL; 17496ac0f488SChris Mason struct blk_plug_cb *cb; 17504245215dSMiao Xie int ret; 175153b381b3SDavid Woodhouse 17528e5cfb55SZhao Lei rbio = alloc_rbio(root, bbio, stripe_len); 1753af8e2d1dSMiao Xie if (IS_ERR(rbio)) { 17546e9606d2SZhao Lei btrfs_put_bbio(bbio); 175553b381b3SDavid Woodhouse return PTR_ERR(rbio); 1756af8e2d1dSMiao Xie } 175753b381b3SDavid Woodhouse bio_list_add(&rbio->bio_list, bio); 17584f024f37SKent Overstreet rbio->bio_list_bytes = bio->bi_iter.bi_size; 17591b94b556SMiao Xie rbio->operation = BTRFS_RBIO_WRITE; 17606ac0f488SChris Mason 17614245215dSMiao Xie btrfs_bio_counter_inc_noblocked(root->fs_info); 17624245215dSMiao Xie rbio->generic_bio_cnt = 1; 17634245215dSMiao Xie 17646ac0f488SChris Mason /* 17656ac0f488SChris Mason * don't plug on full rbios, just get them out the door 17666ac0f488SChris Mason * as quickly as we can 17676ac0f488SChris Mason */ 17684245215dSMiao Xie if (rbio_is_full(rbio)) { 17694245215dSMiao Xie ret = full_stripe_write(rbio); 17704245215dSMiao Xie if (ret) 17714245215dSMiao Xie btrfs_bio_counter_dec(root->fs_info); 17724245215dSMiao Xie return ret; 17734245215dSMiao Xie } 17746ac0f488SChris Mason 17756ac0f488SChris Mason cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 17766ac0f488SChris Mason sizeof(*plug)); 17776ac0f488SChris Mason if (cb) { 17786ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 17796ac0f488SChris Mason if (!plug->info) { 17806ac0f488SChris Mason plug->info = root->fs_info; 17816ac0f488SChris Mason INIT_LIST_HEAD(&plug->rbio_list); 17826ac0f488SChris Mason } 17836ac0f488SChris Mason list_add_tail(&rbio->plug_list, &plug->rbio_list); 17844245215dSMiao Xie ret = 0; 17856ac0f488SChris Mason } else { 17864245215dSMiao Xie ret = __raid56_parity_write(rbio); 17874245215dSMiao Xie if (ret) 17884245215dSMiao Xie btrfs_bio_counter_dec(root->fs_info); 178953b381b3SDavid Woodhouse } 17904245215dSMiao Xie return ret; 17916ac0f488SChris Mason } 179253b381b3SDavid Woodhouse 179353b381b3SDavid Woodhouse /* 179453b381b3SDavid Woodhouse * all parity reconstruction happens here. We've read in everything 179553b381b3SDavid Woodhouse * we can find from the drives and this does the heavy lifting of 179653b381b3SDavid Woodhouse * sorting the good from the bad. 
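 *
 * For example (a rough summary of the cases handled below):
 *   - a single bad data stripe: copy P over the bad page and XOR the
 *     surviving data pages back in (the "pstripe" path)
 *   - bad data + bad Q on raid6: rebuilt the same way from P
 *   - bad data + bad P on raid6: raid6_datap_recov()
 *   - two bad data stripes on raid6: raid6_2data_recov()
 *   - only P bad: currently returns -EIO (see the TODO below)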
179753b381b3SDavid Woodhouse */ 179853b381b3SDavid Woodhouse static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 179953b381b3SDavid Woodhouse { 180053b381b3SDavid Woodhouse int pagenr, stripe; 180153b381b3SDavid Woodhouse void **pointers; 180253b381b3SDavid Woodhouse int faila = -1, failb = -1; 1803ed6078f7SDavid Sterba int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 180453b381b3SDavid Woodhouse struct page *page; 180553b381b3SDavid Woodhouse int err; 180653b381b3SDavid Woodhouse int i; 180753b381b3SDavid Woodhouse 180831e818feSDavid Sterba pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 180953b381b3SDavid Woodhouse if (!pointers) { 181053b381b3SDavid Woodhouse err = -ENOMEM; 181153b381b3SDavid Woodhouse goto cleanup_io; 181253b381b3SDavid Woodhouse } 181353b381b3SDavid Woodhouse 181453b381b3SDavid Woodhouse faila = rbio->faila; 181553b381b3SDavid Woodhouse failb = rbio->failb; 181653b381b3SDavid Woodhouse 1817b4ee1782SOmar Sandoval if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1818b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 181953b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 182053b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 182153b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 182253b381b3SDavid Woodhouse } 182353b381b3SDavid Woodhouse 182453b381b3SDavid Woodhouse index_rbio_pages(rbio); 182553b381b3SDavid Woodhouse 182653b381b3SDavid Woodhouse for (pagenr = 0; pagenr < nr_pages; pagenr++) { 18275a6ac9eaSMiao Xie /* 18285a6ac9eaSMiao Xie * Now we just use bitmap to mark the horizontal stripes in 18295a6ac9eaSMiao Xie * which we have data when doing parity scrub. 18305a6ac9eaSMiao Xie */ 18315a6ac9eaSMiao Xie if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 18325a6ac9eaSMiao Xie !test_bit(pagenr, rbio->dbitmap)) 18335a6ac9eaSMiao Xie continue; 18345a6ac9eaSMiao Xie 183553b381b3SDavid Woodhouse /* setup our array of pointers with pages 183653b381b3SDavid Woodhouse * from each stripe 183753b381b3SDavid Woodhouse */ 18382c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 183953b381b3SDavid Woodhouse /* 184053b381b3SDavid Woodhouse * if we're rebuilding a read, we have to use 184153b381b3SDavid Woodhouse * pages from the bio list 184253b381b3SDavid Woodhouse */ 1843b4ee1782SOmar Sandoval if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1844b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 184553b381b3SDavid Woodhouse (stripe == faila || stripe == failb)) { 184653b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 0); 184753b381b3SDavid Woodhouse } else { 184853b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 184953b381b3SDavid Woodhouse } 185053b381b3SDavid Woodhouse pointers[stripe] = kmap(page); 185153b381b3SDavid Woodhouse } 185253b381b3SDavid Woodhouse 185353b381b3SDavid Woodhouse /* all raid6 handling here */ 185410f11900SZhao Lei if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { 185553b381b3SDavid Woodhouse /* 185653b381b3SDavid Woodhouse * single failure, rebuild from parity raid5 185753b381b3SDavid Woodhouse * style 185853b381b3SDavid Woodhouse */ 185953b381b3SDavid Woodhouse if (failb < 0) { 186053b381b3SDavid Woodhouse if (faila == rbio->nr_data) { 186153b381b3SDavid Woodhouse /* 186253b381b3SDavid Woodhouse * Just the P stripe has failed, without 186353b381b3SDavid Woodhouse * a bad data or Q stripe. 186453b381b3SDavid Woodhouse * TODO, we should redo the xor here. 
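 * (i.e. regenerate P from the data pages with run_xor() instead of
 * failing the whole rbio)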
186553b381b3SDavid Woodhouse */ 186653b381b3SDavid Woodhouse err = -EIO; 186753b381b3SDavid Woodhouse goto cleanup; 186853b381b3SDavid Woodhouse } 186953b381b3SDavid Woodhouse /* 187053b381b3SDavid Woodhouse * a single failure in raid6 is rebuilt 187153b381b3SDavid Woodhouse * in the pstripe code below 187253b381b3SDavid Woodhouse */ 187353b381b3SDavid Woodhouse goto pstripe; 187453b381b3SDavid Woodhouse } 187553b381b3SDavid Woodhouse 187653b381b3SDavid Woodhouse /* make sure our ps and qs are in order */ 187753b381b3SDavid Woodhouse if (faila > failb) { 187853b381b3SDavid Woodhouse int tmp = failb; 187953b381b3SDavid Woodhouse failb = faila; 188053b381b3SDavid Woodhouse faila = tmp; 188153b381b3SDavid Woodhouse } 188253b381b3SDavid Woodhouse 188353b381b3SDavid Woodhouse /* if the q stripe is failed, do a pstripe reconstruction 188453b381b3SDavid Woodhouse * from the xors. 188553b381b3SDavid Woodhouse * If both the q stripe and the P stripe are failed, we're 188653b381b3SDavid Woodhouse * here due to a crc mismatch and we can't give them the 188753b381b3SDavid Woodhouse * data they want 188853b381b3SDavid Woodhouse */ 18898e5cfb55SZhao Lei if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { 18908e5cfb55SZhao Lei if (rbio->bbio->raid_map[faila] == 18918e5cfb55SZhao Lei RAID5_P_STRIPE) { 189253b381b3SDavid Woodhouse err = -EIO; 189353b381b3SDavid Woodhouse goto cleanup; 189453b381b3SDavid Woodhouse } 189553b381b3SDavid Woodhouse /* 189653b381b3SDavid Woodhouse * otherwise we have one bad data stripe and 189753b381b3SDavid Woodhouse * a good P stripe. raid5! 189853b381b3SDavid Woodhouse */ 189953b381b3SDavid Woodhouse goto pstripe; 190053b381b3SDavid Woodhouse } 190153b381b3SDavid Woodhouse 19028e5cfb55SZhao Lei if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { 19032c8cdd6eSMiao Xie raid6_datap_recov(rbio->real_stripes, 190453b381b3SDavid Woodhouse PAGE_SIZE, faila, pointers); 190553b381b3SDavid Woodhouse } else { 19062c8cdd6eSMiao Xie raid6_2data_recov(rbio->real_stripes, 190753b381b3SDavid Woodhouse PAGE_SIZE, faila, failb, 190853b381b3SDavid Woodhouse pointers); 190953b381b3SDavid Woodhouse } 191053b381b3SDavid Woodhouse } else { 191153b381b3SDavid Woodhouse void *p; 191253b381b3SDavid Woodhouse 191353b381b3SDavid Woodhouse /* rebuild from P stripe here (raid5 or raid6) */ 191453b381b3SDavid Woodhouse BUG_ON(failb != -1); 191553b381b3SDavid Woodhouse pstripe: 191653b381b3SDavid Woodhouse /* Copy parity block into failed block to start with */ 191753b381b3SDavid Woodhouse memcpy(pointers[faila], 191853b381b3SDavid Woodhouse pointers[rbio->nr_data], 191953b381b3SDavid Woodhouse PAGE_CACHE_SIZE); 192053b381b3SDavid Woodhouse 192153b381b3SDavid Woodhouse /* rearrange the pointer array */ 192253b381b3SDavid Woodhouse p = pointers[faila]; 192353b381b3SDavid Woodhouse for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 192453b381b3SDavid Woodhouse pointers[stripe] = pointers[stripe + 1]; 192553b381b3SDavid Woodhouse pointers[rbio->nr_data - 1] = p; 192653b381b3SDavid Woodhouse 192753b381b3SDavid Woodhouse /* xor in the rest */ 192853b381b3SDavid Woodhouse run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); 192953b381b3SDavid Woodhouse } 193053b381b3SDavid Woodhouse /* if we're doing this rebuild as part of an rmw, go through 193153b381b3SDavid Woodhouse * and set all of our private rbio pages in the 193253b381b3SDavid Woodhouse * failed stripes as uptodate. This way finish_rmw will 193353b381b3SDavid Woodhouse * know they can be trusted. 
If this was a read reconstruction, 193453b381b3SDavid Woodhouse * other endio functions will fiddle the uptodate bits 193553b381b3SDavid Woodhouse */ 19361b94b556SMiao Xie if (rbio->operation == BTRFS_RBIO_WRITE) { 193753b381b3SDavid Woodhouse for (i = 0; i < nr_pages; i++) { 193853b381b3SDavid Woodhouse if (faila != -1) { 193953b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, faila, i); 194053b381b3SDavid Woodhouse SetPageUptodate(page); 194153b381b3SDavid Woodhouse } 194253b381b3SDavid Woodhouse if (failb != -1) { 194353b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, failb, i); 194453b381b3SDavid Woodhouse SetPageUptodate(page); 194553b381b3SDavid Woodhouse } 194653b381b3SDavid Woodhouse } 194753b381b3SDavid Woodhouse } 19482c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 194953b381b3SDavid Woodhouse /* 195053b381b3SDavid Woodhouse * if we're rebuilding a read, we have to use 195153b381b3SDavid Woodhouse * pages from the bio list 195253b381b3SDavid Woodhouse */ 1953b4ee1782SOmar Sandoval if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1954b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 195553b381b3SDavid Woodhouse (stripe == faila || stripe == failb)) { 195653b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 0); 195753b381b3SDavid Woodhouse } else { 195853b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 195953b381b3SDavid Woodhouse } 196053b381b3SDavid Woodhouse kunmap(page); 196153b381b3SDavid Woodhouse } 196253b381b3SDavid Woodhouse } 196353b381b3SDavid Woodhouse 196453b381b3SDavid Woodhouse err = 0; 196553b381b3SDavid Woodhouse cleanup: 196653b381b3SDavid Woodhouse kfree(pointers); 196753b381b3SDavid Woodhouse 196853b381b3SDavid Woodhouse cleanup_io: 19691b94b556SMiao Xie if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 19706e9606d2SZhao Lei if (err == 0) 19714ae10b3aSChris Mason cache_rbio_pages(rbio); 19724ae10b3aSChris Mason else 19734ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 19744ae10b3aSChris Mason 19754246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, err); 1976b4ee1782SOmar Sandoval } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 197722365979SLinus Torvalds rbio_orig_end_io(rbio, err); 197853b381b3SDavid Woodhouse } else if (err == 0) { 197953b381b3SDavid Woodhouse rbio->faila = -1; 198053b381b3SDavid Woodhouse rbio->failb = -1; 19815a6ac9eaSMiao Xie 19825a6ac9eaSMiao Xie if (rbio->operation == BTRFS_RBIO_WRITE) 198353b381b3SDavid Woodhouse finish_rmw(rbio); 19845a6ac9eaSMiao Xie else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 19855a6ac9eaSMiao Xie finish_parity_scrub(rbio, 0); 19865a6ac9eaSMiao Xie else 19875a6ac9eaSMiao Xie BUG(); 198853b381b3SDavid Woodhouse } else { 19894246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, err); 199053b381b3SDavid Woodhouse } 199153b381b3SDavid Woodhouse } 199253b381b3SDavid Woodhouse 199353b381b3SDavid Woodhouse /* 199453b381b3SDavid Woodhouse * This is called only for stripes we've read from disk to 199553b381b3SDavid Woodhouse * reconstruct the parity. 
199653b381b3SDavid Woodhouse */ 19974246a0b6SChristoph Hellwig static void raid_recover_end_io(struct bio *bio) 199853b381b3SDavid Woodhouse { 199953b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 200053b381b3SDavid Woodhouse 200153b381b3SDavid Woodhouse /* 200253b381b3SDavid Woodhouse * we only read stripe pages off the disk, set them 200353b381b3SDavid Woodhouse * up to date if there were no errors 200453b381b3SDavid Woodhouse */ 20054246a0b6SChristoph Hellwig if (bio->bi_error) 200653b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 200753b381b3SDavid Woodhouse else 200853b381b3SDavid Woodhouse set_bio_pages_uptodate(bio); 200953b381b3SDavid Woodhouse bio_put(bio); 201053b381b3SDavid Woodhouse 2011b89e1b01SMiao Xie if (!atomic_dec_and_test(&rbio->stripes_pending)) 201253b381b3SDavid Woodhouse return; 201353b381b3SDavid Woodhouse 2014b89e1b01SMiao Xie if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 20154246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, -EIO); 201653b381b3SDavid Woodhouse else 201753b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 201853b381b3SDavid Woodhouse } 201953b381b3SDavid Woodhouse 202053b381b3SDavid Woodhouse /* 202153b381b3SDavid Woodhouse * reads everything we need off the disk to reconstruct 202253b381b3SDavid Woodhouse * the parity. endio handlers trigger final reconstruction 202353b381b3SDavid Woodhouse * when the IO is done. 202453b381b3SDavid Woodhouse * 202553b381b3SDavid Woodhouse * This is used both for reads from the higher layers and for 202653b381b3SDavid Woodhouse * parity construction required to finish a rmw cycle. 202753b381b3SDavid Woodhouse */ 202853b381b3SDavid Woodhouse static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 202953b381b3SDavid Woodhouse { 203053b381b3SDavid Woodhouse int bios_to_read = 0; 203153b381b3SDavid Woodhouse struct bio_list bio_list; 203253b381b3SDavid Woodhouse int ret; 2033ed6078f7SDavid Sterba int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); 203453b381b3SDavid Woodhouse int pagenr; 203553b381b3SDavid Woodhouse int stripe; 203653b381b3SDavid Woodhouse struct bio *bio; 203753b381b3SDavid Woodhouse 203853b381b3SDavid Woodhouse bio_list_init(&bio_list); 203953b381b3SDavid Woodhouse 204053b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 204153b381b3SDavid Woodhouse if (ret) 204253b381b3SDavid Woodhouse goto cleanup; 204353b381b3SDavid Woodhouse 2044b89e1b01SMiao Xie atomic_set(&rbio->error, 0); 204553b381b3SDavid Woodhouse 204653b381b3SDavid Woodhouse /* 20474ae10b3aSChris Mason * read everything that hasn't failed. Thanks to the 20484ae10b3aSChris Mason * stripe cache, it is possible that some or all of these 20494ae10b3aSChris Mason * pages are going to be uptodate. 
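 * (Stripes already marked failed are just counted into rbio->error and
 * skipped, and any page that is PageUptodate, e.g. left over from a
 * cached rmw, is not read again.)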
205053b381b3SDavid Woodhouse */ 20512c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 20525588383eSLiu Bo if (rbio->faila == stripe || rbio->failb == stripe) { 2053b89e1b01SMiao Xie atomic_inc(&rbio->error); 205453b381b3SDavid Woodhouse continue; 20555588383eSLiu Bo } 205653b381b3SDavid Woodhouse 205753b381b3SDavid Woodhouse for (pagenr = 0; pagenr < nr_pages; pagenr++) { 205853b381b3SDavid Woodhouse struct page *p; 205953b381b3SDavid Woodhouse 206053b381b3SDavid Woodhouse /* 206153b381b3SDavid Woodhouse * the rmw code may have already read this 206253b381b3SDavid Woodhouse * page in 206353b381b3SDavid Woodhouse */ 206453b381b3SDavid Woodhouse p = rbio_stripe_page(rbio, stripe, pagenr); 206553b381b3SDavid Woodhouse if (PageUptodate(p)) 206653b381b3SDavid Woodhouse continue; 206753b381b3SDavid Woodhouse 206853b381b3SDavid Woodhouse ret = rbio_add_io_page(rbio, &bio_list, 206953b381b3SDavid Woodhouse rbio_stripe_page(rbio, stripe, pagenr), 207053b381b3SDavid Woodhouse stripe, pagenr, rbio->stripe_len); 207153b381b3SDavid Woodhouse if (ret < 0) 207253b381b3SDavid Woodhouse goto cleanup; 207353b381b3SDavid Woodhouse } 207453b381b3SDavid Woodhouse } 207553b381b3SDavid Woodhouse 207653b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 207753b381b3SDavid Woodhouse if (!bios_to_read) { 207853b381b3SDavid Woodhouse /* 207953b381b3SDavid Woodhouse * we might have no bios to read just because the pages 208053b381b3SDavid Woodhouse * were up to date, or we might have no bios to read because 208153b381b3SDavid Woodhouse * the devices were gone. 208253b381b3SDavid Woodhouse */ 2083b89e1b01SMiao Xie if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { 208453b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 208553b381b3SDavid Woodhouse goto out; 208653b381b3SDavid Woodhouse } else { 208753b381b3SDavid Woodhouse goto cleanup; 208853b381b3SDavid Woodhouse } 208953b381b3SDavid Woodhouse } 209053b381b3SDavid Woodhouse 209153b381b3SDavid Woodhouse /* 209253b381b3SDavid Woodhouse * the bbio may be freed once we submit the last bio. Make sure 209353b381b3SDavid Woodhouse * not to touch it after that 209453b381b3SDavid Woodhouse */ 2095b89e1b01SMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 209653b381b3SDavid Woodhouse while (1) { 209753b381b3SDavid Woodhouse bio = bio_list_pop(&bio_list); 209853b381b3SDavid Woodhouse if (!bio) 209953b381b3SDavid Woodhouse break; 210053b381b3SDavid Woodhouse 210153b381b3SDavid Woodhouse bio->bi_private = rbio; 210253b381b3SDavid Woodhouse bio->bi_end_io = raid_recover_end_io; 210353b381b3SDavid Woodhouse 210453b381b3SDavid Woodhouse btrfs_bio_wq_end_io(rbio->fs_info, bio, 210553b381b3SDavid Woodhouse BTRFS_WQ_ENDIO_RAID56); 210653b381b3SDavid Woodhouse 210753b381b3SDavid Woodhouse submit_bio(READ, bio); 210853b381b3SDavid Woodhouse } 210953b381b3SDavid Woodhouse out: 211053b381b3SDavid Woodhouse return 0; 211153b381b3SDavid Woodhouse 211253b381b3SDavid Woodhouse cleanup: 2113b4ee1782SOmar Sandoval if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2114b4ee1782SOmar Sandoval rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 21154246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, -EIO); 211653b381b3SDavid Woodhouse return -EIO; 211753b381b3SDavid Woodhouse } 211853b381b3SDavid Woodhouse 211953b381b3SDavid Woodhouse /* 212053b381b3SDavid Woodhouse * the main entry point for reads from the higher layers. 
This 212153b381b3SDavid Woodhouse * is really only called when the normal read path had a failure, 212253b381b3SDavid Woodhouse * so we assume the bio they send down corresponds to a failed part 212353b381b3SDavid Woodhouse * of the drive. 212453b381b3SDavid Woodhouse */ 212553b381b3SDavid Woodhouse int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 21268e5cfb55SZhao Lei struct btrfs_bio *bbio, u64 stripe_len, 21278e5cfb55SZhao Lei int mirror_num, int generic_io) 212853b381b3SDavid Woodhouse { 212953b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 213053b381b3SDavid Woodhouse int ret; 213153b381b3SDavid Woodhouse 21328e5cfb55SZhao Lei rbio = alloc_rbio(root, bbio, stripe_len); 2133af8e2d1dSMiao Xie if (IS_ERR(rbio)) { 21346e9606d2SZhao Lei if (generic_io) 21356e9606d2SZhao Lei btrfs_put_bbio(bbio); 213653b381b3SDavid Woodhouse return PTR_ERR(rbio); 2137af8e2d1dSMiao Xie } 213853b381b3SDavid Woodhouse 21391b94b556SMiao Xie rbio->operation = BTRFS_RBIO_READ_REBUILD; 214053b381b3SDavid Woodhouse bio_list_add(&rbio->bio_list, bio); 21414f024f37SKent Overstreet rbio->bio_list_bytes = bio->bi_iter.bi_size; 214253b381b3SDavid Woodhouse 214353b381b3SDavid Woodhouse rbio->faila = find_logical_bio_stripe(rbio, bio); 214453b381b3SDavid Woodhouse if (rbio->faila == -1) { 214553b381b3SDavid Woodhouse BUG(); 21466e9606d2SZhao Lei if (generic_io) 21476e9606d2SZhao Lei btrfs_put_bbio(bbio); 214853b381b3SDavid Woodhouse kfree(rbio); 214953b381b3SDavid Woodhouse return -EIO; 215053b381b3SDavid Woodhouse } 215153b381b3SDavid Woodhouse 21524245215dSMiao Xie if (generic_io) { 21534245215dSMiao Xie btrfs_bio_counter_inc_noblocked(root->fs_info); 21544245215dSMiao Xie rbio->generic_bio_cnt = 1; 21554245215dSMiao Xie } else { 21566e9606d2SZhao Lei btrfs_get_bbio(bbio); 21574245215dSMiao Xie } 21584245215dSMiao Xie 215953b381b3SDavid Woodhouse /* 216053b381b3SDavid Woodhouse * reconstruct from the q stripe if they are 216153b381b3SDavid Woodhouse * asking for mirror 3 216253b381b3SDavid Woodhouse */ 216353b381b3SDavid Woodhouse if (mirror_num == 3) 21642c8cdd6eSMiao Xie rbio->failb = rbio->real_stripes - 2; 216553b381b3SDavid Woodhouse 216653b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 216753b381b3SDavid Woodhouse 216853b381b3SDavid Woodhouse /* 216953b381b3SDavid Woodhouse * __raid56_parity_recover will end the bio with 217053b381b3SDavid Woodhouse * any errors it hits. 
We don't want to return
217153b381b3SDavid Woodhouse 	 * its error value up the stack because our caller
217253b381b3SDavid Woodhouse 	 * will end up calling bio_endio with any nonzero
217353b381b3SDavid Woodhouse 	 * return
217453b381b3SDavid Woodhouse 	 */
217553b381b3SDavid Woodhouse 	if (ret == 0)
217653b381b3SDavid Woodhouse 		__raid56_parity_recover(rbio);
217753b381b3SDavid Woodhouse 	/*
217853b381b3SDavid Woodhouse 	 * our rbio has been added to the list of
217953b381b3SDavid Woodhouse 	 * rbios that will be handled after the
218053b381b3SDavid Woodhouse 	 * current lock owner is done
218153b381b3SDavid Woodhouse 	 */
218253b381b3SDavid Woodhouse 	return 0;
218353b381b3SDavid Woodhouse 
218453b381b3SDavid Woodhouse }
218553b381b3SDavid Woodhouse 
218653b381b3SDavid Woodhouse static void rmw_work(struct btrfs_work *work)
218753b381b3SDavid Woodhouse {
218853b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio;
218953b381b3SDavid Woodhouse 
219053b381b3SDavid Woodhouse 	rbio = container_of(work, struct btrfs_raid_bio, work);
219153b381b3SDavid Woodhouse 	raid56_rmw_stripe(rbio);
219253b381b3SDavid Woodhouse }
219353b381b3SDavid Woodhouse 
219453b381b3SDavid Woodhouse static void read_rebuild_work(struct btrfs_work *work)
219553b381b3SDavid Woodhouse {
219653b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio;
219753b381b3SDavid Woodhouse 
219853b381b3SDavid Woodhouse 	rbio = container_of(work, struct btrfs_raid_bio, work);
219953b381b3SDavid Woodhouse 	__raid56_parity_recover(rbio);
220053b381b3SDavid Woodhouse }
22015a6ac9eaSMiao Xie 
22025a6ac9eaSMiao Xie /*
22035a6ac9eaSMiao Xie  * The following code is used to scrub/replace the parity stripe
22045a6ac9eaSMiao Xie  *
22055a6ac9eaSMiao Xie  * Note: we need to make sure that all the pages added to the scrub/replace
22065a6ac9eaSMiao Xie  * raid bio are correct and will not be changed during the scrub/replace; that
22075a6ac9eaSMiao Xie  * is, those pages only hold metadata or file data protected by a checksum.
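 *
 * (Rough picture of the flow below: the scrub rbio carries a dbitmap
 * of the horizontal sectors that hold verified data; only the parity
 * for those sectors is recomputed in finish_parity_scrub(), and only
 * the pages whose on-disk parity differs from the recomputed value are
 * written back, plus copies to the dev-replace target if one is
 * configured.)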
22085a6ac9eaSMiao Xie  */
22095a6ac9eaSMiao Xie 
22105a6ac9eaSMiao Xie struct btrfs_raid_bio *
22115a6ac9eaSMiao Xie raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
22128e5cfb55SZhao Lei 			       struct btrfs_bio *bbio, u64 stripe_len,
22138e5cfb55SZhao Lei 			       struct btrfs_device *scrub_dev,
22145a6ac9eaSMiao Xie 			       unsigned long *dbitmap, int stripe_nsectors)
22155a6ac9eaSMiao Xie {
22165a6ac9eaSMiao Xie 	struct btrfs_raid_bio *rbio;
22175a6ac9eaSMiao Xie 	int i;
22185a6ac9eaSMiao Xie 
22198e5cfb55SZhao Lei 	rbio = alloc_rbio(root, bbio, stripe_len);
22205a6ac9eaSMiao Xie 	if (IS_ERR(rbio))
22215a6ac9eaSMiao Xie 		return NULL;
22225a6ac9eaSMiao Xie 	bio_list_add(&rbio->bio_list, bio);
22235a6ac9eaSMiao Xie 	/*
22245a6ac9eaSMiao Xie 	 * This is a special bio which is used to hold the completion handler
22255a6ac9eaSMiao Xie 	 * and make the scrub rbio behave the same way as the other rbio types
22265a6ac9eaSMiao Xie 	 */
22275a6ac9eaSMiao Xie 	ASSERT(!bio->bi_iter.bi_size);
22285a6ac9eaSMiao Xie 	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
22295a6ac9eaSMiao Xie 
22302c8cdd6eSMiao Xie 	for (i = 0; i < rbio->real_stripes; i++) {
22315a6ac9eaSMiao Xie 		if (bbio->stripes[i].dev == scrub_dev) {
22325a6ac9eaSMiao Xie 			rbio->scrubp = i;
22335a6ac9eaSMiao Xie 			break;
22345a6ac9eaSMiao Xie 		}
22355a6ac9eaSMiao Xie 	}
22365a6ac9eaSMiao Xie 
22375a6ac9eaSMiao Xie 	/* For now we only support a sectorsize equal to the page size */
22385a6ac9eaSMiao Xie 	ASSERT(root->sectorsize == PAGE_SIZE);
22395a6ac9eaSMiao Xie 	ASSERT(rbio->stripe_npages == stripe_nsectors);
22405a6ac9eaSMiao Xie 	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
22415a6ac9eaSMiao Xie 
22425a6ac9eaSMiao Xie 	return rbio;
22435a6ac9eaSMiao Xie }
22445a6ac9eaSMiao Xie 
2245b4ee1782SOmar Sandoval /* Used for both parity scrub and missing. */
2246b4ee1782SOmar Sandoval void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2247b4ee1782SOmar Sandoval 			    u64 logical)
22485a6ac9eaSMiao Xie {
22495a6ac9eaSMiao Xie 	int stripe_offset;
22505a6ac9eaSMiao Xie 	int index;
22515a6ac9eaSMiao Xie 
22528e5cfb55SZhao Lei 	ASSERT(logical >= rbio->bbio->raid_map[0]);
22538e5cfb55SZhao Lei 	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
22545a6ac9eaSMiao Xie 				rbio->stripe_len * rbio->nr_data);
22558e5cfb55SZhao Lei 	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
22565a6ac9eaSMiao Xie 	index = stripe_offset >> PAGE_CACHE_SHIFT;
22575a6ac9eaSMiao Xie 	rbio->bio_pages[index] = page;
22585a6ac9eaSMiao Xie }
22595a6ac9eaSMiao Xie 
22605a6ac9eaSMiao Xie /*
22615a6ac9eaSMiao Xie  * We only scrub the parity for the horizontal stripes where we have correct
22625a6ac9eaSMiao Xie  * data, so we don't need to allocate pages for all the stripes.
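 *
 * (stripe_pages[] is laid out stripe by stripe, so the page for bit
 * 'bit' of stripe 'i' lives at index i * stripe_npages + bit, and only
 * the bits set in dbitmap get a page allocated here.)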
22635a6ac9eaSMiao Xie */ 22645a6ac9eaSMiao Xie static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 22655a6ac9eaSMiao Xie { 22665a6ac9eaSMiao Xie int i; 22675a6ac9eaSMiao Xie int bit; 22685a6ac9eaSMiao Xie int index; 22695a6ac9eaSMiao Xie struct page *page; 22705a6ac9eaSMiao Xie 22715a6ac9eaSMiao Xie for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { 22722c8cdd6eSMiao Xie for (i = 0; i < rbio->real_stripes; i++) { 22735a6ac9eaSMiao Xie index = i * rbio->stripe_npages + bit; 22745a6ac9eaSMiao Xie if (rbio->stripe_pages[index]) 22755a6ac9eaSMiao Xie continue; 22765a6ac9eaSMiao Xie 22775a6ac9eaSMiao Xie page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 22785a6ac9eaSMiao Xie if (!page) 22795a6ac9eaSMiao Xie return -ENOMEM; 22805a6ac9eaSMiao Xie rbio->stripe_pages[index] = page; 22815a6ac9eaSMiao Xie ClearPageUptodate(page); 22825a6ac9eaSMiao Xie } 22835a6ac9eaSMiao Xie } 22845a6ac9eaSMiao Xie return 0; 22855a6ac9eaSMiao Xie } 22865a6ac9eaSMiao Xie 22875a6ac9eaSMiao Xie /* 22885a6ac9eaSMiao Xie * end io function used by finish_rmw. When we finally 22895a6ac9eaSMiao Xie * get here, we've written a full stripe 22905a6ac9eaSMiao Xie */ 22914246a0b6SChristoph Hellwig static void raid_write_parity_end_io(struct bio *bio) 22925a6ac9eaSMiao Xie { 22935a6ac9eaSMiao Xie struct btrfs_raid_bio *rbio = bio->bi_private; 22944246a0b6SChristoph Hellwig int err = bio->bi_error; 22955a6ac9eaSMiao Xie 22964246a0b6SChristoph Hellwig if (bio->bi_error) 22975a6ac9eaSMiao Xie fail_bio_stripe(rbio, bio); 22985a6ac9eaSMiao Xie 22995a6ac9eaSMiao Xie bio_put(bio); 23005a6ac9eaSMiao Xie 23015a6ac9eaSMiao Xie if (!atomic_dec_and_test(&rbio->stripes_pending)) 23025a6ac9eaSMiao Xie return; 23035a6ac9eaSMiao Xie 23045a6ac9eaSMiao Xie err = 0; 23055a6ac9eaSMiao Xie 23065a6ac9eaSMiao Xie if (atomic_read(&rbio->error)) 23075a6ac9eaSMiao Xie err = -EIO; 23085a6ac9eaSMiao Xie 23094246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, err); 23105a6ac9eaSMiao Xie } 23115a6ac9eaSMiao Xie 23125a6ac9eaSMiao Xie static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 23135a6ac9eaSMiao Xie int need_check) 23145a6ac9eaSMiao Xie { 231576035976SMiao Xie struct btrfs_bio *bbio = rbio->bbio; 23162c8cdd6eSMiao Xie void *pointers[rbio->real_stripes]; 231776035976SMiao Xie DECLARE_BITMAP(pbitmap, rbio->stripe_npages); 23185a6ac9eaSMiao Xie int nr_data = rbio->nr_data; 23195a6ac9eaSMiao Xie int stripe; 23205a6ac9eaSMiao Xie int pagenr; 23215a6ac9eaSMiao Xie int p_stripe = -1; 23225a6ac9eaSMiao Xie int q_stripe = -1; 23235a6ac9eaSMiao Xie struct page *p_page = NULL; 23245a6ac9eaSMiao Xie struct page *q_page = NULL; 23255a6ac9eaSMiao Xie struct bio_list bio_list; 23265a6ac9eaSMiao Xie struct bio *bio; 232776035976SMiao Xie int is_replace = 0; 23285a6ac9eaSMiao Xie int ret; 23295a6ac9eaSMiao Xie 23305a6ac9eaSMiao Xie bio_list_init(&bio_list); 23315a6ac9eaSMiao Xie 23322c8cdd6eSMiao Xie if (rbio->real_stripes - rbio->nr_data == 1) { 23332c8cdd6eSMiao Xie p_stripe = rbio->real_stripes - 1; 23342c8cdd6eSMiao Xie } else if (rbio->real_stripes - rbio->nr_data == 2) { 23352c8cdd6eSMiao Xie p_stripe = rbio->real_stripes - 2; 23362c8cdd6eSMiao Xie q_stripe = rbio->real_stripes - 1; 23375a6ac9eaSMiao Xie } else { 23385a6ac9eaSMiao Xie BUG(); 23395a6ac9eaSMiao Xie } 23405a6ac9eaSMiao Xie 234176035976SMiao Xie if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { 234276035976SMiao Xie is_replace = 1; 234376035976SMiao Xie bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); 234476035976SMiao Xie } 234576035976SMiao Xie 
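	/*
	 * When we are also writing to a dev-replace target, pbitmap keeps
	 * the full set of sectors to copy there, while dbitmap is trimmed
	 * below to just the sectors whose parity turns out to be wrong.
	 */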
23465a6ac9eaSMiao Xie /* 23475a6ac9eaSMiao Xie * The higher layers (the scrubber) are unlikely to 23485a6ac9eaSMiao Xie * use this area of the disk again soon, so don't cache 23495a6ac9eaSMiao Xie * it. 23505a6ac9eaSMiao Xie */ 23515a6ac9eaSMiao Xie clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 23525a6ac9eaSMiao Xie 23535a6ac9eaSMiao Xie if (!need_check) 23545a6ac9eaSMiao Xie goto writeback; 23555a6ac9eaSMiao Xie 23565a6ac9eaSMiao Xie p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 23575a6ac9eaSMiao Xie if (!p_page) 23585a6ac9eaSMiao Xie goto cleanup; 23595a6ac9eaSMiao Xie SetPageUptodate(p_page); 23605a6ac9eaSMiao Xie 23615a6ac9eaSMiao Xie if (q_stripe != -1) { 23625a6ac9eaSMiao Xie q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 23635a6ac9eaSMiao Xie if (!q_page) { 23645a6ac9eaSMiao Xie __free_page(p_page); 23655a6ac9eaSMiao Xie goto cleanup; 23665a6ac9eaSMiao Xie } 23675a6ac9eaSMiao Xie SetPageUptodate(q_page); 23685a6ac9eaSMiao Xie } 23695a6ac9eaSMiao Xie 23705a6ac9eaSMiao Xie atomic_set(&rbio->error, 0); 23715a6ac9eaSMiao Xie 23725a6ac9eaSMiao Xie for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 23735a6ac9eaSMiao Xie struct page *p; 23745a6ac9eaSMiao Xie void *parity; 23755a6ac9eaSMiao Xie /* first collect one page from each data stripe */ 23765a6ac9eaSMiao Xie for (stripe = 0; stripe < nr_data; stripe++) { 23775a6ac9eaSMiao Xie p = page_in_rbio(rbio, stripe, pagenr, 0); 23785a6ac9eaSMiao Xie pointers[stripe] = kmap(p); 23795a6ac9eaSMiao Xie } 23805a6ac9eaSMiao Xie 23815a6ac9eaSMiao Xie /* then add the parity stripe */ 23825a6ac9eaSMiao Xie pointers[stripe++] = kmap(p_page); 23835a6ac9eaSMiao Xie 23845a6ac9eaSMiao Xie if (q_stripe != -1) { 23855a6ac9eaSMiao Xie 23865a6ac9eaSMiao Xie /* 23875a6ac9eaSMiao Xie * raid6, add the qstripe and call the 23885a6ac9eaSMiao Xie * library function to fill in our p/q 23895a6ac9eaSMiao Xie */ 23905a6ac9eaSMiao Xie pointers[stripe++] = kmap(q_page); 23915a6ac9eaSMiao Xie 23922c8cdd6eSMiao Xie raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 23935a6ac9eaSMiao Xie pointers); 23945a6ac9eaSMiao Xie } else { 23955a6ac9eaSMiao Xie /* raid5 */ 23965a6ac9eaSMiao Xie memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 23975a6ac9eaSMiao Xie run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); 23985a6ac9eaSMiao Xie } 23995a6ac9eaSMiao Xie 24005a6ac9eaSMiao Xie /* Check the scrubbing parity and repair it if needed */ 24015a6ac9eaSMiao Xie p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 24025a6ac9eaSMiao Xie parity = kmap(p); 24035a6ac9eaSMiao Xie if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE)) 24045a6ac9eaSMiao Xie memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE); 24055a6ac9eaSMiao Xie else 24065a6ac9eaSMiao Xie /* Parity is correct, no need to write it back */ 24075a6ac9eaSMiao Xie bitmap_clear(rbio->dbitmap, pagenr, 1); 24085a6ac9eaSMiao Xie kunmap(p); 24095a6ac9eaSMiao Xie 24102c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) 24115a6ac9eaSMiao Xie kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 24125a6ac9eaSMiao Xie } 24135a6ac9eaSMiao Xie 24145a6ac9eaSMiao Xie __free_page(p_page); 24155a6ac9eaSMiao Xie if (q_page) 24165a6ac9eaSMiao Xie __free_page(q_page); 24175a6ac9eaSMiao Xie
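/*
 * Editor's note: a self-contained user-space model of the check-and-repair
 * loop above for the RAID5 case: recompute the parity of one page from the
 * data stripes, compare it with what was read from disk, and rewrite it only
 * when it differs.  Returns true when the on-disk parity had to be repaired
 * (i.e. the page must stay in the writeback bitmap).  The RAID6 case has the
 * same shape but fills P and Q with the galois-field helpers
 * (raid6_call.gen_syndrome) instead of a plain XOR.  Illustrative only.
 */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define EX_PAGE_SIZE 4096

static bool ex_scrub_one_parity_page(const unsigned char *data[], int nr_data,
				     unsigned char *disk_parity)
{
	unsigned char computed[EX_PAGE_SIZE];
	size_t i;
	int stripe;

	/* XOR all the data stripes together, like the memcpy() + run_xor() above */
	memcpy(computed, data[0], EX_PAGE_SIZE);
	for (stripe = 1; stripe < nr_data; stripe++)
		for (i = 0; i < EX_PAGE_SIZE; i++)
			computed[i] ^= data[stripe][i];

	if (memcmp(disk_parity, computed, EX_PAGE_SIZE) == 0)
		return false;		/* parity is correct, skip the write */

	memcpy(disk_parity, computed, EX_PAGE_SIZE);
	return true;			/* caller must write this page back */
}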
24185a6ac9eaSMiao Xie writeback: 24195a6ac9eaSMiao Xie /* 24205a6ac9eaSMiao Xie * time to start writing. Make bios for everything from the 24215a6ac9eaSMiao Xie * higher layers (the bio_list in our rbio) and our p/q. Ignore 24225a6ac9eaSMiao Xie * everything else. 24235a6ac9eaSMiao Xie */ 24245a6ac9eaSMiao Xie for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 24255a6ac9eaSMiao Xie struct page *page; 24265a6ac9eaSMiao Xie 24275a6ac9eaSMiao Xie page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 24285a6ac9eaSMiao Xie ret = rbio_add_io_page(rbio, &bio_list, 24295a6ac9eaSMiao Xie page, rbio->scrubp, pagenr, rbio->stripe_len); 24305a6ac9eaSMiao Xie if (ret) 24315a6ac9eaSMiao Xie goto cleanup; 24325a6ac9eaSMiao Xie } 24335a6ac9eaSMiao Xie 243476035976SMiao Xie if (!is_replace) 243576035976SMiao Xie goto submit_write; 243676035976SMiao Xie 243776035976SMiao Xie for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { 243876035976SMiao Xie struct page *page; 243976035976SMiao Xie 244076035976SMiao Xie page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 244176035976SMiao Xie ret = rbio_add_io_page(rbio, &bio_list, page, 244276035976SMiao Xie bbio->tgtdev_map[rbio->scrubp], 244376035976SMiao Xie pagenr, rbio->stripe_len); 244476035976SMiao Xie if (ret) 244576035976SMiao Xie goto cleanup; 244676035976SMiao Xie } 244776035976SMiao Xie 244876035976SMiao Xie submit_write: 24495a6ac9eaSMiao Xie nr_data = bio_list_size(&bio_list); 24505a6ac9eaSMiao Xie if (!nr_data) { 24515a6ac9eaSMiao Xie /* Every parity page was correct, nothing to write back */ 24524246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, 0); 24535a6ac9eaSMiao Xie return; 24545a6ac9eaSMiao Xie } 24555a6ac9eaSMiao Xie 24565a6ac9eaSMiao Xie atomic_set(&rbio->stripes_pending, nr_data); 24575a6ac9eaSMiao Xie 24585a6ac9eaSMiao Xie while (1) { 24595a6ac9eaSMiao Xie bio = bio_list_pop(&bio_list); 24605a6ac9eaSMiao Xie if (!bio) 24615a6ac9eaSMiao Xie break; 24625a6ac9eaSMiao Xie 24635a6ac9eaSMiao Xie bio->bi_private = rbio; 24645a6ac9eaSMiao Xie bio->bi_end_io = raid_write_parity_end_io; 24655a6ac9eaSMiao Xie submit_bio(WRITE, bio); 24665a6ac9eaSMiao Xie } 24675a6ac9eaSMiao Xie return; 24685a6ac9eaSMiao Xie 24695a6ac9eaSMiao Xie cleanup: 24704246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, -EIO); 24715a6ac9eaSMiao Xie } 24725a6ac9eaSMiao Xie 24735a6ac9eaSMiao Xie static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) 24745a6ac9eaSMiao Xie { 24755a6ac9eaSMiao Xie if (stripe >= 0 && stripe < rbio->nr_data) 24765a6ac9eaSMiao Xie return 1; 24775a6ac9eaSMiao Xie return 0; 24785a6ac9eaSMiao Xie } 24795a6ac9eaSMiao Xie
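/*
 * Editor's note: the submission loop above (and the read loop further down)
 * both follow the same completion pattern: set stripes_pending to the number
 * of bios, give every bio the same end_io handler, and let whichever
 * completion drops the counter to zero finish the whole rbio.  A minimal
 * user-space sketch of that pattern with C11 atomics; the ex_* names are
 * illustrative, not kernel API.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct ex_request {
	atomic_int pending;	/* plays the role of rbio->stripes_pending */
	atomic_int errors;	/* plays the role of rbio->error */
	int max_errors;
};

/* called once per completed sub-IO, possibly from many threads */
static void ex_one_io_done(struct ex_request *req, bool failed)
{
	if (failed)
		atomic_fetch_add(&req->errors, 1);

	/* only the completion that takes the counter to zero finishes the request */
	if (atomic_fetch_sub(&req->pending, 1) != 1)
		return;

	if (atomic_load(&req->errors) > req->max_errors)
		;	/* complete with an error, like rbio_orig_end_io(rbio, -EIO) */
	else
		;	/* success path: validate / finish the stripe */
}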
24805a6ac9eaSMiao Xie /* 24815a6ac9eaSMiao Xie * While we're doing the parity check and repair, we could have errors 24825a6ac9eaSMiao Xie * in reading pages off the disk. This checks for errors and if we're 24835a6ac9eaSMiao Xie * not able to read the page it'll trigger parity reconstruction. The 24845a6ac9eaSMiao Xie * parity scrub will be finished after we've reconstructed the failed 24855a6ac9eaSMiao Xie * stripes. 24865a6ac9eaSMiao Xie */ 24875a6ac9eaSMiao Xie static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) 24885a6ac9eaSMiao Xie { 24895a6ac9eaSMiao Xie if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 24905a6ac9eaSMiao Xie goto cleanup; 24915a6ac9eaSMiao Xie 24925a6ac9eaSMiao Xie if (rbio->faila >= 0 || rbio->failb >= 0) { 24935a6ac9eaSMiao Xie int dfail = 0, failp = -1; 24945a6ac9eaSMiao Xie 24955a6ac9eaSMiao Xie if (is_data_stripe(rbio, rbio->faila)) 24965a6ac9eaSMiao Xie dfail++; 24975a6ac9eaSMiao Xie else if (is_parity_stripe(rbio->faila)) 24985a6ac9eaSMiao Xie failp = rbio->faila; 24995a6ac9eaSMiao Xie 25005a6ac9eaSMiao Xie if (is_data_stripe(rbio, rbio->failb)) 25015a6ac9eaSMiao Xie dfail++; 25025a6ac9eaSMiao Xie else if (is_parity_stripe(rbio->failb)) 25035a6ac9eaSMiao Xie failp = rbio->failb; 25045a6ac9eaSMiao Xie 25055a6ac9eaSMiao Xie /* 25065a6ac9eaSMiao Xie * We cannot use the parity that is being scrubbed to repair 25075a6ac9eaSMiao Xie * the data, so our repair capability is reduced by one. 25085a6ac9eaSMiao Xie * (In the RAID5 case we cannot repair anything.) 25095a6ac9eaSMiao Xie */ 25105a6ac9eaSMiao Xie if (dfail > rbio->bbio->max_errors - 1) 25115a6ac9eaSMiao Xie goto cleanup; 25125a6ac9eaSMiao Xie 25135a6ac9eaSMiao Xie /* 25145a6ac9eaSMiao Xie * If all the data is good, then only the parity can be wrong; 25155a6ac9eaSMiao Xie * just repair the parity. 25165a6ac9eaSMiao Xie */ 25175a6ac9eaSMiao Xie if (dfail == 0) { 25185a6ac9eaSMiao Xie finish_parity_scrub(rbio, 0); 25195a6ac9eaSMiao Xie return; 25205a6ac9eaSMiao Xie } 25215a6ac9eaSMiao Xie 25225a6ac9eaSMiao Xie /* 25235a6ac9eaSMiao Xie * Getting here means we have one corrupted data stripe and one 25245a6ac9eaSMiao Xie * corrupted parity on RAID6. If the corrupted parity is the one 25255a6ac9eaSMiao Xie * being scrubbed, we are lucky and can use the other parity to 25265a6ac9eaSMiao Xie * repair the data; otherwise we cannot repair the data stripe. 25275a6ac9eaSMiao Xie */ 25285a6ac9eaSMiao Xie if (failp != rbio->scrubp) 25295a6ac9eaSMiao Xie goto cleanup; 25305a6ac9eaSMiao Xie 25315a6ac9eaSMiao Xie __raid_recover_end_io(rbio); 25325a6ac9eaSMiao Xie } else { 25335a6ac9eaSMiao Xie finish_parity_scrub(rbio, 1); 25345a6ac9eaSMiao Xie } 25355a6ac9eaSMiao Xie return; 25365a6ac9eaSMiao Xie 25375a6ac9eaSMiao Xie cleanup: 25384246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, -EIO); 25395a6ac9eaSMiao Xie } 25405a6ac9eaSMiao Xie
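/*
 * Editor's note: the policy implemented by validate_rbio_for_parity_scrub()
 * above, restated as a pure decision function so the branches are easier to
 * follow.  faila/failb use the same convention as the rbio fields (-1 means
 * no failure); the "stripe >= nr_data means parity" test is a simplification
 * of is_parity_stripe(), and the enum and ex_* names are illustrative only.
 */
enum ex_scrub_action {
	EX_SCRUB_NORMAL,	/* no failures: check and repair the parity */
	EX_REPAIR_PARITY,	/* only parity failed: rewrite it, skip the check */
	EX_RECOVER_FIRST,	/* rebuild the failed data stripe, then finish the scrub */
	EX_GIVE_UP,		/* more damage than the RAID level can repair */
};

static enum ex_scrub_action ex_classify(int faila, int failb, int nr_data,
					int scrubp, int max_errors)
{
	int dfail = 0, failp = -1;

	if (faila < 0 && failb < 0)
		return EX_SCRUB_NORMAL;

	if (faila >= 0) {
		if (faila < nr_data)
			dfail++;
		else
			failp = faila;
	}
	if (failb >= 0) {
		if (failb < nr_data)
			dfail++;
		else
			failp = failb;
	}

	/* the parity being scrubbed cannot be trusted for reconstruction */
	if (dfail > max_errors - 1)
		return EX_GIVE_UP;
	if (dfail == 0)
		return EX_REPAIR_PARITY;
	/* mirror the code above: recover only when the failed parity is the scrubbed one */
	if (failp != scrubp)
		return EX_GIVE_UP;
	return EX_RECOVER_FIRST;
}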
25415a6ac9eaSMiao Xie /* 25425a6ac9eaSMiao Xie * end io for the read phase of the parity scrub. All the bios here are physical 25435a6ac9eaSMiao Xie * stripe bios we've read from the disk so we can recalculate the parity of the 25445a6ac9eaSMiao Xie * stripe. 25455a6ac9eaSMiao Xie * 25465a6ac9eaSMiao Xie * This will usually kick off finish_parity_scrub once all the bios are read in, but it 25475a6ac9eaSMiao Xie * may trigger parity reconstruction if we had any errors along the way 25485a6ac9eaSMiao Xie */ 25494246a0b6SChristoph Hellwig static void raid56_parity_scrub_end_io(struct bio *bio) 25505a6ac9eaSMiao Xie { 25515a6ac9eaSMiao Xie struct btrfs_raid_bio *rbio = bio->bi_private; 25525a6ac9eaSMiao Xie 25534246a0b6SChristoph Hellwig if (bio->bi_error) 25545a6ac9eaSMiao Xie fail_bio_stripe(rbio, bio); 25555a6ac9eaSMiao Xie else 25565a6ac9eaSMiao Xie set_bio_pages_uptodate(bio); 25575a6ac9eaSMiao Xie 25585a6ac9eaSMiao Xie bio_put(bio); 25595a6ac9eaSMiao Xie 25605a6ac9eaSMiao Xie if (!atomic_dec_and_test(&rbio->stripes_pending)) 25615a6ac9eaSMiao Xie return; 25625a6ac9eaSMiao Xie 25635a6ac9eaSMiao Xie /* 25645a6ac9eaSMiao Xie * this will normally call finish_parity_scrub to start our write 25655a6ac9eaSMiao Xie * but if there are any failed stripes we'll reconstruct 25665a6ac9eaSMiao Xie * from parity first 25675a6ac9eaSMiao Xie */ 25685a6ac9eaSMiao Xie validate_rbio_for_parity_scrub(rbio); 25695a6ac9eaSMiao Xie } 25705a6ac9eaSMiao Xie 25715a6ac9eaSMiao Xie static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) 25725a6ac9eaSMiao Xie { 25735a6ac9eaSMiao Xie int bios_to_read = 0; 25745a6ac9eaSMiao Xie struct bio_list bio_list; 25755a6ac9eaSMiao Xie int ret; 25765a6ac9eaSMiao Xie int pagenr; 25775a6ac9eaSMiao Xie int stripe; 25785a6ac9eaSMiao Xie struct bio *bio; 25795a6ac9eaSMiao Xie 25805a6ac9eaSMiao Xie ret = alloc_rbio_essential_pages(rbio); 25815a6ac9eaSMiao Xie if (ret) 25825a6ac9eaSMiao Xie goto cleanup; 25835a6ac9eaSMiao Xie 25845a6ac9eaSMiao Xie bio_list_init(&bio_list); 25855a6ac9eaSMiao Xie 25865a6ac9eaSMiao Xie atomic_set(&rbio->error, 0); 25875a6ac9eaSMiao Xie /* 25885a6ac9eaSMiao Xie * build a list of bios to read all the missing parts of this 25895a6ac9eaSMiao Xie * stripe 25905a6ac9eaSMiao Xie */ 25912c8cdd6eSMiao Xie for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 25925a6ac9eaSMiao Xie for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 25935a6ac9eaSMiao Xie struct page *page; 25945a6ac9eaSMiao Xie /* 25955a6ac9eaSMiao Xie * we want to find all the pages missing from 25965a6ac9eaSMiao Xie * the rbio and read them from the disk. If 25975a6ac9eaSMiao Xie * page_in_rbio finds a page in the bio list 25985a6ac9eaSMiao Xie * we don't need to read it off the stripe. 25995a6ac9eaSMiao Xie */ 26005a6ac9eaSMiao Xie page = page_in_rbio(rbio, stripe, pagenr, 1); 26015a6ac9eaSMiao Xie if (page) 26025a6ac9eaSMiao Xie continue; 26035a6ac9eaSMiao Xie 26045a6ac9eaSMiao Xie page = rbio_stripe_page(rbio, stripe, pagenr); 26055a6ac9eaSMiao Xie /* 26065a6ac9eaSMiao Xie * the bio cache may have handed us an uptodate 26075a6ac9eaSMiao Xie * page. If so, be happy and use it 26085a6ac9eaSMiao Xie */ 26095a6ac9eaSMiao Xie if (PageUptodate(page)) 26105a6ac9eaSMiao Xie continue; 26115a6ac9eaSMiao Xie 26125a6ac9eaSMiao Xie ret = rbio_add_io_page(rbio, &bio_list, page, 26135a6ac9eaSMiao Xie stripe, pagenr, rbio->stripe_len); 26145a6ac9eaSMiao Xie if (ret) 26155a6ac9eaSMiao Xie goto cleanup; 26165a6ac9eaSMiao Xie } 26175a6ac9eaSMiao Xie } 26185a6ac9eaSMiao Xie
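/*
 * Editor's note: sketch of the lookup order the gathering loop above relies
 * on.  A page supplied by the higher layer (bio_pages) always wins, then a
 * copy already sitting in stripe_pages, and only when neither is usable does
 * the loop queue a disk read.  struct ex_page and the arrays are illustrative
 * stand-ins for the rbio page tables; the real code indexes bio_pages only
 * over the data stripes, a detail elided here.
 */
#include <stdbool.h>
#include <stddef.h>

struct ex_page {
	bool uptodate;
	unsigned char *data;
};

/* returns the page to use, or NULL when the caller must read it from disk */
static struct ex_page *ex_find_page(struct ex_page *bio_pages[],
				    struct ex_page *stripe_pages[],
				    int stripe, int pagenr, int stripe_npages)
{
	int index = stripe * stripe_npages + pagenr;

	if (bio_pages[index])			/* like page_in_rbio() */
		return bio_pages[index];
	if (stripe_pages[index] && stripe_pages[index]->uptodate)
		return stripe_pages[index];	/* cached from an earlier read */
	return NULL;				/* must be read: rbio_add_io_page() */
}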
26195a6ac9eaSMiao Xie bios_to_read = bio_list_size(&bio_list); 26205a6ac9eaSMiao Xie if (!bios_to_read) { 26215a6ac9eaSMiao Xie /* 26225a6ac9eaSMiao Xie * this can happen if others have merged with 26235a6ac9eaSMiao Xie * us; it means there is nothing left to read. 26245a6ac9eaSMiao Xie * But if there are missing devices it may not be 26255a6ac9eaSMiao Xie * safe to do the full stripe write yet. 26265a6ac9eaSMiao Xie */ 26275a6ac9eaSMiao Xie goto finish; 26285a6ac9eaSMiao Xie } 26295a6ac9eaSMiao Xie 26305a6ac9eaSMiao Xie /* 26315a6ac9eaSMiao Xie * the bbio may be freed once we submit the last bio. Make sure 26325a6ac9eaSMiao Xie * not to touch it after that 26335a6ac9eaSMiao Xie */ 26345a6ac9eaSMiao Xie atomic_set(&rbio->stripes_pending, bios_to_read); 26355a6ac9eaSMiao Xie while (1) { 26365a6ac9eaSMiao Xie bio = bio_list_pop(&bio_list); 26375a6ac9eaSMiao Xie if (!bio) 26385a6ac9eaSMiao Xie break; 26395a6ac9eaSMiao Xie 26405a6ac9eaSMiao Xie bio->bi_private = rbio; 26415a6ac9eaSMiao Xie bio->bi_end_io = raid56_parity_scrub_end_io; 26425a6ac9eaSMiao Xie 26435a6ac9eaSMiao Xie btrfs_bio_wq_end_io(rbio->fs_info, bio, 26445a6ac9eaSMiao Xie BTRFS_WQ_ENDIO_RAID56); 26455a6ac9eaSMiao Xie 26465a6ac9eaSMiao Xie submit_bio(READ, bio); 26475a6ac9eaSMiao Xie } 26485a6ac9eaSMiao Xie /* the actual write will happen once the reads are done */ 26495a6ac9eaSMiao Xie return; 26505a6ac9eaSMiao Xie 26515a6ac9eaSMiao Xie cleanup: 26524246a0b6SChristoph Hellwig rbio_orig_end_io(rbio, -EIO); 26535a6ac9eaSMiao Xie return; 26545a6ac9eaSMiao Xie 26555a6ac9eaSMiao Xie finish: 26565a6ac9eaSMiao Xie validate_rbio_for_parity_scrub(rbio); 26575a6ac9eaSMiao Xie } 26585a6ac9eaSMiao Xie 26595a6ac9eaSMiao Xie static void scrub_parity_work(struct btrfs_work *work) 26605a6ac9eaSMiao Xie { 26615a6ac9eaSMiao Xie struct btrfs_raid_bio *rbio; 26625a6ac9eaSMiao Xie 26635a6ac9eaSMiao Xie rbio = container_of(work, struct btrfs_raid_bio, work); 26645a6ac9eaSMiao Xie raid56_parity_scrub_stripe(rbio); 26655a6ac9eaSMiao Xie } 26665a6ac9eaSMiao Xie 26675a6ac9eaSMiao Xie static void async_scrub_parity(struct btrfs_raid_bio *rbio) 26685a6ac9eaSMiao Xie { 26695a6ac9eaSMiao Xie btrfs_init_work(&rbio->work, btrfs_rmw_helper, 26705a6ac9eaSMiao Xie scrub_parity_work, NULL, NULL); 26715a6ac9eaSMiao Xie 26725a6ac9eaSMiao Xie btrfs_queue_work(rbio->fs_info->rmw_workers, 26735a6ac9eaSMiao Xie &rbio->work); 26745a6ac9eaSMiao Xie } 26755a6ac9eaSMiao Xie 26765a6ac9eaSMiao Xie void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 26775a6ac9eaSMiao Xie { 26785a6ac9eaSMiao Xie if (!lock_stripe_add(rbio)) 26795a6ac9eaSMiao Xie async_scrub_parity(rbio); 26805a6ac9eaSMiao Xie } 2681b4ee1782SOmar Sandoval
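/*
 * Editor's note: raid56_parity_submit_scrub_rbio() above (and
 * raid56_submit_missing_rbio() below) only schedule work when
 * lock_stripe_add() reports that the caller actually got the stripe;
 * otherwise the rbio is queued behind the current owner and handled when the
 * lock is handed over.  A simplified user-space model of that pattern with a
 * mutex and a pending list; the ex_* names and pthread locking are
 * illustrative stand-ins for the kernel's hash buckets, spinlocks and
 * workqueues.
 */
#include <pthread.h>
#include <stddef.h>

struct ex_work {
	struct ex_work *next;
	void (*fn)(struct ex_work *);
};

struct ex_stripe {
	pthread_mutex_t lock;
	int busy;
	struct ex_work *pending;	/* like the rbio hash/plug lists */
};

/* returns 0 when the caller got the stripe (mirroring lock_stripe_add()), 1 when queued */
static int ex_lock_stripe_add(struct ex_stripe *s, struct ex_work *w)
{
	int queued;

	pthread_mutex_lock(&s->lock);
	queued = s->busy;
	if (queued) {
		w->next = s->pending;
		s->pending = w;
	} else {
		s->busy = 1;
	}
	pthread_mutex_unlock(&s->lock);
	return queued;
}

/* owner hands the stripe to the next queued work item, or releases it */
static void ex_unlock_stripe(struct ex_stripe *s)
{
	struct ex_work *w;

	pthread_mutex_lock(&s->lock);
	w = s->pending;
	if (w)
		s->pending = w->next;
	else
		s->busy = 0;
	pthread_mutex_unlock(&s->lock);

	if (w)
		w->fn(w);	/* next unit runs with the stripe still held */
}

/* usage sketch: if (!ex_lock_stripe_add(s, w)) w->fn(w); ... ex_unlock_stripe(s); */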
2682b4ee1782SOmar Sandoval /* The following code is used for dev replace of a missing RAID 5/6 device. */ 2683b4ee1782SOmar Sandoval 2684b4ee1782SOmar Sandoval struct btrfs_raid_bio * 2685b4ee1782SOmar Sandoval raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio, 2686b4ee1782SOmar Sandoval struct btrfs_bio *bbio, u64 length) 2687b4ee1782SOmar Sandoval { 2688b4ee1782SOmar Sandoval struct btrfs_raid_bio *rbio; 2689b4ee1782SOmar Sandoval 2690b4ee1782SOmar Sandoval rbio = alloc_rbio(root, bbio, length); 2691b4ee1782SOmar Sandoval if (IS_ERR(rbio)) 2692b4ee1782SOmar Sandoval return NULL; 2693b4ee1782SOmar Sandoval 2694b4ee1782SOmar Sandoval rbio->operation = BTRFS_RBIO_REBUILD_MISSING; 2695b4ee1782SOmar Sandoval bio_list_add(&rbio->bio_list, bio); 2696b4ee1782SOmar Sandoval /* 2697b4ee1782SOmar Sandoval * This is a special bio which is used to hold the completion handler 2698b4ee1782SOmar Sandoval * and make this rbio similar to the other types 2699b4ee1782SOmar Sandoval */ 2700b4ee1782SOmar Sandoval ASSERT(!bio->bi_iter.bi_size); 2701b4ee1782SOmar Sandoval 2702b4ee1782SOmar Sandoval rbio->faila = find_logical_bio_stripe(rbio, bio); 2703b4ee1782SOmar Sandoval if (rbio->faila == -1) { 2704b4ee1782SOmar Sandoval BUG(); 2705b4ee1782SOmar Sandoval kfree(rbio); 2706b4ee1782SOmar Sandoval return NULL; 2707b4ee1782SOmar Sandoval } 2708b4ee1782SOmar Sandoval 2709b4ee1782SOmar Sandoval return rbio; 2710b4ee1782SOmar Sandoval } 2711b4ee1782SOmar Sandoval 2712b4ee1782SOmar Sandoval static void missing_raid56_work(struct btrfs_work *work) 2713b4ee1782SOmar Sandoval { 2714b4ee1782SOmar Sandoval struct btrfs_raid_bio *rbio; 2715b4ee1782SOmar Sandoval 2716b4ee1782SOmar Sandoval rbio = container_of(work, struct btrfs_raid_bio, work); 2717b4ee1782SOmar Sandoval __raid56_parity_recover(rbio); 2718b4ee1782SOmar Sandoval } 2719b4ee1782SOmar Sandoval 2720b4ee1782SOmar Sandoval static void async_missing_raid56(struct btrfs_raid_bio *rbio) 2721b4ee1782SOmar Sandoval { 2722b4ee1782SOmar Sandoval btrfs_init_work(&rbio->work, btrfs_rmw_helper, 2723b4ee1782SOmar Sandoval missing_raid56_work, NULL, NULL); 2724b4ee1782SOmar Sandoval 2725b4ee1782SOmar Sandoval btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 2726b4ee1782SOmar Sandoval } 2727b4ee1782SOmar Sandoval 2728b4ee1782SOmar Sandoval void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) 2729b4ee1782SOmar Sandoval { 2730b4ee1782SOmar Sandoval if (!lock_stripe_add(rbio)) 2731b4ee1782SOmar Sandoval async_missing_raid56(rbio); 2732b4ee1782SOmar Sandoval } 2733
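/*
 * Editor's note: the missing-device path above ends up in
 * __raid56_parity_recover(), which rebuilds the absent stripe from the
 * surviving ones.  For RAID5 that reconstruction is simply an XOR of every
 * other stripe in the row; a self-contained sketch of that step is below.
 * RAID6 single failures work the same way when P survives; double failures
 * need the galois-field math in lib/raid6.  The ex_* names and the fixed 4K
 * page size are illustrative only.
 */
#include <string.h>

#define EX_PAGE_SIZE 4096

/*
 * stripes[] holds nr_stripes page-sized buffers (the data stripes plus P),
 * one of which (index 'missing') is unreadable; rebuild it in place.
 */
static void ex_raid5_rebuild(unsigned char *stripes[], int nr_stripes, int missing)
{
	int i;
	size_t off;

	memset(stripes[missing], 0, EX_PAGE_SIZE);
	for (i = 0; i < nr_stripes; i++) {
		if (i == missing)
			continue;
		for (off = 0; off < EX_PAGE_SIZE; off++)
			stripes[missing][off] ^= stripes[i][off];
	}
}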