/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/vmalloc.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3


#define RBIO_CACHE_SIZE 1024

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
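	/*
	 * mapping of this full stripe to the physical devices: per-stripe
	 * physical offsets, device pointers and the error/pending counters
	 * used by the end_io handlers
	 */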
	struct btrfs_bio *bbio;

	/*
	 * logical block numbers for the start of each stripe
	 * The last one or two are p/q.  These are sorted,
	 * so raid_map[0] is the start of our full stripe
	 */
	u64 *raid_map;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged.  The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	int read_rebuild;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

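	/*
	 * number of active references; the rbio and its stripe pages are
	 * freed by __free_raid_bio once this drops to zero
	 */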
	atomic_t refs;

	/*
	 * these are two arrays of pointers.  We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;
	int table_size;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table_size = sizeof(*table) + sizeof(*h) * num_entries;
	table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!table) {
		table = vzalloc(table_size);
		if (!table)
			return -ENOMEM;
	}

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
		init_waitqueue_head(&cur->wait);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	if (x) {
		if (is_vmalloc_addr(x))
			vfree(x);
		else
			kfree(x);
	}
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_pages array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	char *s;
	char *d;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		s = kmap(rbio->bio_pages[i]);
		d = kmap(rbio->stripe_pages[i]);

		memcpy(d, s, PAGE_CACHE_SIZE);

		kunmap(rbio->bio_pages[i]);
		kunmap(rbio->stripe_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s)) {
			continue;
		}

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				atomic_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

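	/*
	 * hold the cache lock while draining the lru;
	 * __remove_rbio_from_cache expects it to be held
	 */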
	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	if (is_vmalloc_addr(info->stripe_hash_table))
		vfree(info->stripe_hash_table);
	else
		kfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		atomic_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)){
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				  struct btrfs_raid_bio,
				  stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
	return;
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while(src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * returns true if the bio list inside this rbio
 * covers an entire stripe (no rmw required).
 * Must be called with the bio list lock held, or
 * at a time when you know it is impossible to add
 * new bios into the list
 */
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;

	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	return ret;
}

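/*
 * lock-taking wrapper around __rbio_is_full for callers that don't
 * already hold the bio_list_lock
 */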
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	ret = __rbio_is_full(rbio);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbio's though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->raid_map[0] !=
	    cur->raid_map[0])
		return 0;

	/* reads can't merge with writes */
	if (last->read_rebuild !=
	    cur->read_rebuild) {
		return 0;
	}

	return 1;
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
		return NULL;

	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
		PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	DEFINE_WAIT(wait);
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;
	int walk = 0;

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		walk++;
		if (cur->raid_map[0] == rbio->raid_map[0]) {
			spin_lock(&cur->bio_list_lock);

			/* can we steal this cached rbio's pages? */
			if (bio_list_empty(&cur->bio_list) &&
			    list_empty(&cur->plug_list) &&
			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				list_del_init(&cur->hash_list);
				atomic_dec(&cur->refs);

				steal_rbio(cur, rbio);
				cache_drop = cur;
				spin_unlock(&cur->bio_list_lock);

				goto lockit;
			}

			/* can we merge into the lock owner? */
			if (rbio_can_merge(cur, rbio)) {
				merge_rbio(cur, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}

			/*
			 * we couldn't merge with the running
			 * rbio, see if we can merge with the
			 * pending ones.  We don't have to
			 * check for rmw_locked because there
			 * is no way they are inside finish_rmw
			 * right now
			 */
			list_for_each_entry(pending, &cur->plug_list,
					    plug_list) {
				if (rbio_can_merge(pending, rbio)) {
					merge_rbio(pending, rbio);
					spin_unlock(&cur->bio_list_lock);
					freeit = rbio;
					ret = 1;
					goto out;
				}
			}

			/* no merging, put us on the tail of the plug list,
			 * our rbio will be started with the currently
			 * running rbio unlocks
			 */
			list_add_tail(&rbio->plug_list, &cur->plug_list);
			spin_unlock(&cur->bio_list_lock);
			ret = 1;
			goto out;
		}
	}
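	/*
	 * nobody had this stripe locked (or we just stole the only cached
	 * holder above): take a ref and install this rbio as the lock
	 * owner for the bucket
	 */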
lockit:
	atomic_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		atomic_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			atomic_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->read_rebuild)
				async_read_rebuild(next);
			else {
				steal_rbio(rbio, next);
				async_rmw_stripe(next);
			}

			goto done_nolock;
		} else if (waitqueue_active(&h->wait)) {
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);
			wake_up(&h->wait);
			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

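/*
 * drop a reference on the rbio.  Once the count hits zero, the stripe
 * pages, raid_map, bbio and the rbio itself are freed
 */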
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	WARN_ON(atomic_read(&rbio->refs) < 0);
	if (!atomic_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}
	kfree(rbio->raid_map);
	kfree(rbio->bbio);
	kfree(rbio);
}

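/*
 * unlock the stripe (handing it off to any pending rbio) and then
 * drop our reference
 */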
static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	unlock_stripe(rbio);
	__free_raid_bio(rbio);
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *next;
	free_raid_bio(rbio);

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		if (uptodate)
			set_bit(BIO_UPTODATE, &cur->bi_flags);
		bio_endio(cur, err);
		cur = next;
	}
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	err = 0;

	/* OK, we have read all the stripes we need to. */
	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
	return;
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	unsigned long nr = stripe_len * nr_stripes;
	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
					 struct btrfs_bio *bbio, u64 *raid_map,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
	void *p;

	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
			GFP_NOFS);
	if (!rbio) {
		kfree(raid_map);
		kfree(bbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->raid_map = raid_map;
	rbio->fs_info = root->fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->faila = -1;
	rbio->failb = -1;
	atomic_set(&rbio->refs, 1);

	/*
	 * the stripe_pages and bio_pages array point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
	rbio->stripe_pages = p;
	rbio->bio_pages = p + sizeof(struct page *) * num_pages;

	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
		nr_data = bbio->num_stripes - 2;
	else
		nr_data = bbio->num_stripes - 1;

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
		ClearPageUptodate(page);
	}
	return 0;
}

/* allocate pages for just the p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	u64 last_end = 0;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		last_end = (u64)last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && stripe->dev->bdev &&
		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
			if (ret == PAGE_CACHE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_size = 0;
	bio->bi_bdev = stripe->dev->bdev;
	bio->bi_iter.bi_sector = disk_start >> 9;
	set_bit(BIO_UPTODATE, &bio->bi_flags);

	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
{
	int index;
	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
	index += page;
	return rbio->stripe_pages[index];
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;
	struct page *p;
	int i;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		start = (u64)bio->bi_iter.bi_sector << 9;
		stripe_offset = start - rbio->raid_map[0];
		page_index = stripe_offset >> PAGE_CACHE_SHIFT;

		for (i = 0; i < bio->bi_vcnt; i++) {
			p = bio->bi_io_vec[i].bv_page;
			rbio->bio_pages[page_index + i] = p;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}

/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
113353b381b3SDavid Woodhouse */ 113453b381b3SDavid Woodhouse static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 113553b381b3SDavid Woodhouse { 113653b381b3SDavid Woodhouse struct btrfs_bio *bbio = rbio->bbio; 113753b381b3SDavid Woodhouse void *pointers[bbio->num_stripes]; 113853b381b3SDavid Woodhouse int stripe_len = rbio->stripe_len; 113953b381b3SDavid Woodhouse int nr_data = rbio->nr_data; 114053b381b3SDavid Woodhouse int stripe; 114153b381b3SDavid Woodhouse int pagenr; 114253b381b3SDavid Woodhouse int p_stripe = -1; 114353b381b3SDavid Woodhouse int q_stripe = -1; 114453b381b3SDavid Woodhouse struct bio_list bio_list; 114553b381b3SDavid Woodhouse struct bio *bio; 114653b381b3SDavid Woodhouse int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; 114753b381b3SDavid Woodhouse int ret; 114853b381b3SDavid Woodhouse 114953b381b3SDavid Woodhouse bio_list_init(&bio_list); 115053b381b3SDavid Woodhouse 115153b381b3SDavid Woodhouse if (bbio->num_stripes - rbio->nr_data == 1) { 115253b381b3SDavid Woodhouse p_stripe = bbio->num_stripes - 1; 115353b381b3SDavid Woodhouse } else if (bbio->num_stripes - rbio->nr_data == 2) { 115453b381b3SDavid Woodhouse p_stripe = bbio->num_stripes - 2; 115553b381b3SDavid Woodhouse q_stripe = bbio->num_stripes - 1; 115653b381b3SDavid Woodhouse } else { 115753b381b3SDavid Woodhouse BUG(); 115853b381b3SDavid Woodhouse } 115953b381b3SDavid Woodhouse 116053b381b3SDavid Woodhouse /* at this point we either have a full stripe, 116153b381b3SDavid Woodhouse * or we've read the full stripe from the drive. 116253b381b3SDavid Woodhouse * recalculate the parity and write the new results. 116353b381b3SDavid Woodhouse * 116453b381b3SDavid Woodhouse * We're not allowed to add any new bios to the 116553b381b3SDavid Woodhouse * bio list here, anyone else that wants to 116653b381b3SDavid Woodhouse * change this stripe needs to do their own rmw. 116753b381b3SDavid Woodhouse */ 116853b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 116953b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 117053b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 117153b381b3SDavid Woodhouse 117253b381b3SDavid Woodhouse atomic_set(&rbio->bbio->error, 0); 117353b381b3SDavid Woodhouse 117453b381b3SDavid Woodhouse /* 117553b381b3SDavid Woodhouse * now that we've set rmw_locked, run through the 117653b381b3SDavid Woodhouse * bio list one last time and map the page pointers 11774ae10b3aSChris Mason * 11784ae10b3aSChris Mason * We don't cache full rbios because we're assuming 11794ae10b3aSChris Mason * the higher layers are unlikely to use this area of 11804ae10b3aSChris Mason * the disk again soon. If they do use it again, 11814ae10b3aSChris Mason * hopefully they will send another full bio. 
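 * So below, a partial rbio gets cache_rbio_pages() and a full one just has RBIO_CACHE_READY_BIT cleared.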
118253b381b3SDavid Woodhouse */ 118353b381b3SDavid Woodhouse index_rbio_pages(rbio); 11844ae10b3aSChris Mason if (!rbio_is_full(rbio)) 11854ae10b3aSChris Mason cache_rbio_pages(rbio); 11864ae10b3aSChris Mason else 11874ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 118853b381b3SDavid Woodhouse 118953b381b3SDavid Woodhouse for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 119053b381b3SDavid Woodhouse struct page *p; 119153b381b3SDavid Woodhouse /* first collect one page from each data stripe */ 119253b381b3SDavid Woodhouse for (stripe = 0; stripe < nr_data; stripe++) { 119353b381b3SDavid Woodhouse p = page_in_rbio(rbio, stripe, pagenr, 0); 119453b381b3SDavid Woodhouse pointers[stripe] = kmap(p); 119553b381b3SDavid Woodhouse } 119653b381b3SDavid Woodhouse 119753b381b3SDavid Woodhouse /* then add the parity stripe */ 119853b381b3SDavid Woodhouse p = rbio_pstripe_page(rbio, pagenr); 119953b381b3SDavid Woodhouse SetPageUptodate(p); 120053b381b3SDavid Woodhouse pointers[stripe++] = kmap(p); 120153b381b3SDavid Woodhouse 120253b381b3SDavid Woodhouse if (q_stripe != -1) { 120353b381b3SDavid Woodhouse 120453b381b3SDavid Woodhouse /* 120553b381b3SDavid Woodhouse * raid6, add the qstripe and call the 120653b381b3SDavid Woodhouse * library function to fill in our p/q 120753b381b3SDavid Woodhouse */ 120853b381b3SDavid Woodhouse p = rbio_qstripe_page(rbio, pagenr); 120953b381b3SDavid Woodhouse SetPageUptodate(p); 121053b381b3SDavid Woodhouse pointers[stripe++] = kmap(p); 121153b381b3SDavid Woodhouse 121253b381b3SDavid Woodhouse raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, 121353b381b3SDavid Woodhouse pointers); 121453b381b3SDavid Woodhouse } else { 121553b381b3SDavid Woodhouse /* raid5 */ 121653b381b3SDavid Woodhouse memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 121753b381b3SDavid Woodhouse run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); 121853b381b3SDavid Woodhouse } 121953b381b3SDavid Woodhouse 122053b381b3SDavid Woodhouse 122153b381b3SDavid Woodhouse for (stripe = 0; stripe < bbio->num_stripes; stripe++) 122253b381b3SDavid Woodhouse kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 122353b381b3SDavid Woodhouse } 122453b381b3SDavid Woodhouse 122553b381b3SDavid Woodhouse /* 122653b381b3SDavid Woodhouse * time to start writing. Make bios for everything from the 122753b381b3SDavid Woodhouse * higher layers (the bio_list in our rbio) and our p/q. Ignore 122853b381b3SDavid Woodhouse * everything else. 
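 *
 * Data stripes only contribute pages that actually came from the bio_list (page_in_rbio() may return NULL, which we skip); the parity stripes always use our private pages from rbio_stripe_page().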
122953b381b3SDavid Woodhouse */ 123053b381b3SDavid Woodhouse for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 123153b381b3SDavid Woodhouse for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 123253b381b3SDavid Woodhouse struct page *page; 123353b381b3SDavid Woodhouse if (stripe < rbio->nr_data) { 123453b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 1); 123553b381b3SDavid Woodhouse if (!page) 123653b381b3SDavid Woodhouse continue; 123753b381b3SDavid Woodhouse } else { 123853b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 123953b381b3SDavid Woodhouse } 124053b381b3SDavid Woodhouse 124153b381b3SDavid Woodhouse ret = rbio_add_io_page(rbio, &bio_list, 124253b381b3SDavid Woodhouse page, stripe, pagenr, rbio->stripe_len); 124353b381b3SDavid Woodhouse if (ret) 124453b381b3SDavid Woodhouse goto cleanup; 124553b381b3SDavid Woodhouse } 124653b381b3SDavid Woodhouse } 124753b381b3SDavid Woodhouse 124853b381b3SDavid Woodhouse atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); 124953b381b3SDavid Woodhouse BUG_ON(atomic_read(&bbio->stripes_pending) == 0); 125053b381b3SDavid Woodhouse 125153b381b3SDavid Woodhouse while (1) { 125253b381b3SDavid Woodhouse bio = bio_list_pop(&bio_list); 125353b381b3SDavid Woodhouse if (!bio) 125453b381b3SDavid Woodhouse break; 125553b381b3SDavid Woodhouse 125653b381b3SDavid Woodhouse bio->bi_private = rbio; 125753b381b3SDavid Woodhouse bio->bi_end_io = raid_write_end_io; 125853b381b3SDavid Woodhouse BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 125953b381b3SDavid Woodhouse submit_bio(WRITE, bio); 126053b381b3SDavid Woodhouse } 126153b381b3SDavid Woodhouse return; 126253b381b3SDavid Woodhouse 126353b381b3SDavid Woodhouse cleanup: 126453b381b3SDavid Woodhouse rbio_orig_end_io(rbio, -EIO, 0); 126553b381b3SDavid Woodhouse } 126653b381b3SDavid Woodhouse 126753b381b3SDavid Woodhouse /* 126853b381b3SDavid Woodhouse * helper to find the stripe number for a given bio. Used to figure out which 126953b381b3SDavid Woodhouse * stripe has failed. This expects the bio to correspond to a physical disk, 127053b381b3SDavid Woodhouse * so it looks up based on physical sector numbers. 127153b381b3SDavid Woodhouse */ 127253b381b3SDavid Woodhouse static int find_bio_stripe(struct btrfs_raid_bio *rbio, 127353b381b3SDavid Woodhouse struct bio *bio) 127453b381b3SDavid Woodhouse { 1275*4f024f37SKent Overstreet u64 physical = bio->bi_iter.bi_sector; 127653b381b3SDavid Woodhouse u64 stripe_start; 127753b381b3SDavid Woodhouse int i; 127853b381b3SDavid Woodhouse struct btrfs_bio_stripe *stripe; 127953b381b3SDavid Woodhouse 128053b381b3SDavid Woodhouse physical <<= 9; 128153b381b3SDavid Woodhouse 128253b381b3SDavid Woodhouse for (i = 0; i < rbio->bbio->num_stripes; i++) { 128353b381b3SDavid Woodhouse stripe = &rbio->bbio->stripes[i]; 128453b381b3SDavid Woodhouse stripe_start = stripe->physical; 128553b381b3SDavid Woodhouse if (physical >= stripe_start && 128653b381b3SDavid Woodhouse physical < stripe_start + rbio->stripe_len) { 128753b381b3SDavid Woodhouse return i; 128853b381b3SDavid Woodhouse } 128953b381b3SDavid Woodhouse } 129053b381b3SDavid Woodhouse return -1; 129153b381b3SDavid Woodhouse } 129253b381b3SDavid Woodhouse 129353b381b3SDavid Woodhouse /* 129453b381b3SDavid Woodhouse * helper to find the stripe number for a given 129553b381b3SDavid Woodhouse * bio (before mapping). Used to figure out which stripe has 129653b381b3SDavid Woodhouse * failed. This looks up based on logical block numbers. 
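 *
 * raid_map[i] holds the logical start of data stripe i, so we return the index whose [raid_map[i], raid_map[i] + stripe_len) range contains the bio's starting logical byte.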
129753b381b3SDavid Woodhouse */ 129853b381b3SDavid Woodhouse static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 129953b381b3SDavid Woodhouse struct bio *bio) 130053b381b3SDavid Woodhouse { 1301*4f024f37SKent Overstreet u64 logical = bio->bi_iter.bi_sector; 130253b381b3SDavid Woodhouse u64 stripe_start; 130353b381b3SDavid Woodhouse int i; 130453b381b3SDavid Woodhouse 130553b381b3SDavid Woodhouse logical <<= 9; 130653b381b3SDavid Woodhouse 130753b381b3SDavid Woodhouse for (i = 0; i < rbio->nr_data; i++) { 130853b381b3SDavid Woodhouse stripe_start = rbio->raid_map[i]; 130953b381b3SDavid Woodhouse if (logical >= stripe_start && 131053b381b3SDavid Woodhouse logical < stripe_start + rbio->stripe_len) { 131153b381b3SDavid Woodhouse return i; 131253b381b3SDavid Woodhouse } 131353b381b3SDavid Woodhouse } 131453b381b3SDavid Woodhouse return -1; 131553b381b3SDavid Woodhouse } 131653b381b3SDavid Woodhouse 131753b381b3SDavid Woodhouse /* 131853b381b3SDavid Woodhouse * returns -EIO if we had too many failures 131953b381b3SDavid Woodhouse */ 132053b381b3SDavid Woodhouse static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 132153b381b3SDavid Woodhouse { 132253b381b3SDavid Woodhouse unsigned long flags; 132353b381b3SDavid Woodhouse int ret = 0; 132453b381b3SDavid Woodhouse 132553b381b3SDavid Woodhouse spin_lock_irqsave(&rbio->bio_list_lock, flags); 132653b381b3SDavid Woodhouse 132753b381b3SDavid Woodhouse /* we already know this stripe is bad, move on */ 132853b381b3SDavid Woodhouse if (rbio->faila == failed || rbio->failb == failed) 132953b381b3SDavid Woodhouse goto out; 133053b381b3SDavid Woodhouse 133153b381b3SDavid Woodhouse if (rbio->faila == -1) { 133253b381b3SDavid Woodhouse /* first failure on this rbio */ 133353b381b3SDavid Woodhouse rbio->faila = failed; 133453b381b3SDavid Woodhouse atomic_inc(&rbio->bbio->error); 133553b381b3SDavid Woodhouse } else if (rbio->failb == -1) { 133653b381b3SDavid Woodhouse /* second failure on this rbio */ 133753b381b3SDavid Woodhouse rbio->failb = failed; 133853b381b3SDavid Woodhouse atomic_inc(&rbio->bbio->error); 133953b381b3SDavid Woodhouse } else { 134053b381b3SDavid Woodhouse ret = -EIO; 134153b381b3SDavid Woodhouse } 134253b381b3SDavid Woodhouse out: 134353b381b3SDavid Woodhouse spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 134453b381b3SDavid Woodhouse 134553b381b3SDavid Woodhouse return ret; 134653b381b3SDavid Woodhouse } 134753b381b3SDavid Woodhouse 134853b381b3SDavid Woodhouse /* 134953b381b3SDavid Woodhouse * helper to fail a stripe based on a physical disk 135053b381b3SDavid Woodhouse * bio. 135153b381b3SDavid Woodhouse */ 135253b381b3SDavid Woodhouse static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 135353b381b3SDavid Woodhouse struct bio *bio) 135453b381b3SDavid Woodhouse { 135553b381b3SDavid Woodhouse int failed = find_bio_stripe(rbio, bio); 135653b381b3SDavid Woodhouse 135753b381b3SDavid Woodhouse if (failed < 0) 135853b381b3SDavid Woodhouse return -EIO; 135953b381b3SDavid Woodhouse 136053b381b3SDavid Woodhouse return fail_rbio_index(rbio, failed); 136153b381b3SDavid Woodhouse } 136253b381b3SDavid Woodhouse 136353b381b3SDavid Woodhouse /* 136453b381b3SDavid Woodhouse * this sets each page in the bio uptodate. 
It should only be used on private 136553b381b3SDavid Woodhouse * rbio pages, nothing that comes in from the higher layers 136653b381b3SDavid Woodhouse */ 136753b381b3SDavid Woodhouse static void set_bio_pages_uptodate(struct bio *bio) 136853b381b3SDavid Woodhouse { 136953b381b3SDavid Woodhouse int i; 137053b381b3SDavid Woodhouse struct page *p; 137153b381b3SDavid Woodhouse 137253b381b3SDavid Woodhouse for (i = 0; i < bio->bi_vcnt; i++) { 137353b381b3SDavid Woodhouse p = bio->bi_io_vec[i].bv_page; 137453b381b3SDavid Woodhouse SetPageUptodate(p); 137553b381b3SDavid Woodhouse } 137653b381b3SDavid Woodhouse } 137753b381b3SDavid Woodhouse 137853b381b3SDavid Woodhouse /* 137953b381b3SDavid Woodhouse * end io for the read phase of the rmw cycle. All the bios here are physical 138053b381b3SDavid Woodhouse * stripe bios we've read from the disk so we can recalculate the parity of the 138153b381b3SDavid Woodhouse * stripe. 138253b381b3SDavid Woodhouse * 138353b381b3SDavid Woodhouse * This will usually kick off finish_rmw once all the bios are read in, but it 138453b381b3SDavid Woodhouse * may trigger parity reconstruction if we had any errors along the way 138553b381b3SDavid Woodhouse */ 138653b381b3SDavid Woodhouse static void raid_rmw_end_io(struct bio *bio, int err) 138753b381b3SDavid Woodhouse { 138853b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 138953b381b3SDavid Woodhouse 139053b381b3SDavid Woodhouse if (err) 139153b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 139253b381b3SDavid Woodhouse else 139353b381b3SDavid Woodhouse set_bio_pages_uptodate(bio); 139453b381b3SDavid Woodhouse 139553b381b3SDavid Woodhouse bio_put(bio); 139653b381b3SDavid Woodhouse 139753b381b3SDavid Woodhouse if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 139853b381b3SDavid Woodhouse return; 139953b381b3SDavid Woodhouse 140053b381b3SDavid Woodhouse err = 0; 140153b381b3SDavid Woodhouse if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 140253b381b3SDavid Woodhouse goto cleanup; 140353b381b3SDavid Woodhouse 140453b381b3SDavid Woodhouse /* 140553b381b3SDavid Woodhouse * this will normally call finish_rmw to start our write 140653b381b3SDavid Woodhouse * but if there are any failed stripes we'll reconstruct 140753b381b3SDavid Woodhouse * from parity first 140853b381b3SDavid Woodhouse */ 140953b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 141053b381b3SDavid Woodhouse return; 141153b381b3SDavid Woodhouse 141253b381b3SDavid Woodhouse cleanup: 141353b381b3SDavid Woodhouse 141453b381b3SDavid Woodhouse rbio_orig_end_io(rbio, -EIO, 0); 141553b381b3SDavid Woodhouse } 141653b381b3SDavid Woodhouse 141753b381b3SDavid Woodhouse static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 141853b381b3SDavid Woodhouse { 141953b381b3SDavid Woodhouse rbio->work.flags = 0; 142053b381b3SDavid Woodhouse rbio->work.func = rmw_work; 142153b381b3SDavid Woodhouse 142253b381b3SDavid Woodhouse btrfs_queue_worker(&rbio->fs_info->rmw_workers, 142353b381b3SDavid Woodhouse &rbio->work); 142453b381b3SDavid Woodhouse } 142553b381b3SDavid Woodhouse 142653b381b3SDavid Woodhouse static void async_read_rebuild(struct btrfs_raid_bio *rbio) 142753b381b3SDavid Woodhouse { 142853b381b3SDavid Woodhouse rbio->work.flags = 0; 142953b381b3SDavid Woodhouse rbio->work.func = read_rebuild_work; 143053b381b3SDavid Woodhouse 143153b381b3SDavid Woodhouse btrfs_queue_worker(&rbio->fs_info->rmw_workers, 143253b381b3SDavid Woodhouse &rbio->work); 143353b381b3SDavid Woodhouse } 143453b381b3SDavid Woodhouse 143553b381b3SDavid 
Woodhouse /* 143653b381b3SDavid Woodhouse * the stripe must be locked by the caller. It will 143753b381b3SDavid Woodhouse * unlock after all the writes are done 143853b381b3SDavid Woodhouse */ 143953b381b3SDavid Woodhouse static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 144053b381b3SDavid Woodhouse { 144153b381b3SDavid Woodhouse int bios_to_read = 0; 144253b381b3SDavid Woodhouse struct btrfs_bio *bbio = rbio->bbio; 144353b381b3SDavid Woodhouse struct bio_list bio_list; 144453b381b3SDavid Woodhouse int ret; 144553b381b3SDavid Woodhouse int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 144653b381b3SDavid Woodhouse int pagenr; 144753b381b3SDavid Woodhouse int stripe; 144853b381b3SDavid Woodhouse struct bio *bio; 144953b381b3SDavid Woodhouse 145053b381b3SDavid Woodhouse bio_list_init(&bio_list); 145153b381b3SDavid Woodhouse 145253b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 145353b381b3SDavid Woodhouse if (ret) 145453b381b3SDavid Woodhouse goto cleanup; 145553b381b3SDavid Woodhouse 145653b381b3SDavid Woodhouse index_rbio_pages(rbio); 145753b381b3SDavid Woodhouse 145853b381b3SDavid Woodhouse atomic_set(&rbio->bbio->error, 0); 145953b381b3SDavid Woodhouse /* 146053b381b3SDavid Woodhouse * build a list of bios to read all the missing parts of this 146153b381b3SDavid Woodhouse * stripe 146253b381b3SDavid Woodhouse */ 146353b381b3SDavid Woodhouse for (stripe = 0; stripe < rbio->nr_data; stripe++) { 146453b381b3SDavid Woodhouse for (pagenr = 0; pagenr < nr_pages; pagenr++) { 146553b381b3SDavid Woodhouse struct page *page; 146653b381b3SDavid Woodhouse /* 146753b381b3SDavid Woodhouse * we want to find all the pages missing from 146853b381b3SDavid Woodhouse * the rbio and read them from the disk. If 146953b381b3SDavid Woodhouse * page_in_rbio finds a page in the bio list 147053b381b3SDavid Woodhouse * we don't need to read it off the stripe. 147153b381b3SDavid Woodhouse */ 147253b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 1); 147353b381b3SDavid Woodhouse if (page) 147453b381b3SDavid Woodhouse continue; 147553b381b3SDavid Woodhouse 147653b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 14774ae10b3aSChris Mason /* 14784ae10b3aSChris Mason * the bio cache may have handed us an uptodate 14794ae10b3aSChris Mason * page. If so, be happy and use it 14804ae10b3aSChris Mason */ 14814ae10b3aSChris Mason if (PageUptodate(page)) 14824ae10b3aSChris Mason continue; 14834ae10b3aSChris Mason 148453b381b3SDavid Woodhouse ret = rbio_add_io_page(rbio, &bio_list, page, 148553b381b3SDavid Woodhouse stripe, pagenr, rbio->stripe_len); 148653b381b3SDavid Woodhouse if (ret) 148753b381b3SDavid Woodhouse goto cleanup; 148853b381b3SDavid Woodhouse } 148953b381b3SDavid Woodhouse } 149053b381b3SDavid Woodhouse 149153b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 149253b381b3SDavid Woodhouse if (!bios_to_read) { 149353b381b3SDavid Woodhouse /* 149453b381b3SDavid Woodhouse * this can happen if others have merged with 149553b381b3SDavid Woodhouse * us, it means there is nothing left to read. 149653b381b3SDavid Woodhouse * But if there are missing devices it may not be 149753b381b3SDavid Woodhouse * safe to do the full stripe write yet. 149853b381b3SDavid Woodhouse */ 149953b381b3SDavid Woodhouse goto finish; 150053b381b3SDavid Woodhouse } 150153b381b3SDavid Woodhouse 150253b381b3SDavid Woodhouse /* 150353b381b3SDavid Woodhouse * the bbio may be freed once we submit the last bio. 
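 * (that is why stripes_pending is set to the full count before the first submit_bio() below: the final read completion is what kicks off the next phase and can free things.)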
Make sure 150453b381b3SDavid Woodhouse * not to touch it after that 150553b381b3SDavid Woodhouse */ 150653b381b3SDavid Woodhouse atomic_set(&bbio->stripes_pending, bios_to_read); 150753b381b3SDavid Woodhouse while (1) { 150853b381b3SDavid Woodhouse bio = bio_list_pop(&bio_list); 150953b381b3SDavid Woodhouse if (!bio) 151053b381b3SDavid Woodhouse break; 151153b381b3SDavid Woodhouse 151253b381b3SDavid Woodhouse bio->bi_private = rbio; 151353b381b3SDavid Woodhouse bio->bi_end_io = raid_rmw_end_io; 151453b381b3SDavid Woodhouse 151553b381b3SDavid Woodhouse btrfs_bio_wq_end_io(rbio->fs_info, bio, 151653b381b3SDavid Woodhouse BTRFS_WQ_ENDIO_RAID56); 151753b381b3SDavid Woodhouse 151853b381b3SDavid Woodhouse BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 151953b381b3SDavid Woodhouse submit_bio(READ, bio); 152053b381b3SDavid Woodhouse } 152153b381b3SDavid Woodhouse /* the actual write will happen once the reads are done */ 152253b381b3SDavid Woodhouse return 0; 152353b381b3SDavid Woodhouse 152453b381b3SDavid Woodhouse cleanup: 152553b381b3SDavid Woodhouse rbio_orig_end_io(rbio, -EIO, 0); 152653b381b3SDavid Woodhouse return -EIO; 152753b381b3SDavid Woodhouse 152853b381b3SDavid Woodhouse finish: 152953b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 153053b381b3SDavid Woodhouse return 0; 153153b381b3SDavid Woodhouse } 153253b381b3SDavid Woodhouse 153353b381b3SDavid Woodhouse /* 153453b381b3SDavid Woodhouse * if the upper layers pass in a full stripe, we thank them by only allocating 153553b381b3SDavid Woodhouse * enough pages to hold the parity, and sending it all down quickly. 153653b381b3SDavid Woodhouse */ 153753b381b3SDavid Woodhouse static int full_stripe_write(struct btrfs_raid_bio *rbio) 153853b381b3SDavid Woodhouse { 153953b381b3SDavid Woodhouse int ret; 154053b381b3SDavid Woodhouse 154153b381b3SDavid Woodhouse ret = alloc_rbio_parity_pages(rbio); 15423cd846d1SMiao Xie if (ret) { 15433cd846d1SMiao Xie __free_raid_bio(rbio); 154453b381b3SDavid Woodhouse return ret; 15453cd846d1SMiao Xie } 154653b381b3SDavid Woodhouse 154753b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 154853b381b3SDavid Woodhouse if (ret == 0) 154953b381b3SDavid Woodhouse finish_rmw(rbio); 155053b381b3SDavid Woodhouse return 0; 155153b381b3SDavid Woodhouse } 155253b381b3SDavid Woodhouse 155353b381b3SDavid Woodhouse /* 155453b381b3SDavid Woodhouse * partial stripe writes get handed over to async helpers. 155553b381b3SDavid Woodhouse * We're really hoping to merge a few more writes into this 155653b381b3SDavid Woodhouse * rbio before calculating new parity 155753b381b3SDavid Woodhouse */ 155853b381b3SDavid Woodhouse static int partial_stripe_write(struct btrfs_raid_bio *rbio) 155953b381b3SDavid Woodhouse { 156053b381b3SDavid Woodhouse int ret; 156153b381b3SDavid Woodhouse 156253b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 156353b381b3SDavid Woodhouse if (ret == 0) 156453b381b3SDavid Woodhouse async_rmw_stripe(rbio); 156553b381b3SDavid Woodhouse return 0; 156653b381b3SDavid Woodhouse } 156753b381b3SDavid Woodhouse 156853b381b3SDavid Woodhouse /* 156953b381b3SDavid Woodhouse * sometimes while we were reading from the drive to 157053b381b3SDavid Woodhouse * recalculate parity, enough new bios come in to create 157153b381b3SDavid Woodhouse * a full stripe.
So we do a check here to see if we can 157253b381b3SDavid Woodhouse * go directly to finish_rmw 157353b381b3SDavid Woodhouse */ 157453b381b3SDavid Woodhouse static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 157553b381b3SDavid Woodhouse { 157653b381b3SDavid Woodhouse /* head off into rmw land if we don't have a full stripe */ 157753b381b3SDavid Woodhouse if (!rbio_is_full(rbio)) 157853b381b3SDavid Woodhouse return partial_stripe_write(rbio); 157953b381b3SDavid Woodhouse return full_stripe_write(rbio); 158053b381b3SDavid Woodhouse } 158153b381b3SDavid Woodhouse 158253b381b3SDavid Woodhouse /* 15836ac0f488SChris Mason * We use plugging call backs to collect full stripes. 15846ac0f488SChris Mason * Any time we get a partial stripe write while plugged 15856ac0f488SChris Mason * we collect it into a list. When the unplug comes down, 15866ac0f488SChris Mason * we sort the list by logical block number and merge 15876ac0f488SChris Mason * everything we can into the same rbios 15886ac0f488SChris Mason */ 15896ac0f488SChris Mason struct btrfs_plug_cb { 15906ac0f488SChris Mason struct blk_plug_cb cb; 15916ac0f488SChris Mason struct btrfs_fs_info *info; 15926ac0f488SChris Mason struct list_head rbio_list; 15936ac0f488SChris Mason struct btrfs_work work; 15946ac0f488SChris Mason }; 15956ac0f488SChris Mason 15966ac0f488SChris Mason /* 15976ac0f488SChris Mason * rbios on the plug list are sorted for easier merging. 15986ac0f488SChris Mason */ 15996ac0f488SChris Mason static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) 16006ac0f488SChris Mason { 16016ac0f488SChris Mason struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 16026ac0f488SChris Mason plug_list); 16036ac0f488SChris Mason struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 16046ac0f488SChris Mason plug_list); 1605*4f024f37SKent Overstreet u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1606*4f024f37SKent Overstreet u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 16076ac0f488SChris Mason 16086ac0f488SChris Mason if (a_sector < b_sector) 16096ac0f488SChris Mason return -1; 16106ac0f488SChris Mason if (a_sector > b_sector) 16116ac0f488SChris Mason return 1; 16126ac0f488SChris Mason return 0; 16136ac0f488SChris Mason } 16146ac0f488SChris Mason 16156ac0f488SChris Mason static void run_plug(struct btrfs_plug_cb *plug) 16166ac0f488SChris Mason { 16176ac0f488SChris Mason struct btrfs_raid_bio *cur; 16186ac0f488SChris Mason struct btrfs_raid_bio *last = NULL; 16196ac0f488SChris Mason 16206ac0f488SChris Mason /* 16216ac0f488SChris Mason * sort our plug list then try to merge 16226ac0f488SChris Mason * everything we can in hopes of creating full 16236ac0f488SChris Mason * stripes. 
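 * list_sort() with plug_cmp() puts the rbios in order of their first logical sector, so writes to the same stripe land next to each other and rbio_can_merge()/merge_rbio() below can combine them.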
16246ac0f488SChris Mason */ 16256ac0f488SChris Mason list_sort(NULL, &plug->rbio_list, plug_cmp); 16266ac0f488SChris Mason while (!list_empty(&plug->rbio_list)) { 16276ac0f488SChris Mason cur = list_entry(plug->rbio_list.next, 16286ac0f488SChris Mason struct btrfs_raid_bio, plug_list); 16296ac0f488SChris Mason list_del_init(&cur->plug_list); 16306ac0f488SChris Mason 16316ac0f488SChris Mason if (rbio_is_full(cur)) { 16326ac0f488SChris Mason /* we have a full stripe, send it down */ 16336ac0f488SChris Mason full_stripe_write(cur); 16346ac0f488SChris Mason continue; 16356ac0f488SChris Mason } 16366ac0f488SChris Mason if (last) { 16376ac0f488SChris Mason if (rbio_can_merge(last, cur)) { 16386ac0f488SChris Mason merge_rbio(last, cur); 16396ac0f488SChris Mason __free_raid_bio(cur); 16406ac0f488SChris Mason continue; 16416ac0f488SChris Mason 16426ac0f488SChris Mason } 16436ac0f488SChris Mason __raid56_parity_write(last); 16446ac0f488SChris Mason } 16456ac0f488SChris Mason last = cur; 16466ac0f488SChris Mason } 16476ac0f488SChris Mason if (last) { 16486ac0f488SChris Mason __raid56_parity_write(last); 16496ac0f488SChris Mason } 16506ac0f488SChris Mason kfree(plug); 16516ac0f488SChris Mason } 16526ac0f488SChris Mason 16536ac0f488SChris Mason /* 16546ac0f488SChris Mason * if the unplug comes from schedule, we have to push the 16556ac0f488SChris Mason * work off to a helper thread 16566ac0f488SChris Mason */ 16576ac0f488SChris Mason static void unplug_work(struct btrfs_work *work) 16586ac0f488SChris Mason { 16596ac0f488SChris Mason struct btrfs_plug_cb *plug; 16606ac0f488SChris Mason plug = container_of(work, struct btrfs_plug_cb, work); 16616ac0f488SChris Mason run_plug(plug); 16626ac0f488SChris Mason } 16636ac0f488SChris Mason 16646ac0f488SChris Mason static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 16656ac0f488SChris Mason { 16666ac0f488SChris Mason struct btrfs_plug_cb *plug; 16676ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 16686ac0f488SChris Mason 16696ac0f488SChris Mason if (from_schedule) { 16706ac0f488SChris Mason plug->work.flags = 0; 16716ac0f488SChris Mason plug->work.func = unplug_work; 16726ac0f488SChris Mason btrfs_queue_worker(&plug->info->rmw_workers, 16736ac0f488SChris Mason &plug->work); 16746ac0f488SChris Mason return; 16756ac0f488SChris Mason } 16766ac0f488SChris Mason run_plug(plug); 16776ac0f488SChris Mason } 16786ac0f488SChris Mason 16796ac0f488SChris Mason /* 168053b381b3SDavid Woodhouse * our main entry point for writes from the rest of the FS. 
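 *
 * Full rbios are sent straight to full_stripe_write(); partial ones are parked on the current block plug (if any) so run_plug() can merge them later, and otherwise go through __raid56_parity_write().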
168153b381b3SDavid Woodhouse */ 168253b381b3SDavid Woodhouse int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 168353b381b3SDavid Woodhouse struct btrfs_bio *bbio, u64 *raid_map, 168453b381b3SDavid Woodhouse u64 stripe_len) 168553b381b3SDavid Woodhouse { 168653b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 16876ac0f488SChris Mason struct btrfs_plug_cb *plug = NULL; 16886ac0f488SChris Mason struct blk_plug_cb *cb; 168953b381b3SDavid Woodhouse 169053b381b3SDavid Woodhouse rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 16913dc0e818SDan Carpenter if (IS_ERR(rbio)) 169253b381b3SDavid Woodhouse return PTR_ERR(rbio); 169353b381b3SDavid Woodhouse bio_list_add(&rbio->bio_list, bio); 1694*4f024f37SKent Overstreet rbio->bio_list_bytes = bio->bi_iter.bi_size; 16956ac0f488SChris Mason 16966ac0f488SChris Mason /* 16976ac0f488SChris Mason * don't plug on full rbios, just get them out the door 16986ac0f488SChris Mason * as quickly as we can 16996ac0f488SChris Mason */ 17006ac0f488SChris Mason if (rbio_is_full(rbio)) 17016ac0f488SChris Mason return full_stripe_write(rbio); 17026ac0f488SChris Mason 17036ac0f488SChris Mason cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 17046ac0f488SChris Mason sizeof(*plug)); 17056ac0f488SChris Mason if (cb) { 17066ac0f488SChris Mason plug = container_of(cb, struct btrfs_plug_cb, cb); 17076ac0f488SChris Mason if (!plug->info) { 17086ac0f488SChris Mason plug->info = root->fs_info; 17096ac0f488SChris Mason INIT_LIST_HEAD(&plug->rbio_list); 17106ac0f488SChris Mason } 17116ac0f488SChris Mason list_add_tail(&rbio->plug_list, &plug->rbio_list); 17126ac0f488SChris Mason } else { 171353b381b3SDavid Woodhouse return __raid56_parity_write(rbio); 171453b381b3SDavid Woodhouse } 17156ac0f488SChris Mason return 0; 17166ac0f488SChris Mason } 171753b381b3SDavid Woodhouse 171853b381b3SDavid Woodhouse /* 171953b381b3SDavid Woodhouse * all parity reconstruction happens here. We've read in everything 172053b381b3SDavid Woodhouse * we can find from the drives and this does the heavy lifting of 172153b381b3SDavid Woodhouse * sorting the good from the bad. 
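 *
 * One bad data stripe is rebuilt by XORing P with the surviving data (the pstripe path below); for raid6, a bad data stripe plus a bad Q also takes that path, data+P goes through raid6_datap_recov(), two bad data stripes go through raid6_2data_recov(), and losing P alone or P together with Q is handed back as -EIO.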
172253b381b3SDavid Woodhouse */ 172353b381b3SDavid Woodhouse static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 172453b381b3SDavid Woodhouse { 172553b381b3SDavid Woodhouse int pagenr, stripe; 172653b381b3SDavid Woodhouse void **pointers; 172753b381b3SDavid Woodhouse int faila = -1, failb = -1; 172853b381b3SDavid Woodhouse int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 172953b381b3SDavid Woodhouse struct page *page; 173053b381b3SDavid Woodhouse int err; 173153b381b3SDavid Woodhouse int i; 173253b381b3SDavid Woodhouse 173353b381b3SDavid Woodhouse pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), 173453b381b3SDavid Woodhouse GFP_NOFS); 173553b381b3SDavid Woodhouse if (!pointers) { 173653b381b3SDavid Woodhouse err = -ENOMEM; 173753b381b3SDavid Woodhouse goto cleanup_io; 173853b381b3SDavid Woodhouse } 173953b381b3SDavid Woodhouse 174053b381b3SDavid Woodhouse faila = rbio->faila; 174153b381b3SDavid Woodhouse failb = rbio->failb; 174253b381b3SDavid Woodhouse 174353b381b3SDavid Woodhouse if (rbio->read_rebuild) { 174453b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 174553b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 174653b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 174753b381b3SDavid Woodhouse } 174853b381b3SDavid Woodhouse 174953b381b3SDavid Woodhouse index_rbio_pages(rbio); 175053b381b3SDavid Woodhouse 175153b381b3SDavid Woodhouse for (pagenr = 0; pagenr < nr_pages; pagenr++) { 175253b381b3SDavid Woodhouse /* setup our array of pointers with pages 175353b381b3SDavid Woodhouse * from each stripe 175453b381b3SDavid Woodhouse */ 175553b381b3SDavid Woodhouse for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 175653b381b3SDavid Woodhouse /* 175753b381b3SDavid Woodhouse * if we're rebuilding a read, we have to use 175853b381b3SDavid Woodhouse * pages from the bio list 175953b381b3SDavid Woodhouse */ 176053b381b3SDavid Woodhouse if (rbio->read_rebuild && 176153b381b3SDavid Woodhouse (stripe == faila || stripe == failb)) { 176253b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 0); 176353b381b3SDavid Woodhouse } else { 176453b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 176553b381b3SDavid Woodhouse } 176653b381b3SDavid Woodhouse pointers[stripe] = kmap(page); 176753b381b3SDavid Woodhouse } 176853b381b3SDavid Woodhouse 176953b381b3SDavid Woodhouse /* all raid6 handling here */ 177053b381b3SDavid Woodhouse if (rbio->raid_map[rbio->bbio->num_stripes - 1] == 177153b381b3SDavid Woodhouse RAID6_Q_STRIPE) { 177253b381b3SDavid Woodhouse 177353b381b3SDavid Woodhouse /* 177453b381b3SDavid Woodhouse * single failure, rebuild from parity raid5 177553b381b3SDavid Woodhouse * style 177653b381b3SDavid Woodhouse */ 177753b381b3SDavid Woodhouse if (failb < 0) { 177853b381b3SDavid Woodhouse if (faila == rbio->nr_data) { 177953b381b3SDavid Woodhouse /* 178053b381b3SDavid Woodhouse * Just the P stripe has failed, without 178153b381b3SDavid Woodhouse * a bad data or Q stripe. 178253b381b3SDavid Woodhouse * TODO, we should redo the xor here. 
178353b381b3SDavid Woodhouse */ 178453b381b3SDavid Woodhouse err = -EIO; 178553b381b3SDavid Woodhouse goto cleanup; 178653b381b3SDavid Woodhouse } 178753b381b3SDavid Woodhouse /* 178853b381b3SDavid Woodhouse * a single failure in raid6 is rebuilt 178953b381b3SDavid Woodhouse * in the pstripe code below 179053b381b3SDavid Woodhouse */ 179153b381b3SDavid Woodhouse goto pstripe; 179253b381b3SDavid Woodhouse } 179353b381b3SDavid Woodhouse 179453b381b3SDavid Woodhouse /* make sure our ps and qs are in order */ 179553b381b3SDavid Woodhouse if (faila > failb) { 179653b381b3SDavid Woodhouse int tmp = failb; 179753b381b3SDavid Woodhouse failb = faila; 179853b381b3SDavid Woodhouse faila = tmp; 179953b381b3SDavid Woodhouse } 180053b381b3SDavid Woodhouse 180153b381b3SDavid Woodhouse /* if the q stripe is failed, do a pstripe reconstruction 180253b381b3SDavid Woodhouse * from the xors. 180353b381b3SDavid Woodhouse * If both the q stripe and the P stripe are failed, we're 180453b381b3SDavid Woodhouse * here due to a crc mismatch and we can't give them the 180553b381b3SDavid Woodhouse * data they want 180653b381b3SDavid Woodhouse */ 180753b381b3SDavid Woodhouse if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { 180853b381b3SDavid Woodhouse if (rbio->raid_map[faila] == RAID5_P_STRIPE) { 180953b381b3SDavid Woodhouse err = -EIO; 181053b381b3SDavid Woodhouse goto cleanup; 181153b381b3SDavid Woodhouse } 181253b381b3SDavid Woodhouse /* 181353b381b3SDavid Woodhouse * otherwise we have one bad data stripe and 181453b381b3SDavid Woodhouse * a good P stripe. raid5! 181553b381b3SDavid Woodhouse */ 181653b381b3SDavid Woodhouse goto pstripe; 181753b381b3SDavid Woodhouse } 181853b381b3SDavid Woodhouse 181953b381b3SDavid Woodhouse if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 182053b381b3SDavid Woodhouse raid6_datap_recov(rbio->bbio->num_stripes, 182153b381b3SDavid Woodhouse PAGE_SIZE, faila, pointers); 182253b381b3SDavid Woodhouse } else { 182353b381b3SDavid Woodhouse raid6_2data_recov(rbio->bbio->num_stripes, 182453b381b3SDavid Woodhouse PAGE_SIZE, faila, failb, 182553b381b3SDavid Woodhouse pointers); 182653b381b3SDavid Woodhouse } 182753b381b3SDavid Woodhouse } else { 182853b381b3SDavid Woodhouse void *p; 182953b381b3SDavid Woodhouse 183053b381b3SDavid Woodhouse /* rebuild from P stripe here (raid5 or raid6) */ 183153b381b3SDavid Woodhouse BUG_ON(failb != -1); 183253b381b3SDavid Woodhouse pstripe: 183353b381b3SDavid Woodhouse /* Copy parity block into failed block to start with */ 183453b381b3SDavid Woodhouse memcpy(pointers[faila], 183553b381b3SDavid Woodhouse pointers[rbio->nr_data], 183653b381b3SDavid Woodhouse PAGE_CACHE_SIZE); 183753b381b3SDavid Woodhouse 183853b381b3SDavid Woodhouse /* rearrange the pointer array */ 183953b381b3SDavid Woodhouse p = pointers[faila]; 184053b381b3SDavid Woodhouse for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 184153b381b3SDavid Woodhouse pointers[stripe] = pointers[stripe + 1]; 184253b381b3SDavid Woodhouse pointers[rbio->nr_data - 1] = p; 184353b381b3SDavid Woodhouse 184453b381b3SDavid Woodhouse /* xor in the rest */ 184553b381b3SDavid Woodhouse run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); 184653b381b3SDavid Woodhouse } 184753b381b3SDavid Woodhouse /* if we're doing this rebuild as part of an rmw, go through 184853b381b3SDavid Woodhouse * and set all of our private rbio pages in the 184953b381b3SDavid Woodhouse * failed stripes as uptodate. This way finish_rmw will 185053b381b3SDavid Woodhouse * know they can be trusted. 
If this was a read reconstruction, 185153b381b3SDavid Woodhouse * other endio functions will fiddle the uptodate bits 185253b381b3SDavid Woodhouse */ 185353b381b3SDavid Woodhouse if (!rbio->read_rebuild) { 185453b381b3SDavid Woodhouse for (i = 0; i < nr_pages; i++) { 185553b381b3SDavid Woodhouse if (faila != -1) { 185653b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, faila, i); 185753b381b3SDavid Woodhouse SetPageUptodate(page); 185853b381b3SDavid Woodhouse } 185953b381b3SDavid Woodhouse if (failb != -1) { 186053b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, failb, i); 186153b381b3SDavid Woodhouse SetPageUptodate(page); 186253b381b3SDavid Woodhouse } 186353b381b3SDavid Woodhouse } 186453b381b3SDavid Woodhouse } 186553b381b3SDavid Woodhouse for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 186653b381b3SDavid Woodhouse /* 186753b381b3SDavid Woodhouse * if we're rebuilding a read, we have to use 186853b381b3SDavid Woodhouse * pages from the bio list 186953b381b3SDavid Woodhouse */ 187053b381b3SDavid Woodhouse if (rbio->read_rebuild && 187153b381b3SDavid Woodhouse (stripe == faila || stripe == failb)) { 187253b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 0); 187353b381b3SDavid Woodhouse } else { 187453b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 187553b381b3SDavid Woodhouse } 187653b381b3SDavid Woodhouse kunmap(page); 187753b381b3SDavid Woodhouse } 187853b381b3SDavid Woodhouse } 187953b381b3SDavid Woodhouse 188053b381b3SDavid Woodhouse err = 0; 188153b381b3SDavid Woodhouse cleanup: 188253b381b3SDavid Woodhouse kfree(pointers); 188353b381b3SDavid Woodhouse 188453b381b3SDavid Woodhouse cleanup_io: 188553b381b3SDavid Woodhouse 188653b381b3SDavid Woodhouse if (rbio->read_rebuild) { 18874ae10b3aSChris Mason if (err == 0) 18884ae10b3aSChris Mason cache_rbio_pages(rbio); 18894ae10b3aSChris Mason else 18904ae10b3aSChris Mason clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 18914ae10b3aSChris Mason 189253b381b3SDavid Woodhouse rbio_orig_end_io(rbio, err, err == 0); 189353b381b3SDavid Woodhouse } else if (err == 0) { 189453b381b3SDavid Woodhouse rbio->faila = -1; 189553b381b3SDavid Woodhouse rbio->failb = -1; 189653b381b3SDavid Woodhouse finish_rmw(rbio); 189753b381b3SDavid Woodhouse } else { 189853b381b3SDavid Woodhouse rbio_orig_end_io(rbio, err, 0); 189953b381b3SDavid Woodhouse } 190053b381b3SDavid Woodhouse } 190153b381b3SDavid Woodhouse 190253b381b3SDavid Woodhouse /* 190353b381b3SDavid Woodhouse * This is called only for stripes we've read from disk to 190453b381b3SDavid Woodhouse * reconstruct the parity. 
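 * Failed reads are recorded with fail_bio_stripe(); once stripes_pending reaches zero we either give up with -EIO (too many errors) or run __raid_recover_end_io().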
190553b381b3SDavid Woodhouse */ 190653b381b3SDavid Woodhouse static void raid_recover_end_io(struct bio *bio, int err) 190753b381b3SDavid Woodhouse { 190853b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 190953b381b3SDavid Woodhouse 191053b381b3SDavid Woodhouse /* 191153b381b3SDavid Woodhouse * we only read stripe pages off the disk, set them 191253b381b3SDavid Woodhouse * up to date if there were no errors 191353b381b3SDavid Woodhouse */ 191453b381b3SDavid Woodhouse if (err) 191553b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 191653b381b3SDavid Woodhouse else 191753b381b3SDavid Woodhouse set_bio_pages_uptodate(bio); 191853b381b3SDavid Woodhouse bio_put(bio); 191953b381b3SDavid Woodhouse 192053b381b3SDavid Woodhouse if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 192153b381b3SDavid Woodhouse return; 192253b381b3SDavid Woodhouse 192353b381b3SDavid Woodhouse if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 192453b381b3SDavid Woodhouse rbio_orig_end_io(rbio, -EIO, 0); 192553b381b3SDavid Woodhouse else 192653b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 192753b381b3SDavid Woodhouse } 192853b381b3SDavid Woodhouse 192953b381b3SDavid Woodhouse /* 193053b381b3SDavid Woodhouse * reads everything we need off the disk to reconstruct 193153b381b3SDavid Woodhouse * the parity. endio handlers trigger final reconstruction 193253b381b3SDavid Woodhouse * when the IO is done. 193353b381b3SDavid Woodhouse * 193453b381b3SDavid Woodhouse * This is used both for reads from the higher layers and for 193553b381b3SDavid Woodhouse * parity construction required to finish a rmw cycle. 193653b381b3SDavid Woodhouse */ 193753b381b3SDavid Woodhouse static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 193853b381b3SDavid Woodhouse { 193953b381b3SDavid Woodhouse int bios_to_read = 0; 194053b381b3SDavid Woodhouse struct btrfs_bio *bbio = rbio->bbio; 194153b381b3SDavid Woodhouse struct bio_list bio_list; 194253b381b3SDavid Woodhouse int ret; 194353b381b3SDavid Woodhouse int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 194453b381b3SDavid Woodhouse int pagenr; 194553b381b3SDavid Woodhouse int stripe; 194653b381b3SDavid Woodhouse struct bio *bio; 194753b381b3SDavid Woodhouse 194853b381b3SDavid Woodhouse bio_list_init(&bio_list); 194953b381b3SDavid Woodhouse 195053b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 195153b381b3SDavid Woodhouse if (ret) 195253b381b3SDavid Woodhouse goto cleanup; 195353b381b3SDavid Woodhouse 195453b381b3SDavid Woodhouse atomic_set(&rbio->bbio->error, 0); 195553b381b3SDavid Woodhouse 195653b381b3SDavid Woodhouse /* 19574ae10b3aSChris Mason * read everything that hasn't failed. Thanks to the 19584ae10b3aSChris Mason * stripe cache, it is possible that some or all of these 19594ae10b3aSChris Mason * pages are going to be uptodate. 
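 * Pages that are already PageUptodate are skipped below, so a warm stripe cache can mean there is nothing left to read at all (handled by the !bios_to_read case).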
196053b381b3SDavid Woodhouse */ 196153b381b3SDavid Woodhouse for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 196253b381b3SDavid Woodhouse if (rbio->faila == stripe || 196353b381b3SDavid Woodhouse rbio->failb == stripe) 196453b381b3SDavid Woodhouse continue; 196553b381b3SDavid Woodhouse 196653b381b3SDavid Woodhouse for (pagenr = 0; pagenr < nr_pages; pagenr++) { 196753b381b3SDavid Woodhouse struct page *p; 196853b381b3SDavid Woodhouse 196953b381b3SDavid Woodhouse /* 197053b381b3SDavid Woodhouse * the rmw code may have already read this 197153b381b3SDavid Woodhouse * page in 197253b381b3SDavid Woodhouse */ 197353b381b3SDavid Woodhouse p = rbio_stripe_page(rbio, stripe, pagenr); 197453b381b3SDavid Woodhouse if (PageUptodate(p)) 197553b381b3SDavid Woodhouse continue; 197653b381b3SDavid Woodhouse 197753b381b3SDavid Woodhouse ret = rbio_add_io_page(rbio, &bio_list, 197853b381b3SDavid Woodhouse rbio_stripe_page(rbio, stripe, pagenr), 197953b381b3SDavid Woodhouse stripe, pagenr, rbio->stripe_len); 198053b381b3SDavid Woodhouse if (ret < 0) 198153b381b3SDavid Woodhouse goto cleanup; 198253b381b3SDavid Woodhouse } 198353b381b3SDavid Woodhouse } 198453b381b3SDavid Woodhouse 198553b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 198653b381b3SDavid Woodhouse if (!bios_to_read) { 198753b381b3SDavid Woodhouse /* 198853b381b3SDavid Woodhouse * we might have no bios to read just because the pages 198953b381b3SDavid Woodhouse * were up to date, or we might have no bios to read because 199053b381b3SDavid Woodhouse * the devices were gone. 199153b381b3SDavid Woodhouse */ 199253b381b3SDavid Woodhouse if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { 199353b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 199453b381b3SDavid Woodhouse goto out; 199553b381b3SDavid Woodhouse } else { 199653b381b3SDavid Woodhouse goto cleanup; 199753b381b3SDavid Woodhouse } 199853b381b3SDavid Woodhouse } 199953b381b3SDavid Woodhouse 200053b381b3SDavid Woodhouse /* 200153b381b3SDavid Woodhouse * the bbio may be freed once we submit the last bio. Make sure 200253b381b3SDavid Woodhouse * not to touch it after that 200353b381b3SDavid Woodhouse */ 200453b381b3SDavid Woodhouse atomic_set(&bbio->stripes_pending, bios_to_read); 200553b381b3SDavid Woodhouse while (1) { 200653b381b3SDavid Woodhouse bio = bio_list_pop(&bio_list); 200753b381b3SDavid Woodhouse if (!bio) 200853b381b3SDavid Woodhouse break; 200953b381b3SDavid Woodhouse 201053b381b3SDavid Woodhouse bio->bi_private = rbio; 201153b381b3SDavid Woodhouse bio->bi_end_io = raid_recover_end_io; 201253b381b3SDavid Woodhouse 201353b381b3SDavid Woodhouse btrfs_bio_wq_end_io(rbio->fs_info, bio, 201453b381b3SDavid Woodhouse BTRFS_WQ_ENDIO_RAID56); 201553b381b3SDavid Woodhouse 201653b381b3SDavid Woodhouse BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 201753b381b3SDavid Woodhouse submit_bio(READ, bio); 201853b381b3SDavid Woodhouse } 201953b381b3SDavid Woodhouse out: 202053b381b3SDavid Woodhouse return 0; 202153b381b3SDavid Woodhouse 202253b381b3SDavid Woodhouse cleanup: 202353b381b3SDavid Woodhouse if (rbio->read_rebuild) 202453b381b3SDavid Woodhouse rbio_orig_end_io(rbio, -EIO, 0); 202553b381b3SDavid Woodhouse return -EIO; 202653b381b3SDavid Woodhouse } 202753b381b3SDavid Woodhouse 202853b381b3SDavid Woodhouse /* 202953b381b3SDavid Woodhouse * the main entry point for reads from the higher layers. 
This 203053b381b3SDavid Woodhouse * is really only called when the normal read path had a failure, 203153b381b3SDavid Woodhouse * so we assume the bio they send down corresponds to a failed part 203253b381b3SDavid Woodhouse * of the drive. 203353b381b3SDavid Woodhouse */ 203453b381b3SDavid Woodhouse int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 203553b381b3SDavid Woodhouse struct btrfs_bio *bbio, u64 *raid_map, 203653b381b3SDavid Woodhouse u64 stripe_len, int mirror_num) 203753b381b3SDavid Woodhouse { 203853b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 203953b381b3SDavid Woodhouse int ret; 204053b381b3SDavid Woodhouse 204153b381b3SDavid Woodhouse rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 20423dc0e818SDan Carpenter if (IS_ERR(rbio)) 204353b381b3SDavid Woodhouse return PTR_ERR(rbio); 204453b381b3SDavid Woodhouse 204553b381b3SDavid Woodhouse rbio->read_rebuild = 1; 204653b381b3SDavid Woodhouse bio_list_add(&rbio->bio_list, bio); 2047*4f024f37SKent Overstreet rbio->bio_list_bytes = bio->bi_iter.bi_size; 204853b381b3SDavid Woodhouse 204953b381b3SDavid Woodhouse rbio->faila = find_logical_bio_stripe(rbio, bio); 205053b381b3SDavid Woodhouse if (rbio->faila == -1) { 205153b381b3SDavid Woodhouse BUG(); 20523dc0e818SDan Carpenter kfree(raid_map); 20533dc0e818SDan Carpenter kfree(bbio); 205453b381b3SDavid Woodhouse kfree(rbio); 205553b381b3SDavid Woodhouse return -EIO; 205653b381b3SDavid Woodhouse } 205753b381b3SDavid Woodhouse 205853b381b3SDavid Woodhouse /* 205953b381b3SDavid Woodhouse * reconstruct from the q stripe if they are 206053b381b3SDavid Woodhouse * asking for mirror 3 206153b381b3SDavid Woodhouse */ 206253b381b3SDavid Woodhouse if (mirror_num == 3) 206353b381b3SDavid Woodhouse rbio->failb = bbio->num_stripes - 2; 206453b381b3SDavid Woodhouse 206553b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 206653b381b3SDavid Woodhouse 206753b381b3SDavid Woodhouse /* 206853b381b3SDavid Woodhouse * __raid56_parity_recover will end the bio with 206953b381b3SDavid Woodhouse * any errors it hits. 
We don't want to return 207053b381b3SDavid Woodhouse * its error value up the stack because our caller 207153b381b3SDavid Woodhouse * will end up calling bio_endio with any nonzero 207253b381b3SDavid Woodhouse * return 207353b381b3SDavid Woodhouse */ 207453b381b3SDavid Woodhouse if (ret == 0) 207553b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 207653b381b3SDavid Woodhouse /* 207753b381b3SDavid Woodhouse * our rbio has been added to the list of 207853b381b3SDavid Woodhouse * rbios that will be handled after the 207953b381b3SDavid Woodhouse * currently lock owner is done 208053b381b3SDavid Woodhouse */ 208153b381b3SDavid Woodhouse return 0; 208253b381b3SDavid Woodhouse 208353b381b3SDavid Woodhouse } 208453b381b3SDavid Woodhouse 208553b381b3SDavid Woodhouse static void rmw_work(struct btrfs_work *work) 208653b381b3SDavid Woodhouse { 208753b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 208853b381b3SDavid Woodhouse 208953b381b3SDavid Woodhouse rbio = container_of(work, struct btrfs_raid_bio, work); 209053b381b3SDavid Woodhouse raid56_rmw_stripe(rbio); 209153b381b3SDavid Woodhouse } 209253b381b3SDavid Woodhouse 209353b381b3SDavid Woodhouse static void read_rebuild_work(struct btrfs_work *work) 209453b381b3SDavid Woodhouse { 209553b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 209653b381b3SDavid Woodhouse 209753b381b3SDavid Woodhouse rbio = container_of(work, struct btrfs_raid_bio, work); 209853b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 209953b381b3SDavid Woodhouse } 2100
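
To make the pstripe recovery path above concrete, here is a minimal, self-contained userspace sketch of the same raid5 arithmetic; it is not part of the btrfs code, and NR_DATA, BLOCK_SIZE and the demo main() are illustrative assumptions only. Parity is the byte-wise XOR of all data blocks at the same offset, and one missing data block is recovered by XORing the parity with the surviving blocks.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_DATA		3	/* illustrative: three data blocks plus one parity block */
#define BLOCK_SIZE	4096	/* illustrative: one 4K page per block */

/* P = D0 ^ D1 ^ ... ^ D(n-1), the per-page raid5 parity finish_rmw() computes */
static void gen_parity(unsigned char *data[], unsigned char *parity)
{
	int i, b;

	memcpy(parity, data[0], BLOCK_SIZE);
	for (i = 1; i < NR_DATA; i++)
		for (b = 0; b < BLOCK_SIZE; b++)
			parity[b] ^= data[i][b];
}

/* rebuild one lost data block from parity plus the surviving data blocks */
static void recover_block(unsigned char *data[], const unsigned char *parity, int lost)
{
	int i, b;

	memcpy(data[lost], parity, BLOCK_SIZE);
	for (i = 0; i < NR_DATA; i++) {
		if (i == lost)
			continue;
		for (b = 0; b < BLOCK_SIZE; b++)
			data[lost][b] ^= data[i][b];
	}
}

int main(void)
{
	unsigned char *data[NR_DATA];
	unsigned char parity[BLOCK_SIZE], saved[BLOCK_SIZE];
	int i;

	for (i = 0; i < NR_DATA; i++) {
		data[i] = malloc(BLOCK_SIZE);
		if (!data[i])
			return 1;
		memset(data[i], 'a' + i, BLOCK_SIZE);
	}
	gen_parity(data, parity);

	/* simulate losing data block 1, then rebuild it from parity */
	memcpy(saved, data[1], BLOCK_SIZE);
	memset(data[1], 0, BLOCK_SIZE);
	recover_block(data, parity, 1);

	printf("recovered block %s the original\n",
	       memcmp(saved, data[1], BLOCK_SIZE) == 0 ? "matches" : "does not match");

	for (i = 0; i < NR_DATA; i++)
		free(data[i]);
	return 0;
}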