/*
 * Copyright (C) 2012 Fusion-io All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1
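
/*
 * Overview, roughly: writes that cover less than a full stripe have to
 * go through a read/modify/write (rmw) cycle so parity can be
 * recalculated.  Each incoming write is wrapped in a btrfs_raid_bio
 * (rbio), and rbios that land in the same full stripe are hashed into
 * the stripe hash table below.  The table provides the stripe lock and
 * gives us a chance to merge more bios into a single rmw, or to skip
 * the read phase entirely once the queued bios add up to a full stripe.
 */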

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;

	/*
	 * logical block numbers for the start of each stripe
	 * The last one or two are p/q.  These are sorted,
	 * so raid_map[0] is the start of our full stripe
	 */
	u64 *raid_map;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * also protected by the bio_list_lock, the
	 * stripe locking code uses plug_list to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	int read_rebuild;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	atomic_t refs;

	/*
	 * these are two arrays of pointers.  We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
	if (!table)
		return -ENOMEM;

	table->table = (void *)(table + 1);
	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
		init_waitqueue_head(&cur->wait);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	if (x)
		kfree(x);
	return 0;
}
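
/*
 * Note on the cmpxchg above: the table is installed without holding any
 * lock, so two callers racing here may both allocate one.  Only the
 * first cmpxchg wins; the loser sees a non-NULL return value and just
 * frees its own copy, leaving the winner's table in place.
 */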

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->bio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	bio_list_init(&victim->bio_list);
}

/*
 * free the hash table; used at unmount time
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	kfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}
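
/*
 * A quick sketch of the parity math used below: for raid5 the P block
 * is just the xor of all the data blocks in the horizontal stripe,
 *
 *	P = D0 ^ D1 ^ ... ^ Dn-1
 *
 * so any one missing block can be rebuilt by xor'ing the survivors.
 * raid6 adds a second (Q) syndrome, which is generated for us by
 * raid6_call.gen_syndrome() from the lib/raid6 code.
 */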

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * returns true if the bio list inside this rbio
 * covers an entire stripe (no rmw required).
 * Must be called with the bio list lock held, or
 * at a time when you know it is impossible to add
 * new bios into the list
 */
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;

	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	return ret;
}

static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	ret = __rbio_is_full(rbio);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
	return ret;
}
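
/*
 * "Full" here means the bios queued on this rbio cover every byte of
 * every data stripe (bio_list_bytes == nr_data * stripe_len).  In that
 * case nothing has to be read back from disk and parity can be computed
 * straight from the pages the higher layers handed us.
 */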

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	if (last->raid_map[0] !=
	    cur->raid_map[0])
		return 0;

	/* reads can't merge with writes */
	if (last->read_rebuild !=
	    cur->read_rebuild) {
		return 0;
	}

	return 1;
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
		return NULL;

	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
		PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}
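
/*
 * Example of the index math above: with a 64K stripe_len and 4K pages,
 * each stripe is 16 pages.  For a 3 disk raid5 (nr_data == 2) the
 * stripe_pages array holds pages 0-15 for data stripe 0, 16-31 for data
 * stripe 1 and 32-47 for P, so rbio_pstripe_page(rbio, 5) returns
 * stripe_pages[37].
 */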

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	DEFINE_WAIT(wait);
	struct btrfs_raid_bio *freeit = NULL;
	int ret = 0;
	int walk = 0;

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		walk++;
		if (cur->raid_map[0] == rbio->raid_map[0]) {
			spin_lock(&cur->bio_list_lock);

			/* can we merge into the lock owner? */
			if (rbio_can_merge(cur, rbio)) {
				merge_rbio(cur, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}

			/*
			 * we couldn't merge with the running
			 * rbio, see if we can merge with the
			 * pending ones.  We don't have to
			 * check for rmw_locked because there
			 * is no way they are inside finish_rmw
			 * right now
			 */
			list_for_each_entry(pending, &cur->plug_list,
					    plug_list) {
				if (rbio_can_merge(pending, rbio)) {
					merge_rbio(pending, rbio);
					spin_unlock(&cur->bio_list_lock);
					freeit = rbio;
					ret = 1;
					goto out;
				}
			}

			/* no merging, put us on the tail of the plug list,
			 * our rbio will be started when the currently
			 * running rbio unlocks
			 */
			list_add_tail(&rbio->plug_list, &cur->plug_list);
			spin_unlock(&cur->bio_list_lock);
			ret = 1;
			goto out;
		}
	}

	atomic_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {

		list_del_init(&rbio->hash_list);
		atomic_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			atomic_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->read_rebuild)
				async_read_rebuild(next);
			else
				async_rmw_stripe(next);

			goto done_nolock;

		} else if (waitqueue_active(&h->wait)) {
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);
			wake_up(&h->wait);
			goto done_nolock;
		}
	}
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	return;
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	WARN_ON(atomic_read(&rbio->refs) < 0);
	if (!atomic_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}
	kfree(rbio->raid_map);
	kfree(rbio->bbio);
	kfree(rbio);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	unlock_stripe(rbio);
	__free_raid_bio(rbio);
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *next;
	free_raid_bio(rbio);

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		if (uptodate)
			set_bit(BIO_UPTODATE, &cur->bi_flags);
		bio_endio(cur, err);
		cur = next;
	}
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	err = 0;

	/* OK, we have written all the stripes we need to. */
	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
	return;
}
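
/*
 * Error accounting for the write path above: each failed stripe bumps
 * bbio->error via fail_bio_stripe().  The write as a whole is only
 * failed with -EIO once more stripes have gone bad than bbio->max_errors
 * allows, i.e. more than the raid level can tolerate (max_errors is
 * filled in when the bbio is mapped).
 */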

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	unsigned long nr = stripe_len * nr_stripes;
	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
			  struct btrfs_bio *bbio, u64 *raid_map,
			  u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
	void *p;

	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
			GFP_NOFS);
	if (!rbio) {
		kfree(raid_map);
		kfree(bbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->raid_map = raid_map;
	rbio->fs_info = root->fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->faila = -1;
	rbio->failb = -1;
	atomic_set(&rbio->refs, 1);

	/*
	 * the stripe_pages and bio_pages array point to the extra
	 * memory we allocated past the end of the rbio
	 */
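	/*
	 * Layout of that extra memory: the kzalloc above reserved room
	 * for two arrays of num_pages page pointers directly behind the
	 * struct,
	 *
	 *	[ struct btrfs_raid_bio ][ stripe_pages[] ][ bio_pages[] ]
	 *
	 * and the assignments below just carve it up.
	 */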
	p = rbio + 1;
	rbio->stripe_pages = p;
	rbio->bio_pages = p + sizeof(struct page *) * num_pages;

	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
		nr_data = bbio->num_stripes - 2;
	else
		nr_data = bbio->num_stripes - 1;

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
		ClearPageUptodate(page);
	}
	return 0;
}

/* allocate pages for just the p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}
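
/*
 * The difference between the two allocators above: a partial stripe
 * write has to read the untouched data pages back in, so it needs the
 * whole stripe_pages array populated.  A full stripe write already has
 * every data page in its bio_list, so only the trailing p/q pages need
 * to be allocated, which is why alloc_rbio_parity_pages starts its loop
 * at the first parity page.
 */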

/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
int rbio_add_io_page(struct btrfs_raid_bio *rbio,
		     struct bio_list *bio_list,
		     struct page *page,
		     int stripe_nr,
		     unsigned long page_index,
		     unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	u64 last_end = 0;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		last_end = (u64)last->bi_sector << 9;
		last_end += last->bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && stripe->dev->bdev &&
		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
			if (ret == PAGE_CACHE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
	if (!bio)
		return -ENOMEM;

	bio->bi_size = 0;
	bio->bi_bdev = stripe->dev->bdev;
	bio->bi_sector = disk_start >> 9;
	set_bit(BIO_UPTODATE, &bio->bi_flags);

	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
{
	int index;
	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
	index += page;
	return rbio->stripe_pages[index];
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;
	struct page *p;
	int i;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		start = (u64)bio->bi_sector << 9;
		stripe_offset = start - rbio->raid_map[0];
		page_index = stripe_offset >> PAGE_CACHE_SHIFT;

		for (i = 0; i < bio->bi_vcnt; i++) {
			p = bio->bi_io_vec[i].bv_page;
			rbio->bio_pages[page_index + i] = p;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}
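
/*
 * In finish_rmw() below, parity is generated one page at a time.  For
 * each page number we build an array of kmap'd pointers laid out as
 *
 *	pointers[0 .. nr_data-1]  data pages
 *	pointers[nr_data]         P page
 *	pointers[nr_data + 1]     Q page (raid6 only)
 *
 * which is the layout raid6_call.gen_syndrome() expects; the raid5 case
 * just xors the data pointers into the P page via run_xor().
 */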

/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void *pointers[bbio->num_stripes];
	int stripe_len = rbio->stripe_len;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct bio_list bio_list;
	struct bio *bio;
	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
	int ret;

	bio_list_init(&bio_list);

	if (bbio->num_stripes - rbio->nr_data == 1) {
		p_stripe = bbio->num_stripes - 1;
	} else if (bbio->num_stripes - rbio->nr_data == 2) {
		p_stripe = bbio->num_stripes - 2;
		q_stripe = bbio->num_stripes - 1;
	} else {
		BUG();
	}

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->bbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 */
	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
		struct page *p;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap(p);

		if (q_stripe != -1) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap(p);

			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
		}

		for (stripe = 0; stripe < bbio->num_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list,
				       page, stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(WRITE, bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}

/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_sector;
	u64 stripe_start;
	int i;
	struct btrfs_bio_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bbio->num_stripes; i++) {
		stripe = &rbio->bbio->stripes[i];
		stripe_start = stripe->physical;
		if (physical >= stripe_start &&
		    physical < stripe_start + rbio->stripe_len) {
			return i;
		}
	}
	return -1;
}
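
/*
 * find_bio_stripe() above works on bios we built ourselves, which carry
 * per-device physical sector numbers.  The variant below handles bios
 * that came from the higher layers and are still addressed by logical
 * block number, so it compares against raid_map[] instead of the
 * per-device stripe->physical offsets.
 */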

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_sector;
	u64 stripe_start;
	int i;

	logical <<= 9;

	for (i = 0; i < rbio->nr_data; i++) {
		stripe_start = rbio->raid_map[i];
		if (logical >= stripe_start &&
		    logical < stripe_start + rbio->stripe_len) {
			return i;
		}
	}
	return -1;
}

/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->bbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->bbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}
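
/*
 * faila/failb above can record at most two failed stripes, which is all
 * the raid6 math can reconstruct; a third distinct failure makes
 * fail_rbio_index() return -EIO.  The completion paths additionally
 * compare bbio->error against bbio->max_errors, so a raid5 rbio gives
 * up as soon as a second stripe goes bad.
 */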

/*
 * this sets each page in the bio uptodate.  It should only be used
 * on private rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
	int i;
	struct page *p;

	for (i = 0; i < bio->bi_vcnt; i++) {
		p = bio->bi_io_vec[i].bv_page;
		SetPageUptodate(p);
	}
}

/*
 * end io for the read phase of the rmw cycle.  All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of the
 * stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid_rmw_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	err = 0;
	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	/*
	 * this will normally call finish_rmw to start our write
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_rmw(rbio);
	return;

cleanup:

	rbio_orig_end_io(rbio, -EIO, 0);
}

static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	rbio->work.flags = 0;
	rbio->work.func = rmw_work;

	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
			   &rbio->work);
}

static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
	rbio->work.flags = 0;
	rbio->work.func = read_rebuild_work;

	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
			   &rbio->work);
}
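
/*
 * Both helpers above just punt the rbio to the rmw_workers thread pool;
 * only the work function differs (rmw_work for the normal rmw path,
 * read_rebuild_work for rebuilding data for a degraded read).  Either
 * way the heavy lifting runs in a helper thread rather than in the
 * end_io or unplug context that scheduled it.
 */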
1114*53b381b3SDavid Woodhouse } 1115*53b381b3SDavid Woodhouse 1116*53b381b3SDavid Woodhouse /* 1117*53b381b3SDavid Woodhouse * the stripe must be locked by the caller. It will 1118*53b381b3SDavid Woodhouse * unlock after all the writes are done 1119*53b381b3SDavid Woodhouse */ 1120*53b381b3SDavid Woodhouse static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1121*53b381b3SDavid Woodhouse { 1122*53b381b3SDavid Woodhouse int bios_to_read = 0; 1123*53b381b3SDavid Woodhouse struct btrfs_bio *bbio = rbio->bbio; 1124*53b381b3SDavid Woodhouse struct bio_list bio_list; 1125*53b381b3SDavid Woodhouse int ret; 1126*53b381b3SDavid Woodhouse int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1127*53b381b3SDavid Woodhouse int pagenr; 1128*53b381b3SDavid Woodhouse int stripe; 1129*53b381b3SDavid Woodhouse struct bio *bio; 1130*53b381b3SDavid Woodhouse 1131*53b381b3SDavid Woodhouse bio_list_init(&bio_list); 1132*53b381b3SDavid Woodhouse 1133*53b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 1134*53b381b3SDavid Woodhouse if (ret) 1135*53b381b3SDavid Woodhouse goto cleanup; 1136*53b381b3SDavid Woodhouse 1137*53b381b3SDavid Woodhouse index_rbio_pages(rbio); 1138*53b381b3SDavid Woodhouse 1139*53b381b3SDavid Woodhouse atomic_set(&rbio->bbio->error, 0); 1140*53b381b3SDavid Woodhouse /* 1141*53b381b3SDavid Woodhouse * build a list of bios to read all the missing parts of this 1142*53b381b3SDavid Woodhouse * stripe 1143*53b381b3SDavid Woodhouse */ 1144*53b381b3SDavid Woodhouse for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1145*53b381b3SDavid Woodhouse for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1146*53b381b3SDavid Woodhouse struct page *page; 1147*53b381b3SDavid Woodhouse /* 1148*53b381b3SDavid Woodhouse * we want to find all the pages missing from 1149*53b381b3SDavid Woodhouse * the rbio and read them from the disk. If 1150*53b381b3SDavid Woodhouse * page_in_rbio finds a page in the bio list 1151*53b381b3SDavid Woodhouse * we don't need to read it off the stripe. 1152*53b381b3SDavid Woodhouse */ 1153*53b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 1); 1154*53b381b3SDavid Woodhouse if (page) 1155*53b381b3SDavid Woodhouse continue; 1156*53b381b3SDavid Woodhouse 1157*53b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 1158*53b381b3SDavid Woodhouse ret = rbio_add_io_page(rbio, &bio_list, page, 1159*53b381b3SDavid Woodhouse stripe, pagenr, rbio->stripe_len); 1160*53b381b3SDavid Woodhouse if (ret) 1161*53b381b3SDavid Woodhouse goto cleanup; 1162*53b381b3SDavid Woodhouse } 1163*53b381b3SDavid Woodhouse } 1164*53b381b3SDavid Woodhouse 1165*53b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 1166*53b381b3SDavid Woodhouse if (!bios_to_read) { 1167*53b381b3SDavid Woodhouse /* 1168*53b381b3SDavid Woodhouse * this can happen if others have merged with 1169*53b381b3SDavid Woodhouse * us, it means there is nothing left to read. 1170*53b381b3SDavid Woodhouse * But if there are missing devices it may not be 1171*53b381b3SDavid Woodhouse * safe to do the full stripe write yet. 1172*53b381b3SDavid Woodhouse */ 1173*53b381b3SDavid Woodhouse goto finish; 1174*53b381b3SDavid Woodhouse } 1175*53b381b3SDavid Woodhouse 1176*53b381b3SDavid Woodhouse /* 1177*53b381b3SDavid Woodhouse * the bbio may be freed once we submit the last bio. 
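* The end_io handlers can run as soon as each of those reads completes, and the final one may tear the whole rbio down.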
Make sure 1178*53b381b3SDavid Woodhouse * not to touch it after that 1179*53b381b3SDavid Woodhouse */ 1180*53b381b3SDavid Woodhouse atomic_set(&bbio->stripes_pending, bios_to_read); 1181*53b381b3SDavid Woodhouse while (1) { 1182*53b381b3SDavid Woodhouse bio = bio_list_pop(&bio_list); 1183*53b381b3SDavid Woodhouse if (!bio) 1184*53b381b3SDavid Woodhouse break; 1185*53b381b3SDavid Woodhouse 1186*53b381b3SDavid Woodhouse bio->bi_private = rbio; 1187*53b381b3SDavid Woodhouse bio->bi_end_io = raid_rmw_end_io; 1188*53b381b3SDavid Woodhouse 1189*53b381b3SDavid Woodhouse btrfs_bio_wq_end_io(rbio->fs_info, bio, 1190*53b381b3SDavid Woodhouse BTRFS_WQ_ENDIO_RAID56); 1191*53b381b3SDavid Woodhouse 1192*53b381b3SDavid Woodhouse BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 1193*53b381b3SDavid Woodhouse submit_bio(READ, bio); 1194*53b381b3SDavid Woodhouse } 1195*53b381b3SDavid Woodhouse /* the actual write will happen once the reads are done */ 1196*53b381b3SDavid Woodhouse return 0; 1197*53b381b3SDavid Woodhouse 1198*53b381b3SDavid Woodhouse cleanup: 1199*53b381b3SDavid Woodhouse rbio_orig_end_io(rbio, -EIO, 0); 1200*53b381b3SDavid Woodhouse return -EIO; 1201*53b381b3SDavid Woodhouse 1202*53b381b3SDavid Woodhouse finish: 1203*53b381b3SDavid Woodhouse validate_rbio_for_rmw(rbio); 1204*53b381b3SDavid Woodhouse return 0; 1205*53b381b3SDavid Woodhouse } 1206*53b381b3SDavid Woodhouse 1207*53b381b3SDavid Woodhouse /* 1208*53b381b3SDavid Woodhouse * if the upper layers pass in a full stripe, we thank them by only allocating 1209*53b381b3SDavid Woodhouse * enough pages to hold the parity, and sending it all down quickly. 1210*53b381b3SDavid Woodhouse */ 1211*53b381b3SDavid Woodhouse static int full_stripe_write(struct btrfs_raid_bio *rbio) 1212*53b381b3SDavid Woodhouse { 1213*53b381b3SDavid Woodhouse int ret; 1214*53b381b3SDavid Woodhouse 1215*53b381b3SDavid Woodhouse ret = alloc_rbio_parity_pages(rbio); 1216*53b381b3SDavid Woodhouse if (ret) 1217*53b381b3SDavid Woodhouse return ret; 1218*53b381b3SDavid Woodhouse 1219*53b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 1220*53b381b3SDavid Woodhouse if (ret == 0) 1221*53b381b3SDavid Woodhouse finish_rmw(rbio); 1222*53b381b3SDavid Woodhouse return 0; 1223*53b381b3SDavid Woodhouse } 1224*53b381b3SDavid Woodhouse 1225*53b381b3SDavid Woodhouse /* 1226*53b381b3SDavid Woodhouse * partial stripe writes get handed over to async helpers. 1227*53b381b3SDavid Woodhouse * We're really hoping to merge a few more writes into this 1228*53b381b3SDavid Woodhouse * rbio before calculating new parity 1229*53b381b3SDavid Woodhouse */ 1230*53b381b3SDavid Woodhouse static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1231*53b381b3SDavid Woodhouse { 1232*53b381b3SDavid Woodhouse int ret; 1233*53b381b3SDavid Woodhouse 1234*53b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 1235*53b381b3SDavid Woodhouse if (ret == 0) 1236*53b381b3SDavid Woodhouse async_rmw_stripe(rbio); 1237*53b381b3SDavid Woodhouse return 0; 1238*53b381b3SDavid Woodhouse } 1239*53b381b3SDavid Woodhouse 1240*53b381b3SDavid Woodhouse /* 1241*53b381b3SDavid Woodhouse * sometimes while we were reading from the drive to 1242*53b381b3SDavid Woodhouse * recalculate parity, enough new bios come in to create 1243*53b381b3SDavid Woodhouse * a full stripe.
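* (a full stripe here means the bio_list already covers every data byte in the stripe, so no reads are needed at all).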
So we do a check here to see if we can 1244*53b381b3SDavid Woodhouse * go directly to finish_rmw 1245*53b381b3SDavid Woodhouse */ 1246*53b381b3SDavid Woodhouse static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1247*53b381b3SDavid Woodhouse { 1248*53b381b3SDavid Woodhouse /* head off into rmw land if we don't have a full stripe */ 1249*53b381b3SDavid Woodhouse if (!rbio_is_full(rbio)) 1250*53b381b3SDavid Woodhouse return partial_stripe_write(rbio); 1251*53b381b3SDavid Woodhouse return full_stripe_write(rbio); 1252*53b381b3SDavid Woodhouse } 1253*53b381b3SDavid Woodhouse 1254*53b381b3SDavid Woodhouse /* 1255*53b381b3SDavid Woodhouse * our main entry point for writes from the rest of the FS. 1256*53b381b3SDavid Woodhouse */ 1257*53b381b3SDavid Woodhouse int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 1258*53b381b3SDavid Woodhouse struct btrfs_bio *bbio, u64 *raid_map, 1259*53b381b3SDavid Woodhouse u64 stripe_len) 1260*53b381b3SDavid Woodhouse { 1261*53b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 1262*53b381b3SDavid Woodhouse 1263*53b381b3SDavid Woodhouse rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1264*53b381b3SDavid Woodhouse if (IS_ERR(rbio)) { 1265*53b381b3SDavid Woodhouse kfree(raid_map); 1266*53b381b3SDavid Woodhouse kfree(bbio); 1267*53b381b3SDavid Woodhouse return PTR_ERR(rbio); 1268*53b381b3SDavid Woodhouse } 1269*53b381b3SDavid Woodhouse bio_list_add(&rbio->bio_list, bio); 1270*53b381b3SDavid Woodhouse rbio->bio_list_bytes = bio->bi_size; 1271*53b381b3SDavid Woodhouse return __raid56_parity_write(rbio); 1272*53b381b3SDavid Woodhouse } 1273*53b381b3SDavid Woodhouse 1274*53b381b3SDavid Woodhouse /* 1275*53b381b3SDavid Woodhouse * all parity reconstruction happens here. We've read in everything 1276*53b381b3SDavid Woodhouse * we can find from the drives and this does the heavy lifting of 1277*53b381b3SDavid Woodhouse * sorting the good from the bad. 
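* The cases handled below: a lone bad data stripe (or data plus Q) is rebuilt raid5 style from P, data plus P uses raid6_datap_recov(), two bad data stripes use raid6_2data_recov(), and losing both P and Q together is unrecoverable here.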
1278*53b381b3SDavid Woodhouse */ 1279*53b381b3SDavid Woodhouse static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1280*53b381b3SDavid Woodhouse { 1281*53b381b3SDavid Woodhouse int pagenr, stripe; 1282*53b381b3SDavid Woodhouse void **pointers; 1283*53b381b3SDavid Woodhouse int faila = -1, failb = -1; 1284*53b381b3SDavid Woodhouse int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1285*53b381b3SDavid Woodhouse struct page *page; 1286*53b381b3SDavid Woodhouse int err; 1287*53b381b3SDavid Woodhouse int i; 1288*53b381b3SDavid Woodhouse 1289*53b381b3SDavid Woodhouse pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), 1290*53b381b3SDavid Woodhouse GFP_NOFS); 1291*53b381b3SDavid Woodhouse if (!pointers) { 1292*53b381b3SDavid Woodhouse err = -ENOMEM; 1293*53b381b3SDavid Woodhouse goto cleanup_io; 1294*53b381b3SDavid Woodhouse } 1295*53b381b3SDavid Woodhouse 1296*53b381b3SDavid Woodhouse faila = rbio->faila; 1297*53b381b3SDavid Woodhouse failb = rbio->failb; 1298*53b381b3SDavid Woodhouse 1299*53b381b3SDavid Woodhouse if (rbio->read_rebuild) { 1300*53b381b3SDavid Woodhouse spin_lock_irq(&rbio->bio_list_lock); 1301*53b381b3SDavid Woodhouse set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1302*53b381b3SDavid Woodhouse spin_unlock_irq(&rbio->bio_list_lock); 1303*53b381b3SDavid Woodhouse } 1304*53b381b3SDavid Woodhouse 1305*53b381b3SDavid Woodhouse index_rbio_pages(rbio); 1306*53b381b3SDavid Woodhouse 1307*53b381b3SDavid Woodhouse for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1308*53b381b3SDavid Woodhouse /* setup our array of pointers with pages 1309*53b381b3SDavid Woodhouse * from each stripe 1310*53b381b3SDavid Woodhouse */ 1311*53b381b3SDavid Woodhouse for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1312*53b381b3SDavid Woodhouse /* 1313*53b381b3SDavid Woodhouse * if we're rebuilding a read, we have to use 1314*53b381b3SDavid Woodhouse * pages from the bio list 1315*53b381b3SDavid Woodhouse */ 1316*53b381b3SDavid Woodhouse if (rbio->read_rebuild && 1317*53b381b3SDavid Woodhouse (stripe == faila || stripe == failb)) { 1318*53b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 0); 1319*53b381b3SDavid Woodhouse } else { 1320*53b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 1321*53b381b3SDavid Woodhouse } 1322*53b381b3SDavid Woodhouse pointers[stripe] = kmap(page); 1323*53b381b3SDavid Woodhouse } 1324*53b381b3SDavid Woodhouse 1325*53b381b3SDavid Woodhouse /* all raid6 handling here */ 1326*53b381b3SDavid Woodhouse if (rbio->raid_map[rbio->bbio->num_stripes - 1] == 1327*53b381b3SDavid Woodhouse RAID6_Q_STRIPE) { 1328*53b381b3SDavid Woodhouse 1329*53b381b3SDavid Woodhouse /* 1330*53b381b3SDavid Woodhouse * single failure, rebuild from parity raid5 1331*53b381b3SDavid Woodhouse * style 1332*53b381b3SDavid Woodhouse */ 1333*53b381b3SDavid Woodhouse if (failb < 0) { 1334*53b381b3SDavid Woodhouse if (faila == rbio->nr_data) { 1335*53b381b3SDavid Woodhouse /* 1336*53b381b3SDavid Woodhouse * Just the P stripe has failed, without 1337*53b381b3SDavid Woodhouse * a bad data or Q stripe. 1338*53b381b3SDavid Woodhouse * TODO, we should redo the xor here. 
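* Until that is implemented we just return -EIO here, even though every data stripe is still intact.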
1339*53b381b3SDavid Woodhouse */ 1340*53b381b3SDavid Woodhouse err = -EIO; 1341*53b381b3SDavid Woodhouse goto cleanup; 1342*53b381b3SDavid Woodhouse } 1343*53b381b3SDavid Woodhouse /* 1344*53b381b3SDavid Woodhouse * a single failure in raid6 is rebuilt 1345*53b381b3SDavid Woodhouse * in the pstripe code below 1346*53b381b3SDavid Woodhouse */ 1347*53b381b3SDavid Woodhouse goto pstripe; 1348*53b381b3SDavid Woodhouse } 1349*53b381b3SDavid Woodhouse 1350*53b381b3SDavid Woodhouse /* make sure our ps and qs are in order */ 1351*53b381b3SDavid Woodhouse if (faila > failb) { 1352*53b381b3SDavid Woodhouse int tmp = failb; 1353*53b381b3SDavid Woodhouse failb = faila; 1354*53b381b3SDavid Woodhouse faila = tmp; 1355*53b381b3SDavid Woodhouse } 1356*53b381b3SDavid Woodhouse 1357*53b381b3SDavid Woodhouse /* if the q stripe is failed, do a pstripe reconstruction 1358*53b381b3SDavid Woodhouse * from the xors. 1359*53b381b3SDavid Woodhouse * If both the q stripe and the P stripe are failed, we're 1360*53b381b3SDavid Woodhouse * here due to a crc mismatch and we can't give them the 1361*53b381b3SDavid Woodhouse * data they want 1362*53b381b3SDavid Woodhouse */ 1363*53b381b3SDavid Woodhouse if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { 1364*53b381b3SDavid Woodhouse if (rbio->raid_map[faila] == RAID5_P_STRIPE) { 1365*53b381b3SDavid Woodhouse err = -EIO; 1366*53b381b3SDavid Woodhouse goto cleanup; 1367*53b381b3SDavid Woodhouse } 1368*53b381b3SDavid Woodhouse /* 1369*53b381b3SDavid Woodhouse * otherwise we have one bad data stripe and 1370*53b381b3SDavid Woodhouse * a good P stripe. raid5! 1371*53b381b3SDavid Woodhouse */ 1372*53b381b3SDavid Woodhouse goto pstripe; 1373*53b381b3SDavid Woodhouse } 1374*53b381b3SDavid Woodhouse 1375*53b381b3SDavid Woodhouse if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 1376*53b381b3SDavid Woodhouse raid6_datap_recov(rbio->bbio->num_stripes, 1377*53b381b3SDavid Woodhouse PAGE_SIZE, faila, pointers); 1378*53b381b3SDavid Woodhouse } else { 1379*53b381b3SDavid Woodhouse raid6_2data_recov(rbio->bbio->num_stripes, 1380*53b381b3SDavid Woodhouse PAGE_SIZE, faila, failb, 1381*53b381b3SDavid Woodhouse pointers); 1382*53b381b3SDavid Woodhouse } 1383*53b381b3SDavid Woodhouse } else { 1384*53b381b3SDavid Woodhouse void *p; 1385*53b381b3SDavid Woodhouse 1386*53b381b3SDavid Woodhouse /* rebuild from P stripe here (raid5 or raid6) */ 1387*53b381b3SDavid Woodhouse BUG_ON(failb != -1); 1388*53b381b3SDavid Woodhouse pstripe: 1389*53b381b3SDavid Woodhouse /* Copy parity block into failed block to start with */ 1390*53b381b3SDavid Woodhouse memcpy(pointers[faila], 1391*53b381b3SDavid Woodhouse pointers[rbio->nr_data], 1392*53b381b3SDavid Woodhouse PAGE_CACHE_SIZE); 1393*53b381b3SDavid Woodhouse 1394*53b381b3SDavid Woodhouse /* rearrange the pointer array */ 1395*53b381b3SDavid Woodhouse p = pointers[faila]; 1396*53b381b3SDavid Woodhouse for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 1397*53b381b3SDavid Woodhouse pointers[stripe] = pointers[stripe + 1]; 1398*53b381b3SDavid Woodhouse pointers[rbio->nr_data - 1] = p; 1399*53b381b3SDavid Woodhouse 1400*53b381b3SDavid Woodhouse /* xor in the rest */ 1401*53b381b3SDavid Woodhouse run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); 1402*53b381b3SDavid Woodhouse } 1403*53b381b3SDavid Woodhouse /* if we're doing this rebuild as part of an rmw, go through 1404*53b381b3SDavid Woodhouse * and set all of our private rbio pages in the 1405*53b381b3SDavid Woodhouse * failed stripes as uptodate. 
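* (those pages now hold the reconstructed contents computed above, not whatever came off the bad drive; a small standalone sketch of this xor rebuild is appended at the end of the file).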
This way finish_rmw will 1406*53b381b3SDavid Woodhouse * know they can be trusted. If this was a read reconstruction, 1407*53b381b3SDavid Woodhouse * other endio functions will fiddle the uptodate bits 1408*53b381b3SDavid Woodhouse */ 1409*53b381b3SDavid Woodhouse if (!rbio->read_rebuild) { 1410*53b381b3SDavid Woodhouse for (i = 0; i < nr_pages; i++) { 1411*53b381b3SDavid Woodhouse if (faila != -1) { 1412*53b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, faila, i); 1413*53b381b3SDavid Woodhouse SetPageUptodate(page); 1414*53b381b3SDavid Woodhouse } 1415*53b381b3SDavid Woodhouse if (failb != -1) { 1416*53b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, failb, i); 1417*53b381b3SDavid Woodhouse SetPageUptodate(page); 1418*53b381b3SDavid Woodhouse } 1419*53b381b3SDavid Woodhouse } 1420*53b381b3SDavid Woodhouse } 1421*53b381b3SDavid Woodhouse for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { 1422*53b381b3SDavid Woodhouse /* 1423*53b381b3SDavid Woodhouse * if we're rebuilding a read, we have to use 1424*53b381b3SDavid Woodhouse * pages from the bio list 1425*53b381b3SDavid Woodhouse */ 1426*53b381b3SDavid Woodhouse if (rbio->read_rebuild && 1427*53b381b3SDavid Woodhouse (stripe == faila || stripe == failb)) { 1428*53b381b3SDavid Woodhouse page = page_in_rbio(rbio, stripe, pagenr, 0); 1429*53b381b3SDavid Woodhouse } else { 1430*53b381b3SDavid Woodhouse page = rbio_stripe_page(rbio, stripe, pagenr); 1431*53b381b3SDavid Woodhouse } 1432*53b381b3SDavid Woodhouse kunmap(page); 1433*53b381b3SDavid Woodhouse } 1434*53b381b3SDavid Woodhouse } 1435*53b381b3SDavid Woodhouse 1436*53b381b3SDavid Woodhouse err = 0; 1437*53b381b3SDavid Woodhouse cleanup: 1438*53b381b3SDavid Woodhouse kfree(pointers); 1439*53b381b3SDavid Woodhouse 1440*53b381b3SDavid Woodhouse cleanup_io: 1441*53b381b3SDavid Woodhouse 1442*53b381b3SDavid Woodhouse if (rbio->read_rebuild) { 1443*53b381b3SDavid Woodhouse rbio_orig_end_io(rbio, err, err == 0); 1444*53b381b3SDavid Woodhouse } else if (err == 0) { 1445*53b381b3SDavid Woodhouse rbio->faila = -1; 1446*53b381b3SDavid Woodhouse rbio->failb = -1; 1447*53b381b3SDavid Woodhouse finish_rmw(rbio); 1448*53b381b3SDavid Woodhouse } else { 1449*53b381b3SDavid Woodhouse rbio_orig_end_io(rbio, err, 0); 1450*53b381b3SDavid Woodhouse } 1451*53b381b3SDavid Woodhouse } 1452*53b381b3SDavid Woodhouse 1453*53b381b3SDavid Woodhouse /* 1454*53b381b3SDavid Woodhouse * This is called only for stripes we've read from disk to 1455*53b381b3SDavid Woodhouse * reconstruct the parity. 
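* The bookkeeping mirrors raid_rmw_end_io: record any failure, and when the last read finishes either give up or run __raid_recover_end_io.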
1456*53b381b3SDavid Woodhouse */ 1457*53b381b3SDavid Woodhouse static void raid_recover_end_io(struct bio *bio, int err) 1458*53b381b3SDavid Woodhouse { 1459*53b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio = bio->bi_private; 1460*53b381b3SDavid Woodhouse 1461*53b381b3SDavid Woodhouse /* 1462*53b381b3SDavid Woodhouse * we only read stripe pages off the disk, set them 1463*53b381b3SDavid Woodhouse * up to date if there were no errors 1464*53b381b3SDavid Woodhouse */ 1465*53b381b3SDavid Woodhouse if (err) 1466*53b381b3SDavid Woodhouse fail_bio_stripe(rbio, bio); 1467*53b381b3SDavid Woodhouse else 1468*53b381b3SDavid Woodhouse set_bio_pages_uptodate(bio); 1469*53b381b3SDavid Woodhouse bio_put(bio); 1470*53b381b3SDavid Woodhouse 1471*53b381b3SDavid Woodhouse if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1472*53b381b3SDavid Woodhouse return; 1473*53b381b3SDavid Woodhouse 1474*53b381b3SDavid Woodhouse if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1475*53b381b3SDavid Woodhouse rbio_orig_end_io(rbio, -EIO, 0); 1476*53b381b3SDavid Woodhouse else 1477*53b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 1478*53b381b3SDavid Woodhouse } 1479*53b381b3SDavid Woodhouse 1480*53b381b3SDavid Woodhouse /* 1481*53b381b3SDavid Woodhouse * reads everything we need off the disk to reconstruct 1482*53b381b3SDavid Woodhouse * the parity. endio handlers trigger final reconstruction 1483*53b381b3SDavid Woodhouse * when the IO is done. 1484*53b381b3SDavid Woodhouse * 1485*53b381b3SDavid Woodhouse * This is used both for reads from the higher layers and for 1486*53b381b3SDavid Woodhouse * parity construction required to finish a rmw cycle. 1487*53b381b3SDavid Woodhouse */ 1488*53b381b3SDavid Woodhouse static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 1489*53b381b3SDavid Woodhouse { 1490*53b381b3SDavid Woodhouse int bios_to_read = 0; 1491*53b381b3SDavid Woodhouse struct btrfs_bio *bbio = rbio->bbio; 1492*53b381b3SDavid Woodhouse struct bio_list bio_list; 1493*53b381b3SDavid Woodhouse int ret; 1494*53b381b3SDavid Woodhouse int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1495*53b381b3SDavid Woodhouse int pagenr; 1496*53b381b3SDavid Woodhouse int stripe; 1497*53b381b3SDavid Woodhouse struct bio *bio; 1498*53b381b3SDavid Woodhouse 1499*53b381b3SDavid Woodhouse bio_list_init(&bio_list); 1500*53b381b3SDavid Woodhouse 1501*53b381b3SDavid Woodhouse ret = alloc_rbio_pages(rbio); 1502*53b381b3SDavid Woodhouse if (ret) 1503*53b381b3SDavid Woodhouse goto cleanup; 1504*53b381b3SDavid Woodhouse 1505*53b381b3SDavid Woodhouse atomic_set(&rbio->bbio->error, 0); 1506*53b381b3SDavid Woodhouse 1507*53b381b3SDavid Woodhouse /* 1508*53b381b3SDavid Woodhouse * read everything that hasn't failed. 
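* The stripes marked as failed are skipped; __raid_recover_end_io will rebuild their contents from whatever we manage to read here.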
1509*53b381b3SDavid Woodhouse */ 1510*53b381b3SDavid Woodhouse for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 1511*53b381b3SDavid Woodhouse if (rbio->faila == stripe || 1512*53b381b3SDavid Woodhouse rbio->failb == stripe) 1513*53b381b3SDavid Woodhouse continue; 1514*53b381b3SDavid Woodhouse 1515*53b381b3SDavid Woodhouse for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1516*53b381b3SDavid Woodhouse struct page *p; 1517*53b381b3SDavid Woodhouse 1518*53b381b3SDavid Woodhouse /* 1519*53b381b3SDavid Woodhouse * the rmw code may have already read this 1520*53b381b3SDavid Woodhouse * page in 1521*53b381b3SDavid Woodhouse */ 1522*53b381b3SDavid Woodhouse p = rbio_stripe_page(rbio, stripe, pagenr); 1523*53b381b3SDavid Woodhouse if (PageUptodate(p)) 1524*53b381b3SDavid Woodhouse continue; 1525*53b381b3SDavid Woodhouse 1526*53b381b3SDavid Woodhouse ret = rbio_add_io_page(rbio, &bio_list, 1527*53b381b3SDavid Woodhouse rbio_stripe_page(rbio, stripe, pagenr), 1528*53b381b3SDavid Woodhouse stripe, pagenr, rbio->stripe_len); 1529*53b381b3SDavid Woodhouse if (ret < 0) 1530*53b381b3SDavid Woodhouse goto cleanup; 1531*53b381b3SDavid Woodhouse } 1532*53b381b3SDavid Woodhouse } 1533*53b381b3SDavid Woodhouse 1534*53b381b3SDavid Woodhouse bios_to_read = bio_list_size(&bio_list); 1535*53b381b3SDavid Woodhouse if (!bios_to_read) { 1536*53b381b3SDavid Woodhouse /* 1537*53b381b3SDavid Woodhouse * we might have no bios to read just because the pages 1538*53b381b3SDavid Woodhouse * were up to date, or we might have no bios to read because 1539*53b381b3SDavid Woodhouse * the devices were gone. 1540*53b381b3SDavid Woodhouse */ 1541*53b381b3SDavid Woodhouse if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { 1542*53b381b3SDavid Woodhouse __raid_recover_end_io(rbio); 1543*53b381b3SDavid Woodhouse goto out; 1544*53b381b3SDavid Woodhouse } else { 1545*53b381b3SDavid Woodhouse goto cleanup; 1546*53b381b3SDavid Woodhouse } 1547*53b381b3SDavid Woodhouse } 1548*53b381b3SDavid Woodhouse 1549*53b381b3SDavid Woodhouse /* 1550*53b381b3SDavid Woodhouse * the bbio may be freed once we submit the last bio. Make sure 1551*53b381b3SDavid Woodhouse * not to touch it after that 1552*53b381b3SDavid Woodhouse */ 1553*53b381b3SDavid Woodhouse atomic_set(&bbio->stripes_pending, bios_to_read); 1554*53b381b3SDavid Woodhouse while (1) { 1555*53b381b3SDavid Woodhouse bio = bio_list_pop(&bio_list); 1556*53b381b3SDavid Woodhouse if (!bio) 1557*53b381b3SDavid Woodhouse break; 1558*53b381b3SDavid Woodhouse 1559*53b381b3SDavid Woodhouse bio->bi_private = rbio; 1560*53b381b3SDavid Woodhouse bio->bi_end_io = raid_recover_end_io; 1561*53b381b3SDavid Woodhouse 1562*53b381b3SDavid Woodhouse btrfs_bio_wq_end_io(rbio->fs_info, bio, 1563*53b381b3SDavid Woodhouse BTRFS_WQ_ENDIO_RAID56); 1564*53b381b3SDavid Woodhouse 1565*53b381b3SDavid Woodhouse BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 1566*53b381b3SDavid Woodhouse submit_bio(READ, bio); 1567*53b381b3SDavid Woodhouse } 1568*53b381b3SDavid Woodhouse out: 1569*53b381b3SDavid Woodhouse return 0; 1570*53b381b3SDavid Woodhouse 1571*53b381b3SDavid Woodhouse cleanup: 1572*53b381b3SDavid Woodhouse if (rbio->read_rebuild) 1573*53b381b3SDavid Woodhouse rbio_orig_end_io(rbio, -EIO, 0); 1574*53b381b3SDavid Woodhouse return -EIO; 1575*53b381b3SDavid Woodhouse } 1576*53b381b3SDavid Woodhouse 1577*53b381b3SDavid Woodhouse /* 1578*53b381b3SDavid Woodhouse * the main entry point for reads from the higher layers. 
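* (raid56_parity_recover below; the caller passes in the same bbio and raid_map a write would use, plus the mirror number it is retrying).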
This 1579*53b381b3SDavid Woodhouse * is really only called when the normal read path had a failure, 1580*53b381b3SDavid Woodhouse * so we assume the bio they send down corresponds to a failed part 1581*53b381b3SDavid Woodhouse * of the drive. 1582*53b381b3SDavid Woodhouse */ 1583*53b381b3SDavid Woodhouse int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 1584*53b381b3SDavid Woodhouse struct btrfs_bio *bbio, u64 *raid_map, 1585*53b381b3SDavid Woodhouse u64 stripe_len, int mirror_num) 1586*53b381b3SDavid Woodhouse { 1587*53b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 1588*53b381b3SDavid Woodhouse int ret; 1589*53b381b3SDavid Woodhouse 1590*53b381b3SDavid Woodhouse rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1591*53b381b3SDavid Woodhouse if (IS_ERR(rbio)) { 1592*53b381b3SDavid Woodhouse return PTR_ERR(rbio); 1593*53b381b3SDavid Woodhouse } 1594*53b381b3SDavid Woodhouse 1595*53b381b3SDavid Woodhouse rbio->read_rebuild = 1; 1596*53b381b3SDavid Woodhouse bio_list_add(&rbio->bio_list, bio); 1597*53b381b3SDavid Woodhouse rbio->bio_list_bytes = bio->bi_size; 1598*53b381b3SDavid Woodhouse 1599*53b381b3SDavid Woodhouse rbio->faila = find_logical_bio_stripe(rbio, bio); 1600*53b381b3SDavid Woodhouse if (rbio->faila == -1) { 1601*53b381b3SDavid Woodhouse BUG(); 1602*53b381b3SDavid Woodhouse kfree(rbio); 1603*53b381b3SDavid Woodhouse return -EIO; 1604*53b381b3SDavid Woodhouse } 1605*53b381b3SDavid Woodhouse 1606*53b381b3SDavid Woodhouse /* 1607*53b381b3SDavid Woodhouse * reconstruct from the q stripe if they are 1608*53b381b3SDavid Woodhouse * asking for mirror 3 1609*53b381b3SDavid Woodhouse */ 1610*53b381b3SDavid Woodhouse if (mirror_num == 3) 1611*53b381b3SDavid Woodhouse rbio->failb = bbio->num_stripes - 2; 1612*53b381b3SDavid Woodhouse 1613*53b381b3SDavid Woodhouse ret = lock_stripe_add(rbio); 1614*53b381b3SDavid Woodhouse 1615*53b381b3SDavid Woodhouse /* 1616*53b381b3SDavid Woodhouse * __raid56_parity_recover will end the bio with 1617*53b381b3SDavid Woodhouse * any errors it hits. 
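* (its cleanup paths call rbio_orig_end_io directly).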
We don't want to return 1618*53b381b3SDavid Woodhouse * its error value up the stack because our caller 1619*53b381b3SDavid Woodhouse * will end up calling bio_endio with any nonzero 1620*53b381b3SDavid Woodhouse * return value 1621*53b381b3SDavid Woodhouse */ 1622*53b381b3SDavid Woodhouse if (ret == 0) 1623*53b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 1624*53b381b3SDavid Woodhouse /* 1625*53b381b3SDavid Woodhouse * otherwise our rbio has been added to the list of 1626*53b381b3SDavid Woodhouse * rbios that will be handled after the 1627*53b381b3SDavid Woodhouse * current lock owner is done 1628*53b381b3SDavid Woodhouse */ 1629*53b381b3SDavid Woodhouse return 0; 1630*53b381b3SDavid Woodhouse 1631*53b381b3SDavid Woodhouse } 1632*53b381b3SDavid Woodhouse 1633*53b381b3SDavid Woodhouse static void rmw_work(struct btrfs_work *work) 1634*53b381b3SDavid Woodhouse { 1635*53b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 1636*53b381b3SDavid Woodhouse 1637*53b381b3SDavid Woodhouse rbio = container_of(work, struct btrfs_raid_bio, work); 1638*53b381b3SDavid Woodhouse raid56_rmw_stripe(rbio); 1639*53b381b3SDavid Woodhouse } 1640*53b381b3SDavid Woodhouse 1641*53b381b3SDavid Woodhouse static void read_rebuild_work(struct btrfs_work *work) 1642*53b381b3SDavid Woodhouse { 1643*53b381b3SDavid Woodhouse struct btrfs_raid_bio *rbio; 1644*53b381b3SDavid Woodhouse 1645*53b381b3SDavid Woodhouse rbio = container_of(work, struct btrfs_raid_bio, work); 1646*53b381b3SDavid Woodhouse __raid56_parity_recover(rbio); 1647*53b381b3SDavid Woodhouse } 1648
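/*
 * A minimal, standalone userspace sketch of the raid5-style pstripe rebuild
 * used in __raid_recover_end_io above: parity is the xor of all the data
 * blocks, so one missing block is the xor of the parity with the surviving
 * blocks.  This is illustration only and is not part of this file; the block
 * count, block size and buffer names are made up for the example.
 */
#include <stdio.h>
#include <string.h>

#define DEMO_NR_DATA	3	/* data blocks in the demo stripe */
#define DEMO_BLOCK_SIZE	8	/* bytes per block, tiny on purpose */

static void demo_xor_into(unsigned char *dst, const unsigned char *src)
{
	int i;

	for (i = 0; i < DEMO_BLOCK_SIZE; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	unsigned char data[DEMO_NR_DATA][DEMO_BLOCK_SIZE] = {
		"block-0", "block-1", "block-2",
	};
	unsigned char parity[DEMO_BLOCK_SIZE] = { 0 };
	unsigned char rebuilt[DEMO_BLOCK_SIZE];
	int lost = 1;	/* pretend data[1] came off a failed drive */
	int i;

	/* compute parity the way a full stripe write would: xor of all data */
	for (i = 0; i < DEMO_NR_DATA; i++)
		demo_xor_into(parity, data[i]);

	/* rebuild: start from the parity, xor back in every surviving block */
	memcpy(rebuilt, parity, DEMO_BLOCK_SIZE);
	for (i = 0; i < DEMO_NR_DATA; i++)
		if (i != lost)
			demo_xor_into(rebuilt, data[i]);

	printf("recovered \"%s\", expected \"%s\"\n",
	       (char *)rebuilt, (char *)data[lost]);
	return memcmp(rebuilt, data[lost], DEMO_BLOCK_SIZE) ? 1 : 0;
}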