xref: /linux/fs/btrfs/raid56.c (revision 53b381b3abeb86f12787a6c40fee9b2f71edc23b)
1*53b381b3SDavid Woodhouse /*
2*53b381b3SDavid Woodhouse  * Copyright (C) 2012 Fusion-io  All rights reserved.
3*53b381b3SDavid Woodhouse  * Copyright (C) 2012 Intel Corp. All rights reserved.
4*53b381b3SDavid Woodhouse  *
5*53b381b3SDavid Woodhouse  * This program is free software; you can redistribute it and/or
6*53b381b3SDavid Woodhouse  * modify it under the terms of the GNU General Public
7*53b381b3SDavid Woodhouse  * License v2 as published by the Free Software Foundation.
8*53b381b3SDavid Woodhouse  *
9*53b381b3SDavid Woodhouse  * This program is distributed in the hope that it will be useful,
10*53b381b3SDavid Woodhouse  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11*53b381b3SDavid Woodhouse  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12*53b381b3SDavid Woodhouse  * General Public License for more details.
13*53b381b3SDavid Woodhouse  *
14*53b381b3SDavid Woodhouse  * You should have received a copy of the GNU General Public
15*53b381b3SDavid Woodhouse  * License along with this program; if not, write to the
16*53b381b3SDavid Woodhouse  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17*53b381b3SDavid Woodhouse  * Boston, MA 02111-1307, USA.
18*53b381b3SDavid Woodhouse  */
19*53b381b3SDavid Woodhouse #include <linux/sched.h>
20*53b381b3SDavid Woodhouse #include <linux/wait.h>
21*53b381b3SDavid Woodhouse #include <linux/bio.h>
22*53b381b3SDavid Woodhouse #include <linux/slab.h>
23*53b381b3SDavid Woodhouse #include <linux/buffer_head.h>
24*53b381b3SDavid Woodhouse #include <linux/blkdev.h>
25*53b381b3SDavid Woodhouse #include <linux/random.h>
26*53b381b3SDavid Woodhouse #include <linux/iocontext.h>
27*53b381b3SDavid Woodhouse #include <linux/capability.h>
28*53b381b3SDavid Woodhouse #include <linux/ratelimit.h>
29*53b381b3SDavid Woodhouse #include <linux/kthread.h>
30*53b381b3SDavid Woodhouse #include <linux/raid/pq.h>
31*53b381b3SDavid Woodhouse #include <linux/hash.h>
32*53b381b3SDavid Woodhouse #include <linux/list_sort.h>
33*53b381b3SDavid Woodhouse #include <linux/raid/xor.h>
34*53b381b3SDavid Woodhouse #include <asm/div64.h>
35*53b381b3SDavid Woodhouse #include "compat.h"
36*53b381b3SDavid Woodhouse #include "ctree.h"
37*53b381b3SDavid Woodhouse #include "extent_map.h"
38*53b381b3SDavid Woodhouse #include "disk-io.h"
39*53b381b3SDavid Woodhouse #include "transaction.h"
40*53b381b3SDavid Woodhouse #include "print-tree.h"
41*53b381b3SDavid Woodhouse #include "volumes.h"
42*53b381b3SDavid Woodhouse #include "raid56.h"
43*53b381b3SDavid Woodhouse #include "async-thread.h"
44*53b381b3SDavid Woodhouse #include "check-integrity.h"
45*53b381b3SDavid Woodhouse #include "rcu-string.h"
46*53b381b3SDavid Woodhouse 
47*53b381b3SDavid Woodhouse /* set when additional merges to this rbio are not allowed */
48*53b381b3SDavid Woodhouse #define RBIO_RMW_LOCKED_BIT	1
49*53b381b3SDavid Woodhouse 
50*53b381b3SDavid Woodhouse struct btrfs_raid_bio {
51*53b381b3SDavid Woodhouse 	struct btrfs_fs_info *fs_info;
52*53b381b3SDavid Woodhouse 	struct btrfs_bio *bbio;
53*53b381b3SDavid Woodhouse 
54*53b381b3SDavid Woodhouse 	/*
55*53b381b3SDavid Woodhouse 	 * logical block numbers for the start of each stripe
56*53b381b3SDavid Woodhouse 	 * The last one or two are p/q.  These are sorted,
57*53b381b3SDavid Woodhouse 	 * so raid_map[0] is the start of our full stripe
58*53b381b3SDavid Woodhouse 	 */
59*53b381b3SDavid Woodhouse 	u64 *raid_map;
60*53b381b3SDavid Woodhouse 
61*53b381b3SDavid Woodhouse 	/* while we're doing rmw on a stripe
62*53b381b3SDavid Woodhouse 	 * we put it into a hash table so we can
63*53b381b3SDavid Woodhouse 	 * lock the stripe and merge more rbios
64*53b381b3SDavid Woodhouse 	 * into it.
65*53b381b3SDavid Woodhouse 	 */
66*53b381b3SDavid Woodhouse 	struct list_head hash_list;
67*53b381b3SDavid Woodhouse 
68*53b381b3SDavid Woodhouse 	/*
69*53b381b3SDavid Woodhouse 	 * for scheduling work in the helper threads
70*53b381b3SDavid Woodhouse 	 */
71*53b381b3SDavid Woodhouse 	struct btrfs_work work;
72*53b381b3SDavid Woodhouse 
73*53b381b3SDavid Woodhouse 	/*
74*53b381b3SDavid Woodhouse 	 * bio list and bio_list_lock are used
75*53b381b3SDavid Woodhouse 	 * to add more bios into the stripe
76*53b381b3SDavid Woodhouse 	 * in hopes of avoiding the full rmw
77*53b381b3SDavid Woodhouse 	 */
78*53b381b3SDavid Woodhouse 	struct bio_list bio_list;
79*53b381b3SDavid Woodhouse 	spinlock_t bio_list_lock;
80*53b381b3SDavid Woodhouse 
81*53b381b3SDavid Woodhouse 	/*
82*53b381b3SDavid Woodhouse 	 * also protected by the bio_list_lock, the
83*53b381b3SDavid Woodhouse 	 * stripe locking code uses plug_list to hand off
84*53b381b3SDavid Woodhouse 	 * the stripe lock to the next pending IO
85*53b381b3SDavid Woodhouse 	 */
86*53b381b3SDavid Woodhouse 	struct list_head plug_list;
87*53b381b3SDavid Woodhouse 
88*53b381b3SDavid Woodhouse 	/*
89*53b381b3SDavid Woodhouse 	 * flags that tell us if it is safe to
90*53b381b3SDavid Woodhouse 	 * merge with this bio
91*53b381b3SDavid Woodhouse 	 */
92*53b381b3SDavid Woodhouse 	unsigned long flags;
93*53b381b3SDavid Woodhouse 
94*53b381b3SDavid Woodhouse 	/* size of each individual stripe on disk */
95*53b381b3SDavid Woodhouse 	int stripe_len;
96*53b381b3SDavid Woodhouse 
97*53b381b3SDavid Woodhouse 	/* number of data stripes (no p/q) */
98*53b381b3SDavid Woodhouse 	int nr_data;
99*53b381b3SDavid Woodhouse 
100*53b381b3SDavid Woodhouse 	/*
101*53b381b3SDavid Woodhouse 	 * set if we're doing a parity rebuild
102*53b381b3SDavid Woodhouse 	 * for a read from higher up, which is handled
103*53b381b3SDavid Woodhouse 	 * differently from a parity rebuild as part of
104*53b381b3SDavid Woodhouse 	 * rmw
105*53b381b3SDavid Woodhouse 	 */
106*53b381b3SDavid Woodhouse 	int read_rebuild;
107*53b381b3SDavid Woodhouse 
108*53b381b3SDavid Woodhouse 	/* first bad stripe */
109*53b381b3SDavid Woodhouse 	int faila;
110*53b381b3SDavid Woodhouse 
111*53b381b3SDavid Woodhouse 	/* second bad stripe (for raid6 use) */
112*53b381b3SDavid Woodhouse 	int failb;
113*53b381b3SDavid Woodhouse 
114*53b381b3SDavid Woodhouse 	/*
115*53b381b3SDavid Woodhouse 	 * number of pages needed to represent the full
116*53b381b3SDavid Woodhouse 	 * stripe
117*53b381b3SDavid Woodhouse 	 */
118*53b381b3SDavid Woodhouse 	int nr_pages;
119*53b381b3SDavid Woodhouse 
120*53b381b3SDavid Woodhouse 	/*
121*53b381b3SDavid Woodhouse 	 * size of all the bios in the bio_list.  This
122*53b381b3SDavid Woodhouse 	 * helps us decide if the rbio maps to a full
123*53b381b3SDavid Woodhouse 	 * stripe or not
124*53b381b3SDavid Woodhouse 	 */
125*53b381b3SDavid Woodhouse 	int bio_list_bytes;
126*53b381b3SDavid Woodhouse 
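	/* number of references held on this rbio */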
127*53b381b3SDavid Woodhouse 	atomic_t refs;
128*53b381b3SDavid Woodhouse 
129*53b381b3SDavid Woodhouse 	/*
130*53b381b3SDavid Woodhouse 	 * these are two arrays of pointers.  We allocate the
131*53b381b3SDavid Woodhouse 	 * rbio big enough to hold them both and set up their
132*53b381b3SDavid Woodhouse 	 * locations when the rbio is allocated
133*53b381b3SDavid Woodhouse 	 */
134*53b381b3SDavid Woodhouse 
135*53b381b3SDavid Woodhouse 	/* pointers to pages that we allocated for
136*53b381b3SDavid Woodhouse 	 * reading/writing stripes directly from the disk (including P/Q)
137*53b381b3SDavid Woodhouse 	 */
138*53b381b3SDavid Woodhouse 	struct page **stripe_pages;
139*53b381b3SDavid Woodhouse 
140*53b381b3SDavid Woodhouse 	/*
141*53b381b3SDavid Woodhouse 	 * pointers to the pages in the bio_list.  Stored
142*53b381b3SDavid Woodhouse 	 * here for faster lookup
143*53b381b3SDavid Woodhouse 	 */
144*53b381b3SDavid Woodhouse 	struct page **bio_pages;
145*53b381b3SDavid Woodhouse };
146*53b381b3SDavid Woodhouse 
147*53b381b3SDavid Woodhouse static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
148*53b381b3SDavid Woodhouse static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
149*53b381b3SDavid Woodhouse static void rmw_work(struct btrfs_work *work);
150*53b381b3SDavid Woodhouse static void read_rebuild_work(struct btrfs_work *work);
151*53b381b3SDavid Woodhouse static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
152*53b381b3SDavid Woodhouse static void async_read_rebuild(struct btrfs_raid_bio *rbio);
153*53b381b3SDavid Woodhouse static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
154*53b381b3SDavid Woodhouse static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
155*53b381b3SDavid Woodhouse static void __free_raid_bio(struct btrfs_raid_bio *rbio);
156*53b381b3SDavid Woodhouse static void index_rbio_pages(struct btrfs_raid_bio *rbio);
157*53b381b3SDavid Woodhouse static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
158*53b381b3SDavid Woodhouse 
159*53b381b3SDavid Woodhouse /*
160*53b381b3SDavid Woodhouse  * the stripe hash table is used for locking, and to collect
161*53b381b3SDavid Woodhouse  * bios in hopes of making a full stripe
162*53b381b3SDavid Woodhouse  */
163*53b381b3SDavid Woodhouse int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
164*53b381b3SDavid Woodhouse {
165*53b381b3SDavid Woodhouse 	struct btrfs_stripe_hash_table *table;
166*53b381b3SDavid Woodhouse 	struct btrfs_stripe_hash_table *x;
167*53b381b3SDavid Woodhouse 	struct btrfs_stripe_hash *cur;
168*53b381b3SDavid Woodhouse 	struct btrfs_stripe_hash *h;
169*53b381b3SDavid Woodhouse 	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
170*53b381b3SDavid Woodhouse 	int i;
171*53b381b3SDavid Woodhouse 
172*53b381b3SDavid Woodhouse 	if (info->stripe_hash_table)
173*53b381b3SDavid Woodhouse 		return 0;
174*53b381b3SDavid Woodhouse 
175*53b381b3SDavid Woodhouse 	table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
176*53b381b3SDavid Woodhouse 	if (!table)
177*53b381b3SDavid Woodhouse 		return -ENOMEM;
178*53b381b3SDavid Woodhouse 
179*53b381b3SDavid Woodhouse 	table->table = (void *)(table + 1);
180*53b381b3SDavid Woodhouse 	h = table->table;
181*53b381b3SDavid Woodhouse 
182*53b381b3SDavid Woodhouse 	for (i = 0; i < num_entries; i++) {
183*53b381b3SDavid Woodhouse 		cur = h + i;
184*53b381b3SDavid Woodhouse 		INIT_LIST_HEAD(&cur->hash_list);
185*53b381b3SDavid Woodhouse 		spin_lock_init(&cur->lock);
186*53b381b3SDavid Woodhouse 		init_waitqueue_head(&cur->wait);
187*53b381b3SDavid Woodhouse 	}
188*53b381b3SDavid Woodhouse 
189*53b381b3SDavid Woodhouse 	x = cmpxchg(&info->stripe_hash_table, NULL, table);
190*53b381b3SDavid Woodhouse 	if (x)
191*53b381b3SDavid Woodhouse 		kfree(table); /* lost the race; free the table we allocated */
192*53b381b3SDavid Woodhouse 	return 0;
193*53b381b3SDavid Woodhouse }
194*53b381b3SDavid Woodhouse 
195*53b381b3SDavid Woodhouse /*
196*53b381b3SDavid Woodhouse  * we hash on the first logical address of the stripe
197*53b381b3SDavid Woodhouse  */
198*53b381b3SDavid Woodhouse static int rbio_bucket(struct btrfs_raid_bio *rbio)
199*53b381b3SDavid Woodhouse {
200*53b381b3SDavid Woodhouse 	u64 num = rbio->raid_map[0];
201*53b381b3SDavid Woodhouse 
202*53b381b3SDavid Woodhouse 	/*
203*53b381b3SDavid Woodhouse 	 * we shift down quite a bit.  We're using byte
204*53b381b3SDavid Woodhouse 	 * addressing, and most of the lower bits are zeros.
205*53b381b3SDavid Woodhouse 	 * This tends to upset hash_64, and it consistently
206*53b381b3SDavid Woodhouse 	 * returns just one or two different values.
207*53b381b3SDavid Woodhouse 	 *
208*53b381b3SDavid Woodhouse 	 * shifting off the lower bits fixes things.
209*53b381b3SDavid Woodhouse 	 */
210*53b381b3SDavid Woodhouse 	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
211*53b381b3SDavid Woodhouse }
212*53b381b3SDavid Woodhouse 
213*53b381b3SDavid Woodhouse /*
214*53b381b3SDavid Woodhouse  * merging means we take the bio_list from the victim and
215*53b381b3SDavid Woodhouse  * splice it into the destination.  The victim should
216*53b381b3SDavid Woodhouse  * be discarded afterwards.
217*53b381b3SDavid Woodhouse  *
218*53b381b3SDavid Woodhouse  * must be called with dest->bio_list_lock held
219*53b381b3SDavid Woodhouse  */
220*53b381b3SDavid Woodhouse static void merge_rbio(struct btrfs_raid_bio *dest,
221*53b381b3SDavid Woodhouse 		       struct btrfs_raid_bio *victim)
222*53b381b3SDavid Woodhouse {
223*53b381b3SDavid Woodhouse 	bio_list_merge(&dest->bio_list, &victim->bio_list);
224*53b381b3SDavid Woodhouse 	dest->bio_list_bytes += victim->bio_list_bytes;
225*53b381b3SDavid Woodhouse 	bio_list_init(&victim->bio_list);
226*53b381b3SDavid Woodhouse }
227*53b381b3SDavid Woodhouse 
228*53b381b3SDavid Woodhouse /*
229*53b381b3SDavid Woodhouse  * free the hash table; called at unmount time
230*53b381b3SDavid Woodhouse  */
231*53b381b3SDavid Woodhouse void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
232*53b381b3SDavid Woodhouse {
233*53b381b3SDavid Woodhouse 	if (!info->stripe_hash_table)
234*53b381b3SDavid Woodhouse 		return;
235*53b381b3SDavid Woodhouse 	kfree(info->stripe_hash_table);
236*53b381b3SDavid Woodhouse 	info->stripe_hash_table = NULL;
237*53b381b3SDavid Woodhouse }
238*53b381b3SDavid Woodhouse 
239*53b381b3SDavid Woodhouse /*
240*53b381b3SDavid Woodhouse  * helper function to run the xor_blocks api.  It is only
241*53b381b3SDavid Woodhouse  * able to do MAX_XOR_BLOCKS at a time, so we need to
242*53b381b3SDavid Woodhouse  * loop through.
243*53b381b3SDavid Woodhouse  */
244*53b381b3SDavid Woodhouse static void run_xor(void **pages, int src_cnt, ssize_t len)
245*53b381b3SDavid Woodhouse {
246*53b381b3SDavid Woodhouse 	int src_off = 0;
247*53b381b3SDavid Woodhouse 	int xor_src_cnt = 0;
248*53b381b3SDavid Woodhouse 	void *dest = pages[src_cnt];
249*53b381b3SDavid Woodhouse 
250*53b381b3SDavid Woodhouse 	while (src_cnt > 0) {
251*53b381b3SDavid Woodhouse 		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
252*53b381b3SDavid Woodhouse 		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
253*53b381b3SDavid Woodhouse 
254*53b381b3SDavid Woodhouse 		src_cnt -= xor_src_cnt;
255*53b381b3SDavid Woodhouse 		src_off += xor_src_cnt;
256*53b381b3SDavid Woodhouse 	}
257*53b381b3SDavid Woodhouse }
258*53b381b3SDavid Woodhouse 
259*53b381b3SDavid Woodhouse /*
260*53b381b3SDavid Woodhouse  * returns true if the bio list inside this rbio
261*53b381b3SDavid Woodhouse  * covers an entire stripe (no rmw required).
262*53b381b3SDavid Woodhouse  * Must be called with the bio list lock held, or
263*53b381b3SDavid Woodhouse  * at a time when you know it is impossible to add
264*53b381b3SDavid Woodhouse  * new bios into the list
265*53b381b3SDavid Woodhouse  */
266*53b381b3SDavid Woodhouse static int __rbio_is_full(struct btrfs_raid_bio *rbio)
267*53b381b3SDavid Woodhouse {
268*53b381b3SDavid Woodhouse 	unsigned long size = rbio->bio_list_bytes;
269*53b381b3SDavid Woodhouse 	int ret = 1;
270*53b381b3SDavid Woodhouse 
271*53b381b3SDavid Woodhouse 	if (size != rbio->nr_data * rbio->stripe_len)
272*53b381b3SDavid Woodhouse 		ret = 0;
273*53b381b3SDavid Woodhouse 
274*53b381b3SDavid Woodhouse 	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
275*53b381b3SDavid Woodhouse 	return ret;
276*53b381b3SDavid Woodhouse }
277*53b381b3SDavid Woodhouse 
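/*
 * same as __rbio_is_full, but grabs the bio_list_lock for the caller
 */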
278*53b381b3SDavid Woodhouse static int rbio_is_full(struct btrfs_raid_bio *rbio)
279*53b381b3SDavid Woodhouse {
280*53b381b3SDavid Woodhouse 	unsigned long flags;
281*53b381b3SDavid Woodhouse 	int ret;
282*53b381b3SDavid Woodhouse 
283*53b381b3SDavid Woodhouse 	spin_lock_irqsave(&rbio->bio_list_lock, flags);
284*53b381b3SDavid Woodhouse 	ret = __rbio_is_full(rbio);
285*53b381b3SDavid Woodhouse 	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
286*53b381b3SDavid Woodhouse 	return ret;
287*53b381b3SDavid Woodhouse }
288*53b381b3SDavid Woodhouse 
289*53b381b3SDavid Woodhouse /*
290*53b381b3SDavid Woodhouse  * returns 1 if it is safe to merge two rbios together.
291*53b381b3SDavid Woodhouse  * The merging is safe if the two rbios correspond to
292*53b381b3SDavid Woodhouse  * the same stripe and if they are both going in the same
293*53b381b3SDavid Woodhouse  * direction (read vs write), and if neither one is
294*53b381b3SDavid Woodhouse  * locked for final IO
295*53b381b3SDavid Woodhouse  *
296*53b381b3SDavid Woodhouse  * The caller is responsible for locking such that
297*53b381b3SDavid Woodhouse  * rmw_locked is safe to test
298*53b381b3SDavid Woodhouse  */
299*53b381b3SDavid Woodhouse static int rbio_can_merge(struct btrfs_raid_bio *last,
300*53b381b3SDavid Woodhouse 			  struct btrfs_raid_bio *cur)
301*53b381b3SDavid Woodhouse {
302*53b381b3SDavid Woodhouse 	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
303*53b381b3SDavid Woodhouse 	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
304*53b381b3SDavid Woodhouse 		return 0;
305*53b381b3SDavid Woodhouse 
306*53b381b3SDavid Woodhouse 	if (last->raid_map[0] !=
307*53b381b3SDavid Woodhouse 	    cur->raid_map[0])
308*53b381b3SDavid Woodhouse 		return 0;
309*53b381b3SDavid Woodhouse 
310*53b381b3SDavid Woodhouse 	/* reads can't merge with writes */
311*53b381b3SDavid Woodhouse 	if (last->read_rebuild !=
312*53b381b3SDavid Woodhouse 	    cur->read_rebuild) {
313*53b381b3SDavid Woodhouse 		return 0;
314*53b381b3SDavid Woodhouse 	}
315*53b381b3SDavid Woodhouse 
316*53b381b3SDavid Woodhouse 	return 1;
317*53b381b3SDavid Woodhouse }
318*53b381b3SDavid Woodhouse 
319*53b381b3SDavid Woodhouse /*
320*53b381b3SDavid Woodhouse  * helper to index into the pstripe
321*53b381b3SDavid Woodhouse  */
322*53b381b3SDavid Woodhouse static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
323*53b381b3SDavid Woodhouse {
324*53b381b3SDavid Woodhouse 	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
325*53b381b3SDavid Woodhouse 	return rbio->stripe_pages[index];
326*53b381b3SDavid Woodhouse }
327*53b381b3SDavid Woodhouse 
328*53b381b3SDavid Woodhouse /*
329*53b381b3SDavid Woodhouse  * helper to index into the qstripe, returns null
330*53b381b3SDavid Woodhouse  * if there is no qstripe
331*53b381b3SDavid Woodhouse  */
332*53b381b3SDavid Woodhouse static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
333*53b381b3SDavid Woodhouse {
334*53b381b3SDavid Woodhouse 	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
335*53b381b3SDavid Woodhouse 		return NULL;
336*53b381b3SDavid Woodhouse 
337*53b381b3SDavid Woodhouse 	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
338*53b381b3SDavid Woodhouse 		PAGE_CACHE_SHIFT;
339*53b381b3SDavid Woodhouse 	return rbio->stripe_pages[index];
340*53b381b3SDavid Woodhouse }
341*53b381b3SDavid Woodhouse 
342*53b381b3SDavid Woodhouse /*
343*53b381b3SDavid Woodhouse  * The first stripe in the table for a logical address
344*53b381b3SDavid Woodhouse  * has the lock.  rbios are added in one of three ways:
345*53b381b3SDavid Woodhouse  *
346*53b381b3SDavid Woodhouse  * 1) Nobody has the stripe locked yet.  The rbio is given
347*53b381b3SDavid Woodhouse  * the lock and 0 is returned.  The caller must start the IO
348*53b381b3SDavid Woodhouse  * themselves.
349*53b381b3SDavid Woodhouse  *
350*53b381b3SDavid Woodhouse  * 2) Someone has the stripe locked, but we're able to merge
351*53b381b3SDavid Woodhouse  * with the lock owner.  The rbio is freed and the IO will
352*53b381b3SDavid Woodhouse  * start automatically along with the existing rbio.  1 is returned.
353*53b381b3SDavid Woodhouse  *
354*53b381b3SDavid Woodhouse  * 3) Someone has the stripe locked, but we're not able to merge.
355*53b381b3SDavid Woodhouse  * The rbio is added to the lock owner's plug list, or merged into
356*53b381b3SDavid Woodhouse  * an rbio already on the plug list.  When the lock owner unlocks,
357*53b381b3SDavid Woodhouse  * the next rbio on the list is run and the IO is started automatically.
358*53b381b3SDavid Woodhouse  * 1 is returned
359*53b381b3SDavid Woodhouse  *
360*53b381b3SDavid Woodhouse  * If we return 0, the caller still owns the rbio and must continue with
361*53b381b3SDavid Woodhouse  * IO submission.  If we return 1, the caller must assume the rbio has
362*53b381b3SDavid Woodhouse  * already been freed.
363*53b381b3SDavid Woodhouse  */
364*53b381b3SDavid Woodhouse static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
365*53b381b3SDavid Woodhouse {
366*53b381b3SDavid Woodhouse 	int bucket = rbio_bucket(rbio);
367*53b381b3SDavid Woodhouse 	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
368*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *cur;
369*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *pending;
370*53b381b3SDavid Woodhouse 	unsigned long flags;
371*53b381b3SDavid Woodhouse 	DEFINE_WAIT(wait);
372*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *freeit = NULL;
373*53b381b3SDavid Woodhouse 	int ret = 0;
374*53b381b3SDavid Woodhouse 	int walk = 0;
375*53b381b3SDavid Woodhouse 
376*53b381b3SDavid Woodhouse 	spin_lock_irqsave(&h->lock, flags);
377*53b381b3SDavid Woodhouse 	list_for_each_entry(cur, &h->hash_list, hash_list) {
378*53b381b3SDavid Woodhouse 		walk++;
379*53b381b3SDavid Woodhouse 		if (cur->raid_map[0] == rbio->raid_map[0]) {
380*53b381b3SDavid Woodhouse 			spin_lock(&cur->bio_list_lock);
381*53b381b3SDavid Woodhouse 
382*53b381b3SDavid Woodhouse 			/* can we merge into the lock owner? */
383*53b381b3SDavid Woodhouse 			if (rbio_can_merge(cur, rbio)) {
384*53b381b3SDavid Woodhouse 				merge_rbio(cur, rbio);
385*53b381b3SDavid Woodhouse 				spin_unlock(&cur->bio_list_lock);
386*53b381b3SDavid Woodhouse 				freeit = rbio;
387*53b381b3SDavid Woodhouse 				ret = 1;
388*53b381b3SDavid Woodhouse 				goto out;
389*53b381b3SDavid Woodhouse 			}
390*53b381b3SDavid Woodhouse 
391*53b381b3SDavid Woodhouse 			/*
392*53b381b3SDavid Woodhouse 			 * we couldn't merge with the running
393*53b381b3SDavid Woodhouse 			 * rbio, see if we can merge with the
394*53b381b3SDavid Woodhouse 			 * pending ones.  We don't have to
395*53b381b3SDavid Woodhouse 			 * check for rmw_locked because there
396*53b381b3SDavid Woodhouse 			 * is no way they are inside finish_rmw
397*53b381b3SDavid Woodhouse 			 * right now
398*53b381b3SDavid Woodhouse 			 */
399*53b381b3SDavid Woodhouse 			list_for_each_entry(pending, &cur->plug_list,
400*53b381b3SDavid Woodhouse 					    plug_list) {
401*53b381b3SDavid Woodhouse 				if (rbio_can_merge(pending, rbio)) {
402*53b381b3SDavid Woodhouse 					merge_rbio(pending, rbio);
403*53b381b3SDavid Woodhouse 					spin_unlock(&cur->bio_list_lock);
404*53b381b3SDavid Woodhouse 					freeit = rbio;
405*53b381b3SDavid Woodhouse 					ret = 1;
406*53b381b3SDavid Woodhouse 					goto out;
407*53b381b3SDavid Woodhouse 				}
408*53b381b3SDavid Woodhouse 			}
409*53b381b3SDavid Woodhouse 
410*53b381b3SDavid Woodhouse 			/* no merging, put us on the tail of the plug list,
411*53b381b3SDavid Woodhouse 			 * our rbio will be started when the currently
412*53b381b3SDavid Woodhouse 			 * running rbio unlocks
413*53b381b3SDavid Woodhouse 			 */
414*53b381b3SDavid Woodhouse 			list_add_tail(&rbio->plug_list, &cur->plug_list);
415*53b381b3SDavid Woodhouse 			spin_unlock(&cur->bio_list_lock);
416*53b381b3SDavid Woodhouse 			ret = 1;
417*53b381b3SDavid Woodhouse 			goto out;
418*53b381b3SDavid Woodhouse 		}
419*53b381b3SDavid Woodhouse 	}
420*53b381b3SDavid Woodhouse 
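	/*
	 * nobody had the stripe locked yet.  Take the lock by adding
	 * ourselves to the hash list; the caller must start the IO
	 */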
421*53b381b3SDavid Woodhouse 	atomic_inc(&rbio->refs);
422*53b381b3SDavid Woodhouse 	list_add(&rbio->hash_list, &h->hash_list);
423*53b381b3SDavid Woodhouse out:
424*53b381b3SDavid Woodhouse 	spin_unlock_irqrestore(&h->lock, flags);
425*53b381b3SDavid Woodhouse 	if (freeit)
426*53b381b3SDavid Woodhouse 		__free_raid_bio(freeit);
427*53b381b3SDavid Woodhouse 	return ret;
428*53b381b3SDavid Woodhouse }
429*53b381b3SDavid Woodhouse 
430*53b381b3SDavid Woodhouse /*
431*53b381b3SDavid Woodhouse  * called as rmw or parity rebuild is completed.  If the plug list has more
432*53b381b3SDavid Woodhouse  * called once rmw or parity rebuild is completed.  If the plug list has more
433*53b381b3SDavid Woodhouse  */
434*53b381b3SDavid Woodhouse static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
435*53b381b3SDavid Woodhouse {
436*53b381b3SDavid Woodhouse 	int bucket;
437*53b381b3SDavid Woodhouse 	struct btrfs_stripe_hash *h;
438*53b381b3SDavid Woodhouse 	unsigned long flags;
439*53b381b3SDavid Woodhouse 
440*53b381b3SDavid Woodhouse 	bucket = rbio_bucket(rbio);
441*53b381b3SDavid Woodhouse 	h = rbio->fs_info->stripe_hash_table->table + bucket;
442*53b381b3SDavid Woodhouse 
443*53b381b3SDavid Woodhouse 	spin_lock_irqsave(&h->lock, flags);
444*53b381b3SDavid Woodhouse 	spin_lock(&rbio->bio_list_lock);
445*53b381b3SDavid Woodhouse 
446*53b381b3SDavid Woodhouse 	if (!list_empty(&rbio->hash_list)) {
448*53b381b3SDavid Woodhouse 		list_del_init(&rbio->hash_list);
449*53b381b3SDavid Woodhouse 		atomic_dec(&rbio->refs);
450*53b381b3SDavid Woodhouse 
451*53b381b3SDavid Woodhouse 		/*
452*53b381b3SDavid Woodhouse 		 * we use the plug list to hold all the rbios
453*53b381b3SDavid Woodhouse 		 * waiting for the chance to lock this stripe.
454*53b381b3SDavid Woodhouse 		 * hand the lock over to one of them.
455*53b381b3SDavid Woodhouse 		 */
456*53b381b3SDavid Woodhouse 		if (!list_empty(&rbio->plug_list)) {
457*53b381b3SDavid Woodhouse 			struct btrfs_raid_bio *next;
458*53b381b3SDavid Woodhouse 			struct list_head *head = rbio->plug_list.next;
459*53b381b3SDavid Woodhouse 
460*53b381b3SDavid Woodhouse 			next = list_entry(head, struct btrfs_raid_bio,
461*53b381b3SDavid Woodhouse 					  plug_list);
462*53b381b3SDavid Woodhouse 
463*53b381b3SDavid Woodhouse 			list_del_init(&rbio->plug_list);
464*53b381b3SDavid Woodhouse 
465*53b381b3SDavid Woodhouse 			list_add(&next->hash_list, &h->hash_list);
466*53b381b3SDavid Woodhouse 			atomic_inc(&next->refs);
467*53b381b3SDavid Woodhouse 			spin_unlock(&rbio->bio_list_lock);
468*53b381b3SDavid Woodhouse 			spin_unlock_irqrestore(&h->lock, flags);
469*53b381b3SDavid Woodhouse 
470*53b381b3SDavid Woodhouse 			if (next->read_rebuild)
471*53b381b3SDavid Woodhouse 				async_read_rebuild(next);
472*53b381b3SDavid Woodhouse 			else
473*53b381b3SDavid Woodhouse 				async_rmw_stripe(next);
474*53b381b3SDavid Woodhouse 
475*53b381b3SDavid Woodhouse 			goto done_nolock;
476*53b381b3SDavid Woodhouse 
477*53b381b3SDavid Woodhouse 		} else if (waitqueue_active(&h->wait)) {
478*53b381b3SDavid Woodhouse 			spin_unlock(&rbio->bio_list_lock);
479*53b381b3SDavid Woodhouse 			spin_unlock_irqrestore(&h->lock, flags);
480*53b381b3SDavid Woodhouse 			wake_up(&h->wait);
481*53b381b3SDavid Woodhouse 			goto done_nolock;
482*53b381b3SDavid Woodhouse 		}
483*53b381b3SDavid Woodhouse 	}
484*53b381b3SDavid Woodhouse 	spin_unlock(&rbio->bio_list_lock);
485*53b381b3SDavid Woodhouse 	spin_unlock_irqrestore(&h->lock, flags);
486*53b381b3SDavid Woodhouse 
487*53b381b3SDavid Woodhouse done_nolock:
488*53b381b3SDavid Woodhouse 	return;
489*53b381b3SDavid Woodhouse }
490*53b381b3SDavid Woodhouse 
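/*
 * drop a reference on the rbio.  The final put frees the stripe pages,
 * the raid_map, the bbio and the rbio itself
 */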
491*53b381b3SDavid Woodhouse static void __free_raid_bio(struct btrfs_raid_bio *rbio)
492*53b381b3SDavid Woodhouse {
493*53b381b3SDavid Woodhouse 	int i;
494*53b381b3SDavid Woodhouse 
495*53b381b3SDavid Woodhouse 	WARN_ON(atomic_read(&rbio->refs) < 0);
496*53b381b3SDavid Woodhouse 	if (!atomic_dec_and_test(&rbio->refs))
497*53b381b3SDavid Woodhouse 		return;
498*53b381b3SDavid Woodhouse 
499*53b381b3SDavid Woodhouse 	WARN_ON(!list_empty(&rbio->hash_list));
500*53b381b3SDavid Woodhouse 	WARN_ON(!bio_list_empty(&rbio->bio_list));
501*53b381b3SDavid Woodhouse 
502*53b381b3SDavid Woodhouse 	for (i = 0; i < rbio->nr_pages; i++) {
503*53b381b3SDavid Woodhouse 		if (rbio->stripe_pages[i]) {
504*53b381b3SDavid Woodhouse 			__free_page(rbio->stripe_pages[i]);
505*53b381b3SDavid Woodhouse 			rbio->stripe_pages[i] = NULL;
506*53b381b3SDavid Woodhouse 		}
507*53b381b3SDavid Woodhouse 	}
508*53b381b3SDavid Woodhouse 	kfree(rbio->raid_map);
509*53b381b3SDavid Woodhouse 	kfree(rbio->bbio);
510*53b381b3SDavid Woodhouse 	kfree(rbio);
511*53b381b3SDavid Woodhouse }
512*53b381b3SDavid Woodhouse 
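/*
 * release the stripe lock (handing it to the next waiter if there is one)
 * and then drop our reference
 */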
513*53b381b3SDavid Woodhouse static void free_raid_bio(struct btrfs_raid_bio *rbio)
514*53b381b3SDavid Woodhouse {
515*53b381b3SDavid Woodhouse 	unlock_stripe(rbio);
516*53b381b3SDavid Woodhouse 	__free_raid_bio(rbio);
517*53b381b3SDavid Woodhouse }
518*53b381b3SDavid Woodhouse 
519*53b381b3SDavid Woodhouse /*
520*53b381b3SDavid Woodhouse  * this frees the rbio and runs through all the bios in the
521*53b381b3SDavid Woodhouse  * bio_list and calls end_io on them
522*53b381b3SDavid Woodhouse  */
523*53b381b3SDavid Woodhouse static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
524*53b381b3SDavid Woodhouse {
525*53b381b3SDavid Woodhouse 	struct bio *cur = bio_list_get(&rbio->bio_list);
526*53b381b3SDavid Woodhouse 	struct bio *next;
527*53b381b3SDavid Woodhouse 	free_raid_bio(rbio);
528*53b381b3SDavid Woodhouse 
529*53b381b3SDavid Woodhouse 	while (cur) {
530*53b381b3SDavid Woodhouse 		next = cur->bi_next;
531*53b381b3SDavid Woodhouse 		cur->bi_next = NULL;
532*53b381b3SDavid Woodhouse 		if (uptodate)
533*53b381b3SDavid Woodhouse 			set_bit(BIO_UPTODATE, &cur->bi_flags);
534*53b381b3SDavid Woodhouse 		bio_endio(cur, err);
535*53b381b3SDavid Woodhouse 		cur = next;
536*53b381b3SDavid Woodhouse 	}
537*53b381b3SDavid Woodhouse }
538*53b381b3SDavid Woodhouse 
539*53b381b3SDavid Woodhouse /*
540*53b381b3SDavid Woodhouse  * end io function used by finish_rmw.  When we finally
541*53b381b3SDavid Woodhouse  * get here, we've written a full stripe
542*53b381b3SDavid Woodhouse  */
543*53b381b3SDavid Woodhouse static void raid_write_end_io(struct bio *bio, int err)
544*53b381b3SDavid Woodhouse {
545*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio = bio->bi_private;
546*53b381b3SDavid Woodhouse 
547*53b381b3SDavid Woodhouse 	if (err)
548*53b381b3SDavid Woodhouse 		fail_bio_stripe(rbio, bio);
549*53b381b3SDavid Woodhouse 
550*53b381b3SDavid Woodhouse 	bio_put(bio);
551*53b381b3SDavid Woodhouse 
552*53b381b3SDavid Woodhouse 	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
553*53b381b3SDavid Woodhouse 		return;
554*53b381b3SDavid Woodhouse 
555*53b381b3SDavid Woodhouse 	err = 0;
556*53b381b3SDavid Woodhouse 
557*53b381b3SDavid Woodhouse 	/* OK, we have finished writing all the stripes we need to. */
558*53b381b3SDavid Woodhouse 	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
559*53b381b3SDavid Woodhouse 		err = -EIO;
560*53b381b3SDavid Woodhouse 
561*53b381b3SDavid Woodhouse 	rbio_orig_end_io(rbio, err, 0);
562*53b381b3SDavid Woodhouse 	return;
563*53b381b3SDavid Woodhouse }
564*53b381b3SDavid Woodhouse 
565*53b381b3SDavid Woodhouse /*
566*53b381b3SDavid Woodhouse  * the read/modify/write code wants to use the original bio for
567*53b381b3SDavid Woodhouse  * any pages it included, and then use the rbio for everything
568*53b381b3SDavid Woodhouse  * else.  This function decides if a given index (stripe number)
569*53b381b3SDavid Woodhouse  * and page number in that stripe fall inside the original bio
570*53b381b3SDavid Woodhouse  * or the rbio.
571*53b381b3SDavid Woodhouse  *
572*53b381b3SDavid Woodhouse  * if you set bio_list_only, you'll get a NULL back for any ranges
573*53b381b3SDavid Woodhouse  * that are outside the bio_list
574*53b381b3SDavid Woodhouse  *
575*53b381b3SDavid Woodhouse  * This doesn't take any refs on anything, you get a bare page pointer
576*53b381b3SDavid Woodhouse  * and the caller must bump refs as required.
577*53b381b3SDavid Woodhouse  *
578*53b381b3SDavid Woodhouse  * You must call index_rbio_pages once before you can trust
579*53b381b3SDavid Woodhouse  * the answers from this function.
580*53b381b3SDavid Woodhouse  */
581*53b381b3SDavid Woodhouse static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
582*53b381b3SDavid Woodhouse 				 int index, int pagenr, int bio_list_only)
583*53b381b3SDavid Woodhouse {
584*53b381b3SDavid Woodhouse 	int chunk_page;
585*53b381b3SDavid Woodhouse 	struct page *p = NULL;
586*53b381b3SDavid Woodhouse 
587*53b381b3SDavid Woodhouse 	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
588*53b381b3SDavid Woodhouse 
589*53b381b3SDavid Woodhouse 	spin_lock_irq(&rbio->bio_list_lock);
590*53b381b3SDavid Woodhouse 	p = rbio->bio_pages[chunk_page];
591*53b381b3SDavid Woodhouse 	spin_unlock_irq(&rbio->bio_list_lock);
592*53b381b3SDavid Woodhouse 
593*53b381b3SDavid Woodhouse 	if (p || bio_list_only)
594*53b381b3SDavid Woodhouse 		return p;
595*53b381b3SDavid Woodhouse 
596*53b381b3SDavid Woodhouse 	return rbio->stripe_pages[chunk_page];
597*53b381b3SDavid Woodhouse }
598*53b381b3SDavid Woodhouse 
599*53b381b3SDavid Woodhouse /*
600*53b381b3SDavid Woodhouse  * number of pages we need for the entire stripe across all the
601*53b381b3SDavid Woodhouse  * drives
602*53b381b3SDavid Woodhouse  */
603*53b381b3SDavid Woodhouse static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
604*53b381b3SDavid Woodhouse {
605*53b381b3SDavid Woodhouse 	unsigned long nr = stripe_len * nr_stripes;
606*53b381b3SDavid Woodhouse 	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
607*53b381b3SDavid Woodhouse }
608*53b381b3SDavid Woodhouse 
609*53b381b3SDavid Woodhouse /*
610*53b381b3SDavid Woodhouse  * allocation and initial setup for the btrfs_raid_bio.  Note that
611*53b381b3SDavid Woodhouse  * this does not allocate any pages for rbio->stripe_pages.
612*53b381b3SDavid Woodhouse  */
613*53b381b3SDavid Woodhouse static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
614*53b381b3SDavid Woodhouse 			  struct btrfs_bio *bbio, u64 *raid_map,
615*53b381b3SDavid Woodhouse 			  u64 stripe_len)
616*53b381b3SDavid Woodhouse {
617*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio;
618*53b381b3SDavid Woodhouse 	int nr_data = 0;
619*53b381b3SDavid Woodhouse 	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
620*53b381b3SDavid Woodhouse 	void *p;
621*53b381b3SDavid Woodhouse 
622*53b381b3SDavid Woodhouse 	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
623*53b381b3SDavid Woodhouse 			GFP_NOFS);
624*53b381b3SDavid Woodhouse 	if (!rbio) {
625*53b381b3SDavid Woodhouse 		kfree(raid_map);
626*53b381b3SDavid Woodhouse 		kfree(bbio);
627*53b381b3SDavid Woodhouse 		return ERR_PTR(-ENOMEM);
628*53b381b3SDavid Woodhouse 	}
629*53b381b3SDavid Woodhouse 
630*53b381b3SDavid Woodhouse 	bio_list_init(&rbio->bio_list);
631*53b381b3SDavid Woodhouse 	INIT_LIST_HEAD(&rbio->plug_list);
632*53b381b3SDavid Woodhouse 	spin_lock_init(&rbio->bio_list_lock);
633*53b381b3SDavid Woodhouse 	INIT_LIST_HEAD(&rbio->hash_list);
634*53b381b3SDavid Woodhouse 	rbio->bbio = bbio;
635*53b381b3SDavid Woodhouse 	rbio->raid_map = raid_map;
636*53b381b3SDavid Woodhouse 	rbio->fs_info = root->fs_info;
637*53b381b3SDavid Woodhouse 	rbio->stripe_len = stripe_len;
638*53b381b3SDavid Woodhouse 	rbio->nr_pages = num_pages;
639*53b381b3SDavid Woodhouse 	rbio->faila = -1;
640*53b381b3SDavid Woodhouse 	rbio->failb = -1;
641*53b381b3SDavid Woodhouse 	atomic_set(&rbio->refs, 1);
642*53b381b3SDavid Woodhouse 
643*53b381b3SDavid Woodhouse 	/*
644*53b381b3SDavid Woodhouse 	 * the stripe_pages and bio_pages array point to the extra
645*53b381b3SDavid Woodhouse 	 * memory we allocated past the end of the rbio
646*53b381b3SDavid Woodhouse 	 */
647*53b381b3SDavid Woodhouse 	p = rbio + 1;
648*53b381b3SDavid Woodhouse 	rbio->stripe_pages = p;
649*53b381b3SDavid Woodhouse 	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
650*53b381b3SDavid Woodhouse 
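	/*
	 * the last entry in raid_map is the Q stripe only when this is
	 * raid6, so use it to figure out how many data stripes we have
	 */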
651*53b381b3SDavid Woodhouse 	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
652*53b381b3SDavid Woodhouse 		nr_data = bbio->num_stripes - 2;
653*53b381b3SDavid Woodhouse 	else
654*53b381b3SDavid Woodhouse 		nr_data = bbio->num_stripes - 1;
655*53b381b3SDavid Woodhouse 
656*53b381b3SDavid Woodhouse 	rbio->nr_data = nr_data;
657*53b381b3SDavid Woodhouse 	return rbio;
658*53b381b3SDavid Woodhouse }
659*53b381b3SDavid Woodhouse 
660*53b381b3SDavid Woodhouse /* allocate pages for all the stripes in the bio, including parity */
661*53b381b3SDavid Woodhouse static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
662*53b381b3SDavid Woodhouse {
663*53b381b3SDavid Woodhouse 	int i;
664*53b381b3SDavid Woodhouse 	struct page *page;
665*53b381b3SDavid Woodhouse 
666*53b381b3SDavid Woodhouse 	for (i = 0; i < rbio->nr_pages; i++) {
667*53b381b3SDavid Woodhouse 		if (rbio->stripe_pages[i])
668*53b381b3SDavid Woodhouse 			continue;
669*53b381b3SDavid Woodhouse 		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
670*53b381b3SDavid Woodhouse 		if (!page)
671*53b381b3SDavid Woodhouse 			return -ENOMEM;
672*53b381b3SDavid Woodhouse 		rbio->stripe_pages[i] = page;
673*53b381b3SDavid Woodhouse 		ClearPageUptodate(page);
674*53b381b3SDavid Woodhouse 	}
675*53b381b3SDavid Woodhouse 	return 0;
676*53b381b3SDavid Woodhouse }
677*53b381b3SDavid Woodhouse 
678*53b381b3SDavid Woodhouse /* allocate pages for just the p/q stripes */
679*53b381b3SDavid Woodhouse static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
680*53b381b3SDavid Woodhouse {
681*53b381b3SDavid Woodhouse 	int i;
682*53b381b3SDavid Woodhouse 	struct page *page;
683*53b381b3SDavid Woodhouse 
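	/* pages before this index belong to the data stripes */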
684*53b381b3SDavid Woodhouse 	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
685*53b381b3SDavid Woodhouse 
686*53b381b3SDavid Woodhouse 	for (; i < rbio->nr_pages; i++) {
687*53b381b3SDavid Woodhouse 		if (rbio->stripe_pages[i])
688*53b381b3SDavid Woodhouse 			continue;
689*53b381b3SDavid Woodhouse 		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
690*53b381b3SDavid Woodhouse 		if (!page)
691*53b381b3SDavid Woodhouse 			return -ENOMEM;
692*53b381b3SDavid Woodhouse 		rbio->stripe_pages[i] = page;
693*53b381b3SDavid Woodhouse 	}
694*53b381b3SDavid Woodhouse 	return 0;
695*53b381b3SDavid Woodhouse }
696*53b381b3SDavid Woodhouse 
697*53b381b3SDavid Woodhouse /*
698*53b381b3SDavid Woodhouse  * add a single page from a specific stripe into our list of bios for IO
699*53b381b3SDavid Woodhouse  * this will try to merge into existing bios if possible, and returns
700*53b381b3SDavid Woodhouse  * zero if all went well.
701*53b381b3SDavid Woodhouse  */
702*53b381b3SDavid Woodhouse int rbio_add_io_page(struct btrfs_raid_bio *rbio,
703*53b381b3SDavid Woodhouse 		     struct bio_list *bio_list,
704*53b381b3SDavid Woodhouse 		     struct page *page,
705*53b381b3SDavid Woodhouse 		     int stripe_nr,
706*53b381b3SDavid Woodhouse 		     unsigned long page_index,
707*53b381b3SDavid Woodhouse 		     unsigned long bio_max_len)
708*53b381b3SDavid Woodhouse {
709*53b381b3SDavid Woodhouse 	struct bio *last = bio_list->tail;
710*53b381b3SDavid Woodhouse 	u64 last_end = 0;
711*53b381b3SDavid Woodhouse 	int ret;
712*53b381b3SDavid Woodhouse 	struct bio *bio;
713*53b381b3SDavid Woodhouse 	struct btrfs_bio_stripe *stripe;
714*53b381b3SDavid Woodhouse 	u64 disk_start;
715*53b381b3SDavid Woodhouse 
716*53b381b3SDavid Woodhouse 	stripe = &rbio->bbio->stripes[stripe_nr];
717*53b381b3SDavid Woodhouse 	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
718*53b381b3SDavid Woodhouse 
719*53b381b3SDavid Woodhouse 	/* if the device is missing, just fail this stripe */
720*53b381b3SDavid Woodhouse 	if (!stripe->dev->bdev)
721*53b381b3SDavid Woodhouse 		return fail_rbio_index(rbio, stripe_nr);
722*53b381b3SDavid Woodhouse 
723*53b381b3SDavid Woodhouse 	/* see if we can add this page onto our existing bio */
724*53b381b3SDavid Woodhouse 	if (last) {
725*53b381b3SDavid Woodhouse 		last_end = (u64)last->bi_sector << 9;
726*53b381b3SDavid Woodhouse 		last_end += last->bi_size;
727*53b381b3SDavid Woodhouse 
728*53b381b3SDavid Woodhouse 		/*
729*53b381b3SDavid Woodhouse 		 * we can't merge these if they are from different
730*53b381b3SDavid Woodhouse 		 * devices or if they are not contiguous
731*53b381b3SDavid Woodhouse 		 */
732*53b381b3SDavid Woodhouse 		if (last_end == disk_start && stripe->dev->bdev &&
733*53b381b3SDavid Woodhouse 		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
734*53b381b3SDavid Woodhouse 		    last->bi_bdev == stripe->dev->bdev) {
735*53b381b3SDavid Woodhouse 			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
736*53b381b3SDavid Woodhouse 			if (ret == PAGE_CACHE_SIZE)
737*53b381b3SDavid Woodhouse 				return 0;
738*53b381b3SDavid Woodhouse 		}
739*53b381b3SDavid Woodhouse 	}
740*53b381b3SDavid Woodhouse 
741*53b381b3SDavid Woodhouse 	/* put a new bio on the list */
742*53b381b3SDavid Woodhouse 	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
743*53b381b3SDavid Woodhouse 	if (!bio)
744*53b381b3SDavid Woodhouse 		return -ENOMEM;
745*53b381b3SDavid Woodhouse 
746*53b381b3SDavid Woodhouse 	bio->bi_size = 0;
747*53b381b3SDavid Woodhouse 	bio->bi_bdev = stripe->dev->bdev;
748*53b381b3SDavid Woodhouse 	bio->bi_sector = disk_start >> 9;
749*53b381b3SDavid Woodhouse 	set_bit(BIO_UPTODATE, &bio->bi_flags);
750*53b381b3SDavid Woodhouse 
751*53b381b3SDavid Woodhouse 	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
752*53b381b3SDavid Woodhouse 	bio_list_add(bio_list, bio);
753*53b381b3SDavid Woodhouse 	return 0;
754*53b381b3SDavid Woodhouse }
755*53b381b3SDavid Woodhouse 
756*53b381b3SDavid Woodhouse /*
757*53b381b3SDavid Woodhouse  * while we're doing the read/modify/write cycle, we could
758*53b381b3SDavid Woodhouse  * have errors in reading pages off the disk.  This checks
759*53b381b3SDavid Woodhouse  * for errors and if we're not able to read the page it'll
760*53b381b3SDavid Woodhouse  * trigger parity reconstruction.  The rmw will be finished
761*53b381b3SDavid Woodhouse  * after we've reconstructed the failed stripes
762*53b381b3SDavid Woodhouse  */
763*53b381b3SDavid Woodhouse static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
764*53b381b3SDavid Woodhouse {
765*53b381b3SDavid Woodhouse 	if (rbio->faila >= 0 || rbio->failb >= 0) {
766*53b381b3SDavid Woodhouse 		BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
767*53b381b3SDavid Woodhouse 		__raid56_parity_recover(rbio);
768*53b381b3SDavid Woodhouse 	} else {
769*53b381b3SDavid Woodhouse 		finish_rmw(rbio);
770*53b381b3SDavid Woodhouse 	}
771*53b381b3SDavid Woodhouse }
772*53b381b3SDavid Woodhouse 
773*53b381b3SDavid Woodhouse /*
774*53b381b3SDavid Woodhouse  * these are just the pages from the rbio array, not from anything
775*53b381b3SDavid Woodhouse  * the FS sent down to us
776*53b381b3SDavid Woodhouse  */
777*53b381b3SDavid Woodhouse static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
778*53b381b3SDavid Woodhouse {
779*53b381b3SDavid Woodhouse 	int index;
780*53b381b3SDavid Woodhouse 	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
781*53b381b3SDavid Woodhouse 	index += page;
782*53b381b3SDavid Woodhouse 	return rbio->stripe_pages[index];
783*53b381b3SDavid Woodhouse }
784*53b381b3SDavid Woodhouse 
785*53b381b3SDavid Woodhouse /*
786*53b381b3SDavid Woodhouse  * helper function to walk our bio list and populate the bio_pages array with
787*53b381b3SDavid Woodhouse  * the result.  This seems expensive, but it is faster than constantly
788*53b381b3SDavid Woodhouse  * searching through the bio list as we set up the IO in finish_rmw or stripe
789*53b381b3SDavid Woodhouse  * reconstruction.
790*53b381b3SDavid Woodhouse  *
791*53b381b3SDavid Woodhouse  * This must be called before you trust the answers from page_in_rbio
792*53b381b3SDavid Woodhouse  */
793*53b381b3SDavid Woodhouse static void index_rbio_pages(struct btrfs_raid_bio *rbio)
794*53b381b3SDavid Woodhouse {
795*53b381b3SDavid Woodhouse 	struct bio *bio;
796*53b381b3SDavid Woodhouse 	u64 start;
797*53b381b3SDavid Woodhouse 	unsigned long stripe_offset;
798*53b381b3SDavid Woodhouse 	unsigned long page_index;
799*53b381b3SDavid Woodhouse 	struct page *p;
800*53b381b3SDavid Woodhouse 	int i;
801*53b381b3SDavid Woodhouse 
802*53b381b3SDavid Woodhouse 	spin_lock_irq(&rbio->bio_list_lock);
803*53b381b3SDavid Woodhouse 	bio_list_for_each(bio, &rbio->bio_list) {
804*53b381b3SDavid Woodhouse 		start = (u64)bio->bi_sector << 9;
805*53b381b3SDavid Woodhouse 		stripe_offset = start - rbio->raid_map[0];
806*53b381b3SDavid Woodhouse 		page_index = stripe_offset >> PAGE_CACHE_SHIFT;
807*53b381b3SDavid Woodhouse 
808*53b381b3SDavid Woodhouse 		for (i = 0; i < bio->bi_vcnt; i++) {
809*53b381b3SDavid Woodhouse 			p = bio->bi_io_vec[i].bv_page;
810*53b381b3SDavid Woodhouse 			rbio->bio_pages[page_index + i] = p;
811*53b381b3SDavid Woodhouse 		}
812*53b381b3SDavid Woodhouse 	}
813*53b381b3SDavid Woodhouse 	spin_unlock_irq(&rbio->bio_list_lock);
814*53b381b3SDavid Woodhouse }
815*53b381b3SDavid Woodhouse 
816*53b381b3SDavid Woodhouse /*
817*53b381b3SDavid Woodhouse  * this is called in one of two situations.  We either
818*53b381b3SDavid Woodhouse  * have a full stripe from the higher layers, or we've read all
819*53b381b3SDavid Woodhouse  * the missing bits off disk.
820*53b381b3SDavid Woodhouse  *
821*53b381b3SDavid Woodhouse  * This will calculate the parity and then send down any
822*53b381b3SDavid Woodhouse  * changed blocks.
823*53b381b3SDavid Woodhouse  */
824*53b381b3SDavid Woodhouse static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
825*53b381b3SDavid Woodhouse {
826*53b381b3SDavid Woodhouse 	struct btrfs_bio *bbio = rbio->bbio;
827*53b381b3SDavid Woodhouse 	void *pointers[bbio->num_stripes];
828*53b381b3SDavid Woodhouse 	int stripe_len = rbio->stripe_len;
829*53b381b3SDavid Woodhouse 	int nr_data = rbio->nr_data;
830*53b381b3SDavid Woodhouse 	int stripe;
831*53b381b3SDavid Woodhouse 	int pagenr;
832*53b381b3SDavid Woodhouse 	int p_stripe = -1;
833*53b381b3SDavid Woodhouse 	int q_stripe = -1;
834*53b381b3SDavid Woodhouse 	struct bio_list bio_list;
835*53b381b3SDavid Woodhouse 	struct bio *bio;
836*53b381b3SDavid Woodhouse 	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
837*53b381b3SDavid Woodhouse 	int ret;
838*53b381b3SDavid Woodhouse 
839*53b381b3SDavid Woodhouse 	bio_list_init(&bio_list);
840*53b381b3SDavid Woodhouse 
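	/* raid5 has a single parity stripe, raid6 has both p and q */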
841*53b381b3SDavid Woodhouse 	if (bbio->num_stripes - rbio->nr_data == 1) {
842*53b381b3SDavid Woodhouse 		p_stripe = bbio->num_stripes - 1;
843*53b381b3SDavid Woodhouse 	} else if (bbio->num_stripes - rbio->nr_data == 2) {
844*53b381b3SDavid Woodhouse 		p_stripe = bbio->num_stripes - 2;
845*53b381b3SDavid Woodhouse 		q_stripe = bbio->num_stripes - 1;
846*53b381b3SDavid Woodhouse 	} else {
847*53b381b3SDavid Woodhouse 		BUG();
848*53b381b3SDavid Woodhouse 	}
849*53b381b3SDavid Woodhouse 
850*53b381b3SDavid Woodhouse 	/* at this point we either have a full stripe,
851*53b381b3SDavid Woodhouse 	 * or we've read the full stripe from the drive.
852*53b381b3SDavid Woodhouse 	 * recalculate the parity and write the new results.
853*53b381b3SDavid Woodhouse 	 *
854*53b381b3SDavid Woodhouse 	 * We're not allowed to add any new bios to the
855*53b381b3SDavid Woodhouse 	 * bio list here, anyone else that wants to
856*53b381b3SDavid Woodhouse 	 * change this stripe needs to do their own rmw.
857*53b381b3SDavid Woodhouse 	 */
858*53b381b3SDavid Woodhouse 	spin_lock_irq(&rbio->bio_list_lock);
859*53b381b3SDavid Woodhouse 	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
860*53b381b3SDavid Woodhouse 	spin_unlock_irq(&rbio->bio_list_lock);
861*53b381b3SDavid Woodhouse 
862*53b381b3SDavid Woodhouse 	atomic_set(&rbio->bbio->error, 0);
863*53b381b3SDavid Woodhouse 
864*53b381b3SDavid Woodhouse 	/*
865*53b381b3SDavid Woodhouse 	 * now that we've set rmw_locked, run through the
866*53b381b3SDavid Woodhouse 	 * bio list one last time and map the page pointers
867*53b381b3SDavid Woodhouse 	 */
868*53b381b3SDavid Woodhouse 	index_rbio_pages(rbio);
869*53b381b3SDavid Woodhouse 
870*53b381b3SDavid Woodhouse 	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
871*53b381b3SDavid Woodhouse 		struct page *p;
872*53b381b3SDavid Woodhouse 		/* first collect one page from each data stripe */
873*53b381b3SDavid Woodhouse 		for (stripe = 0; stripe < nr_data; stripe++) {
874*53b381b3SDavid Woodhouse 			p = page_in_rbio(rbio, stripe, pagenr, 0);
875*53b381b3SDavid Woodhouse 			pointers[stripe] = kmap(p);
876*53b381b3SDavid Woodhouse 		}
877*53b381b3SDavid Woodhouse 
878*53b381b3SDavid Woodhouse 		/* then add the parity stripe */
879*53b381b3SDavid Woodhouse 		p = rbio_pstripe_page(rbio, pagenr);
880*53b381b3SDavid Woodhouse 		SetPageUptodate(p);
881*53b381b3SDavid Woodhouse 		pointers[stripe++] = kmap(p);
882*53b381b3SDavid Woodhouse 
883*53b381b3SDavid Woodhouse 		if (q_stripe != -1) {
884*53b381b3SDavid Woodhouse 
885*53b381b3SDavid Woodhouse 			/*
886*53b381b3SDavid Woodhouse 			 * raid6, add the qstripe and call the
887*53b381b3SDavid Woodhouse 			 * library function to fill in our p/q
888*53b381b3SDavid Woodhouse 			 */
889*53b381b3SDavid Woodhouse 			p = rbio_qstripe_page(rbio, pagenr);
890*53b381b3SDavid Woodhouse 			SetPageUptodate(p);
891*53b381b3SDavid Woodhouse 			pointers[stripe++] = kmap(p);
892*53b381b3SDavid Woodhouse 
893*53b381b3SDavid Woodhouse 			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
894*53b381b3SDavid Woodhouse 						pointers);
895*53b381b3SDavid Woodhouse 		} else {
896*53b381b3SDavid Woodhouse 			/* raid5 */
897*53b381b3SDavid Woodhouse 			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
898*53b381b3SDavid Woodhouse 			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
899*53b381b3SDavid Woodhouse 		}
900*53b381b3SDavid Woodhouse 
902*53b381b3SDavid Woodhouse 		for (stripe = 0; stripe < bbio->num_stripes; stripe++)
903*53b381b3SDavid Woodhouse 			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
904*53b381b3SDavid Woodhouse 	}
905*53b381b3SDavid Woodhouse 
906*53b381b3SDavid Woodhouse 	/*
907*53b381b3SDavid Woodhouse 	 * time to start writing.  Make bios for everything from the
908*53b381b3SDavid Woodhouse 	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
909*53b381b3SDavid Woodhouse 	 * everything else.
910*53b381b3SDavid Woodhouse 	 */
911*53b381b3SDavid Woodhouse 	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
912*53b381b3SDavid Woodhouse 		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
913*53b381b3SDavid Woodhouse 			struct page *page;
914*53b381b3SDavid Woodhouse 			if (stripe < rbio->nr_data) {
915*53b381b3SDavid Woodhouse 				page = page_in_rbio(rbio, stripe, pagenr, 1);
916*53b381b3SDavid Woodhouse 				if (!page)
917*53b381b3SDavid Woodhouse 					continue;
918*53b381b3SDavid Woodhouse 			} else {
919*53b381b3SDavid Woodhouse 				page = rbio_stripe_page(rbio, stripe, pagenr);
920*53b381b3SDavid Woodhouse 			}
921*53b381b3SDavid Woodhouse 
922*53b381b3SDavid Woodhouse 			ret = rbio_add_io_page(rbio, &bio_list,
923*53b381b3SDavid Woodhouse 				       page, stripe, pagenr, rbio->stripe_len);
924*53b381b3SDavid Woodhouse 			if (ret)
925*53b381b3SDavid Woodhouse 				goto cleanup;
926*53b381b3SDavid Woodhouse 		}
927*53b381b3SDavid Woodhouse 	}
928*53b381b3SDavid Woodhouse 
929*53b381b3SDavid Woodhouse 	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
930*53b381b3SDavid Woodhouse 	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
931*53b381b3SDavid Woodhouse 
932*53b381b3SDavid Woodhouse 	while (1) {
933*53b381b3SDavid Woodhouse 		bio = bio_list_pop(&bio_list);
934*53b381b3SDavid Woodhouse 		if (!bio)
935*53b381b3SDavid Woodhouse 			break;
936*53b381b3SDavid Woodhouse 
937*53b381b3SDavid Woodhouse 		bio->bi_private = rbio;
938*53b381b3SDavid Woodhouse 		bio->bi_end_io = raid_write_end_io;
939*53b381b3SDavid Woodhouse 		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
940*53b381b3SDavid Woodhouse 		submit_bio(WRITE, bio);
941*53b381b3SDavid Woodhouse 	}
942*53b381b3SDavid Woodhouse 	return;
943*53b381b3SDavid Woodhouse 
944*53b381b3SDavid Woodhouse cleanup:
945*53b381b3SDavid Woodhouse 	rbio_orig_end_io(rbio, -EIO, 0);
946*53b381b3SDavid Woodhouse }
947*53b381b3SDavid Woodhouse 
948*53b381b3SDavid Woodhouse /*
949*53b381b3SDavid Woodhouse  * helper to find the stripe number for a given bio.  Used to figure out which
950*53b381b3SDavid Woodhouse  * stripe has failed.  This expects the bio to correspond to a physical disk,
951*53b381b3SDavid Woodhouse  * so it looks up based on physical sector numbers.
952*53b381b3SDavid Woodhouse  */
953*53b381b3SDavid Woodhouse static int find_bio_stripe(struct btrfs_raid_bio *rbio,
954*53b381b3SDavid Woodhouse 			   struct bio *bio)
955*53b381b3SDavid Woodhouse {
956*53b381b3SDavid Woodhouse 	u64 physical = bio->bi_sector;
957*53b381b3SDavid Woodhouse 	u64 stripe_start;
958*53b381b3SDavid Woodhouse 	int i;
959*53b381b3SDavid Woodhouse 	struct btrfs_bio_stripe *stripe;
960*53b381b3SDavid Woodhouse 
961*53b381b3SDavid Woodhouse 	physical <<= 9;
962*53b381b3SDavid Woodhouse 
963*53b381b3SDavid Woodhouse 	for (i = 0; i < rbio->bbio->num_stripes; i++) {
964*53b381b3SDavid Woodhouse 		stripe = &rbio->bbio->stripes[i];
965*53b381b3SDavid Woodhouse 		stripe_start = stripe->physical;
966*53b381b3SDavid Woodhouse 		if (physical >= stripe_start &&
967*53b381b3SDavid Woodhouse 		    physical < stripe_start + rbio->stripe_len) {
968*53b381b3SDavid Woodhouse 			return i;
969*53b381b3SDavid Woodhouse 		}
970*53b381b3SDavid Woodhouse 	}
971*53b381b3SDavid Woodhouse 	return -1;
972*53b381b3SDavid Woodhouse }
973*53b381b3SDavid Woodhouse 
974*53b381b3SDavid Woodhouse /*
975*53b381b3SDavid Woodhouse  * helper to find the stripe number for a given
976*53b381b3SDavid Woodhouse  * bio (before mapping).  Used to figure out which stripe has
977*53b381b3SDavid Woodhouse  * failed.  This looks up based on logical block numbers.
978*53b381b3SDavid Woodhouse  */
979*53b381b3SDavid Woodhouse static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
980*53b381b3SDavid Woodhouse 				   struct bio *bio)
981*53b381b3SDavid Woodhouse {
982*53b381b3SDavid Woodhouse 	u64 logical = bio->bi_sector;
983*53b381b3SDavid Woodhouse 	u64 stripe_start;
984*53b381b3SDavid Woodhouse 	int i;
985*53b381b3SDavid Woodhouse 
986*53b381b3SDavid Woodhouse 	logical <<= 9;
987*53b381b3SDavid Woodhouse 
988*53b381b3SDavid Woodhouse 	for (i = 0; i < rbio->nr_data; i++) {
989*53b381b3SDavid Woodhouse 		stripe_start = rbio->raid_map[i];
990*53b381b3SDavid Woodhouse 		if (logical >= stripe_start &&
991*53b381b3SDavid Woodhouse 		    logical < stripe_start + rbio->stripe_len) {
992*53b381b3SDavid Woodhouse 			return i;
993*53b381b3SDavid Woodhouse 		}
994*53b381b3SDavid Woodhouse 	}
995*53b381b3SDavid Woodhouse 	return -1;
996*53b381b3SDavid Woodhouse }
997*53b381b3SDavid Woodhouse 
998*53b381b3SDavid Woodhouse /*
999*53b381b3SDavid Woodhouse  * returns -EIO if we had too many failures
1000*53b381b3SDavid Woodhouse  */
1001*53b381b3SDavid Woodhouse static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1002*53b381b3SDavid Woodhouse {
1003*53b381b3SDavid Woodhouse 	unsigned long flags;
1004*53b381b3SDavid Woodhouse 	int ret = 0;
1005*53b381b3SDavid Woodhouse 
1006*53b381b3SDavid Woodhouse 	spin_lock_irqsave(&rbio->bio_list_lock, flags);
1007*53b381b3SDavid Woodhouse 
1008*53b381b3SDavid Woodhouse 	/* we already know this stripe is bad, move on */
1009*53b381b3SDavid Woodhouse 	if (rbio->faila == failed || rbio->failb == failed)
1010*53b381b3SDavid Woodhouse 		goto out;
1011*53b381b3SDavid Woodhouse 
1012*53b381b3SDavid Woodhouse 	if (rbio->faila == -1) {
1013*53b381b3SDavid Woodhouse 		/* first failure on this rbio */
1014*53b381b3SDavid Woodhouse 		rbio->faila = failed;
1015*53b381b3SDavid Woodhouse 		atomic_inc(&rbio->bbio->error);
1016*53b381b3SDavid Woodhouse 	} else if (rbio->failb == -1) {
1017*53b381b3SDavid Woodhouse 		/* second failure on this rbio */
1018*53b381b3SDavid Woodhouse 		rbio->failb = failed;
1019*53b381b3SDavid Woodhouse 		atomic_inc(&rbio->bbio->error);
1020*53b381b3SDavid Woodhouse 	} else {
1021*53b381b3SDavid Woodhouse 		ret = -EIO;
1022*53b381b3SDavid Woodhouse 	}
1023*53b381b3SDavid Woodhouse out:
1024*53b381b3SDavid Woodhouse 	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1025*53b381b3SDavid Woodhouse 
1026*53b381b3SDavid Woodhouse 	return ret;
1027*53b381b3SDavid Woodhouse }
1028*53b381b3SDavid Woodhouse 
1029*53b381b3SDavid Woodhouse /*
1030*53b381b3SDavid Woodhouse  * helper to fail a stripe based on a physical disk
1031*53b381b3SDavid Woodhouse  * bio.
1032*53b381b3SDavid Woodhouse  */
1033*53b381b3SDavid Woodhouse static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1034*53b381b3SDavid Woodhouse 			   struct bio *bio)
1035*53b381b3SDavid Woodhouse {
1036*53b381b3SDavid Woodhouse 	int failed = find_bio_stripe(rbio, bio);
1037*53b381b3SDavid Woodhouse 
1038*53b381b3SDavid Woodhouse 	if (failed < 0)
1039*53b381b3SDavid Woodhouse 		return -EIO;
1040*53b381b3SDavid Woodhouse 
1041*53b381b3SDavid Woodhouse 	return fail_rbio_index(rbio, failed);
1042*53b381b3SDavid Woodhouse }
1043*53b381b3SDavid Woodhouse 
1044*53b381b3SDavid Woodhouse /*
1045*53b381b3SDavid Woodhouse  * this sets each page in the bio uptodate.  It should only be used on private
1046*53b381b3SDavid Woodhouse  * rbio pages, nothing that comes in from the higher layers
1047*53b381b3SDavid Woodhouse  */
1048*53b381b3SDavid Woodhouse static void set_bio_pages_uptodate(struct bio *bio)
1049*53b381b3SDavid Woodhouse {
1050*53b381b3SDavid Woodhouse 	int i;
1051*53b381b3SDavid Woodhouse 	struct page *p;
1052*53b381b3SDavid Woodhouse 
1053*53b381b3SDavid Woodhouse 	for (i = 0; i < bio->bi_vcnt; i++) {
1054*53b381b3SDavid Woodhouse 		p = bio->bi_io_vec[i].bv_page;
1055*53b381b3SDavid Woodhouse 		SetPageUptodate(p);
1056*53b381b3SDavid Woodhouse 	}
1057*53b381b3SDavid Woodhouse }
1058*53b381b3SDavid Woodhouse 
1059*53b381b3SDavid Woodhouse /*
1060*53b381b3SDavid Woodhouse  * end io for the read phase of the rmw cycle.  All the bios here are physical
1061*53b381b3SDavid Woodhouse  * stripe bios we've read from the disk so we can recalculate the parity of the
1062*53b381b3SDavid Woodhouse  * stripe.
1063*53b381b3SDavid Woodhouse  *
1064*53b381b3SDavid Woodhouse  * This will usually kick off finish_rmw once all the bios are read in, but it
1065*53b381b3SDavid Woodhouse  * may trigger parity reconstruction if we had any errors along the way
1066*53b381b3SDavid Woodhouse  */
1067*53b381b3SDavid Woodhouse static void raid_rmw_end_io(struct bio *bio, int err)
1068*53b381b3SDavid Woodhouse {
1069*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio = bio->bi_private;
1070*53b381b3SDavid Woodhouse 
1071*53b381b3SDavid Woodhouse 	if (err)
1072*53b381b3SDavid Woodhouse 		fail_bio_stripe(rbio, bio);
1073*53b381b3SDavid Woodhouse 	else
1074*53b381b3SDavid Woodhouse 		set_bio_pages_uptodate(bio);
1075*53b381b3SDavid Woodhouse 
1076*53b381b3SDavid Woodhouse 	bio_put(bio);
1077*53b381b3SDavid Woodhouse 
1078*53b381b3SDavid Woodhouse 	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1079*53b381b3SDavid Woodhouse 		return;
1080*53b381b3SDavid Woodhouse 
1081*53b381b3SDavid Woodhouse 	err = 0;
1082*53b381b3SDavid Woodhouse 	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1083*53b381b3SDavid Woodhouse 		goto cleanup;
1084*53b381b3SDavid Woodhouse 
1085*53b381b3SDavid Woodhouse 	/*
1086*53b381b3SDavid Woodhouse 	 * this will normally call finish_rmw to start our write
1087*53b381b3SDavid Woodhouse 	 * but if there are any failed stripes we'll reconstruct
1088*53b381b3SDavid Woodhouse 	 * from parity first
1089*53b381b3SDavid Woodhouse 	 */
1090*53b381b3SDavid Woodhouse 	validate_rbio_for_rmw(rbio);
1091*53b381b3SDavid Woodhouse 	return;
1092*53b381b3SDavid Woodhouse 
1093*53b381b3SDavid Woodhouse cleanup:
1094*53b381b3SDavid Woodhouse 
1095*53b381b3SDavid Woodhouse 	rbio_orig_end_io(rbio, -EIO, 0);
1096*53b381b3SDavid Woodhouse }
1097*53b381b3SDavid Woodhouse 
1098*53b381b3SDavid Woodhouse static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1099*53b381b3SDavid Woodhouse {
1100*53b381b3SDavid Woodhouse 	rbio->work.flags = 0;
1101*53b381b3SDavid Woodhouse 	rbio->work.func = rmw_work;
1102*53b381b3SDavid Woodhouse 
1103*53b381b3SDavid Woodhouse 	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1104*53b381b3SDavid Woodhouse 			   &rbio->work);
1105*53b381b3SDavid Woodhouse }
1106*53b381b3SDavid Woodhouse 
1107*53b381b3SDavid Woodhouse static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1108*53b381b3SDavid Woodhouse {
1109*53b381b3SDavid Woodhouse 	rbio->work.flags = 0;
1110*53b381b3SDavid Woodhouse 	rbio->work.func = read_rebuild_work;
1111*53b381b3SDavid Woodhouse 
1112*53b381b3SDavid Woodhouse 	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1113*53b381b3SDavid Woodhouse 			   &rbio->work);
1114*53b381b3SDavid Woodhouse }
1115*53b381b3SDavid Woodhouse 
1116*53b381b3SDavid Woodhouse /*
1117*53b381b3SDavid Woodhouse  * the stripe must be locked by the caller.  It will
1118*53b381b3SDavid Woodhouse  * unlock after all the writes are done
1119*53b381b3SDavid Woodhouse  */
1120*53b381b3SDavid Woodhouse static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1121*53b381b3SDavid Woodhouse {
1122*53b381b3SDavid Woodhouse 	int bios_to_read = 0;
1123*53b381b3SDavid Woodhouse 	struct btrfs_bio *bbio = rbio->bbio;
1124*53b381b3SDavid Woodhouse 	struct bio_list bio_list;
1125*53b381b3SDavid Woodhouse 	int ret;
1126*53b381b3SDavid Woodhouse 	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1127*53b381b3SDavid Woodhouse 	int pagenr;
1128*53b381b3SDavid Woodhouse 	int stripe;
1129*53b381b3SDavid Woodhouse 	struct bio *bio;
1130*53b381b3SDavid Woodhouse 
1131*53b381b3SDavid Woodhouse 	bio_list_init(&bio_list);
1132*53b381b3SDavid Woodhouse 
1133*53b381b3SDavid Woodhouse 	ret = alloc_rbio_pages(rbio);
1134*53b381b3SDavid Woodhouse 	if (ret)
1135*53b381b3SDavid Woodhouse 		goto cleanup;
1136*53b381b3SDavid Woodhouse 
1137*53b381b3SDavid Woodhouse 	index_rbio_pages(rbio);
1138*53b381b3SDavid Woodhouse 
1139*53b381b3SDavid Woodhouse 	atomic_set(&rbio->bbio->error, 0);
1140*53b381b3SDavid Woodhouse 	/*
1141*53b381b3SDavid Woodhouse 	 * build a list of bios to read all the missing parts of this
1142*53b381b3SDavid Woodhouse 	 * stripe
1143*53b381b3SDavid Woodhouse 	 */
1144*53b381b3SDavid Woodhouse 	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1145*53b381b3SDavid Woodhouse 		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1146*53b381b3SDavid Woodhouse 			struct page *page;
1147*53b381b3SDavid Woodhouse 			/*
1148*53b381b3SDavid Woodhouse 			 * we want to find all the pages missing from
1149*53b381b3SDavid Woodhouse 			 * the rbio and read them from the disk.  If
1150*53b381b3SDavid Woodhouse 			 * page_in_rbio finds a page in the bio list
1151*53b381b3SDavid Woodhouse 			 * we don't need to read it off the stripe.
1152*53b381b3SDavid Woodhouse 			 */
1153*53b381b3SDavid Woodhouse 			page = page_in_rbio(rbio, stripe, pagenr, 1);
1154*53b381b3SDavid Woodhouse 			if (page)
1155*53b381b3SDavid Woodhouse 				continue;
1156*53b381b3SDavid Woodhouse 
1157*53b381b3SDavid Woodhouse 			page = rbio_stripe_page(rbio, stripe, pagenr);
1158*53b381b3SDavid Woodhouse 			ret = rbio_add_io_page(rbio, &bio_list, page,
1159*53b381b3SDavid Woodhouse 				       stripe, pagenr, rbio->stripe_len);
1160*53b381b3SDavid Woodhouse 			if (ret)
1161*53b381b3SDavid Woodhouse 				goto cleanup;
1162*53b381b3SDavid Woodhouse 		}
1163*53b381b3SDavid Woodhouse 	}
1164*53b381b3SDavid Woodhouse 
1165*53b381b3SDavid Woodhouse 	bios_to_read = bio_list_size(&bio_list);
1166*53b381b3SDavid Woodhouse 	if (!bios_to_read) {
1167*53b381b3SDavid Woodhouse 		/*
1168*53b381b3SDavid Woodhouse 		 * this can happen if others have merged with
1169*53b381b3SDavid Woodhouse 		 * us; it means there is nothing left to read.
1170*53b381b3SDavid Woodhouse 		 * But if there are missing devices it may not be
1171*53b381b3SDavid Woodhouse 		 * safe to do the full stripe write yet.
1172*53b381b3SDavid Woodhouse 		 */
1173*53b381b3SDavid Woodhouse 		goto finish;
1174*53b381b3SDavid Woodhouse 	}
1175*53b381b3SDavid Woodhouse 
1176*53b381b3SDavid Woodhouse 	/*
1177*53b381b3SDavid Woodhouse 	 * the bbio may be freed once we submit the last bio.  Make sure
1178*53b381b3SDavid Woodhouse 	 * not to touch it after that
1179*53b381b3SDavid Woodhouse 	 */
1180*53b381b3SDavid Woodhouse 	atomic_set(&bbio->stripes_pending, bios_to_read);
1181*53b381b3SDavid Woodhouse 	while (1) {
1182*53b381b3SDavid Woodhouse 		bio = bio_list_pop(&bio_list);
1183*53b381b3SDavid Woodhouse 		if (!bio)
1184*53b381b3SDavid Woodhouse 			break;
1185*53b381b3SDavid Woodhouse 
1186*53b381b3SDavid Woodhouse 		bio->bi_private = rbio;
1187*53b381b3SDavid Woodhouse 		bio->bi_end_io = raid_rmw_end_io;
1188*53b381b3SDavid Woodhouse 
1189*53b381b3SDavid Woodhouse 		btrfs_bio_wq_end_io(rbio->fs_info, bio,
1190*53b381b3SDavid Woodhouse 				    BTRFS_WQ_ENDIO_RAID56);
1191*53b381b3SDavid Woodhouse 
1192*53b381b3SDavid Woodhouse 		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1193*53b381b3SDavid Woodhouse 		submit_bio(READ, bio);
1194*53b381b3SDavid Woodhouse 	}
1195*53b381b3SDavid Woodhouse 	/* the actual write will happen once the reads are done */
1196*53b381b3SDavid Woodhouse 	return 0;
1197*53b381b3SDavid Woodhouse 
1198*53b381b3SDavid Woodhouse cleanup:
1199*53b381b3SDavid Woodhouse 	rbio_orig_end_io(rbio, -EIO, 0);
1200*53b381b3SDavid Woodhouse 	return -EIO;
1201*53b381b3SDavid Woodhouse 
1202*53b381b3SDavid Woodhouse finish:
1203*53b381b3SDavid Woodhouse 	validate_rbio_for_rmw(rbio);
1204*53b381b3SDavid Woodhouse 	return 0;
1205*53b381b3SDavid Woodhouse }
1206*53b381b3SDavid Woodhouse 
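/*
 * Write-path overview (descriptive summary of the functions in this
 * file): raid56_parity_write() builds an rbio for the incoming bio.
 * A full stripe takes full_stripe_write(), which grabs the stripe lock
 * and calls finish_rmw() directly; a partial stripe takes
 * partial_stripe_write() -> async_rmw_stripe() -> raid56_rmw_stripe()
 * above, which reads the missing pages so that
 * raid_rmw_end_io()/validate_rbio_for_rmw() can recompute parity and
 * issue the writes.
 */
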
1207*53b381b3SDavid Woodhouse /*
1208*53b381b3SDavid Woodhouse  * if the upper layers pass in a full stripe, we thank them by only allocating
1209*53b381b3SDavid Woodhouse  * enough pages to hold the parity, and sending it all down quickly.
1210*53b381b3SDavid Woodhouse  */
1211*53b381b3SDavid Woodhouse static int full_stripe_write(struct btrfs_raid_bio *rbio)
1212*53b381b3SDavid Woodhouse {
1213*53b381b3SDavid Woodhouse 	int ret;
1214*53b381b3SDavid Woodhouse 
1215*53b381b3SDavid Woodhouse 	ret = alloc_rbio_parity_pages(rbio);
1216*53b381b3SDavid Woodhouse 	if (ret)
1217*53b381b3SDavid Woodhouse 		return ret;
1218*53b381b3SDavid Woodhouse 
1219*53b381b3SDavid Woodhouse 	ret = lock_stripe_add(rbio);
1220*53b381b3SDavid Woodhouse 	if (ret == 0)
1221*53b381b3SDavid Woodhouse 		finish_rmw(rbio);
1222*53b381b3SDavid Woodhouse 	return 0;
1223*53b381b3SDavid Woodhouse }
1224*53b381b3SDavid Woodhouse 
1225*53b381b3SDavid Woodhouse /*
1226*53b381b3SDavid Woodhouse  * partial stripe writes get handed over to async helpers.
1227*53b381b3SDavid Woodhouse  * We're really hoping to merge a few more writes into this
1228*53b381b3SDavid Woodhouse  * rbio before calculating new parity
1229*53b381b3SDavid Woodhouse  */
1230*53b381b3SDavid Woodhouse static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1231*53b381b3SDavid Woodhouse {
1232*53b381b3SDavid Woodhouse 	int ret;
1233*53b381b3SDavid Woodhouse 
1234*53b381b3SDavid Woodhouse 	ret = lock_stripe_add(rbio);
1235*53b381b3SDavid Woodhouse 	if (ret == 0)
1236*53b381b3SDavid Woodhouse 		async_rmw_stripe(rbio);
1237*53b381b3SDavid Woodhouse 	return 0;
1238*53b381b3SDavid Woodhouse }
1239*53b381b3SDavid Woodhouse 
1240*53b381b3SDavid Woodhouse /*
1241*53b381b3SDavid Woodhouse  * sometimes while we were reading from the drive to
1242*53b381b3SDavid Woodhouse  * recalculate parity, enough new bios come in to create
1243*53b381b3SDavid Woodhouse  * a full stripe.  So we do a check here to see if we can
1244*53b381b3SDavid Woodhouse  * go directly to finish_rmw
1245*53b381b3SDavid Woodhouse  */
1246*53b381b3SDavid Woodhouse static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1247*53b381b3SDavid Woodhouse {
1248*53b381b3SDavid Woodhouse 	/* head off into rmw land if we don't have a full stripe */
1249*53b381b3SDavid Woodhouse 	if (!rbio_is_full(rbio))
1250*53b381b3SDavid Woodhouse 		return partial_stripe_write(rbio);
1251*53b381b3SDavid Woodhouse 	return full_stripe_write(rbio);
1252*53b381b3SDavid Woodhouse }
1253*53b381b3SDavid Woodhouse 
1254*53b381b3SDavid Woodhouse /*
1255*53b381b3SDavid Woodhouse  * our main entry point for writes from the rest of the FS.
1256*53b381b3SDavid Woodhouse  */
1257*53b381b3SDavid Woodhouse int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1258*53b381b3SDavid Woodhouse 			struct btrfs_bio *bbio, u64 *raid_map,
1259*53b381b3SDavid Woodhouse 			u64 stripe_len)
1260*53b381b3SDavid Woodhouse {
1261*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio;
1262*53b381b3SDavid Woodhouse 
1263*53b381b3SDavid Woodhouse 	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1264*53b381b3SDavid Woodhouse 	if (IS_ERR(rbio)) {
1265*53b381b3SDavid Woodhouse 		kfree(raid_map);
1266*53b381b3SDavid Woodhouse 		kfree(bbio);
1267*53b381b3SDavid Woodhouse 		return PTR_ERR(rbio);
1268*53b381b3SDavid Woodhouse 	}
1269*53b381b3SDavid Woodhouse 	bio_list_add(&rbio->bio_list, bio);
1270*53b381b3SDavid Woodhouse 	rbio->bio_list_bytes = bio->bi_size;
1271*53b381b3SDavid Woodhouse 	return __raid56_parity_write(rbio);
1272*53b381b3SDavid Woodhouse }
1273*53b381b3SDavid Woodhouse 
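/*
 * Minimal caller sketch (hypothetical, for illustration; the real call
 * site lives in volumes.c and may differ in detail).  On allocation
 * failure raid56_parity_write() frees bbio and raid_map itself, as seen
 * above; otherwise the rbio takes ownership and they are released when
 * the rbio completes.
 *
 *	if (rw & WRITE)
 *		ret = raid56_parity_write(root, bio, bbio, raid_map,
 *					  stripe_len);
 *	else
 *		ret = raid56_parity_recover(root, bio, bbio, raid_map,
 *					    stripe_len, mirror_num);
 */
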
1274*53b381b3SDavid Woodhouse /*
1275*53b381b3SDavid Woodhouse  * all parity reconstruction happens here.  We've read in everything
1276*53b381b3SDavid Woodhouse  * we can find from the drives and this does the heavy lifting of
1277*53b381b3SDavid Woodhouse  * sorting the good from the bad.
1278*53b381b3SDavid Woodhouse  */
1279*53b381b3SDavid Woodhouse static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1280*53b381b3SDavid Woodhouse {
1281*53b381b3SDavid Woodhouse 	int pagenr, stripe;
1282*53b381b3SDavid Woodhouse 	void **pointers;
1283*53b381b3SDavid Woodhouse 	int faila = -1, failb = -1;
1284*53b381b3SDavid Woodhouse 	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1285*53b381b3SDavid Woodhouse 	struct page *page;
1286*53b381b3SDavid Woodhouse 	int err;
1287*53b381b3SDavid Woodhouse 	int i;
1288*53b381b3SDavid Woodhouse 
1289*53b381b3SDavid Woodhouse 	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1290*53b381b3SDavid Woodhouse 			   GFP_NOFS);
1291*53b381b3SDavid Woodhouse 	if (!pointers) {
1292*53b381b3SDavid Woodhouse 		err = -ENOMEM;
1293*53b381b3SDavid Woodhouse 		goto cleanup_io;
1294*53b381b3SDavid Woodhouse 	}
1295*53b381b3SDavid Woodhouse 
1296*53b381b3SDavid Woodhouse 	faila = rbio->faila;
1297*53b381b3SDavid Woodhouse 	failb = rbio->failb;
1298*53b381b3SDavid Woodhouse 
1299*53b381b3SDavid Woodhouse 	if (rbio->read_rebuild) {
1300*53b381b3SDavid Woodhouse 		spin_lock_irq(&rbio->bio_list_lock);
1301*53b381b3SDavid Woodhouse 		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1302*53b381b3SDavid Woodhouse 		spin_unlock_irq(&rbio->bio_list_lock);
1303*53b381b3SDavid Woodhouse 	}
1304*53b381b3SDavid Woodhouse 
1305*53b381b3SDavid Woodhouse 	index_rbio_pages(rbio);
1306*53b381b3SDavid Woodhouse 
1307*53b381b3SDavid Woodhouse 	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1308*53b381b3SDavid Woodhouse 		/* setup our array of pointers with pages
1309*53b381b3SDavid Woodhouse 		 * from each stripe
1310*53b381b3SDavid Woodhouse 		 */
1311*53b381b3SDavid Woodhouse 		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1312*53b381b3SDavid Woodhouse 			/*
1313*53b381b3SDavid Woodhouse 			 * if we're rebuilding a read, we have to use
1314*53b381b3SDavid Woodhouse 			 * pages from the bio list
1315*53b381b3SDavid Woodhouse 			 */
1316*53b381b3SDavid Woodhouse 			if (rbio->read_rebuild &&
1317*53b381b3SDavid Woodhouse 			    (stripe == faila || stripe == failb)) {
1318*53b381b3SDavid Woodhouse 				page = page_in_rbio(rbio, stripe, pagenr, 0);
1319*53b381b3SDavid Woodhouse 			} else {
1320*53b381b3SDavid Woodhouse 				page = rbio_stripe_page(rbio, stripe, pagenr);
1321*53b381b3SDavid Woodhouse 			}
1322*53b381b3SDavid Woodhouse 			pointers[stripe] = kmap(page);
1323*53b381b3SDavid Woodhouse 		}
1324*53b381b3SDavid Woodhouse 
1325*53b381b3SDavid Woodhouse 		/* all raid6 handling here */
1326*53b381b3SDavid Woodhouse 		if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1327*53b381b3SDavid Woodhouse 		    RAID6_Q_STRIPE) {
1328*53b381b3SDavid Woodhouse 
1329*53b381b3SDavid Woodhouse 			/*
1330*53b381b3SDavid Woodhouse 			 * single failure, rebuild from parity raid5
1331*53b381b3SDavid Woodhouse 			 * style
1332*53b381b3SDavid Woodhouse 			 */
1333*53b381b3SDavid Woodhouse 			if (failb < 0) {
1334*53b381b3SDavid Woodhouse 				if (faila == rbio->nr_data) {
1335*53b381b3SDavid Woodhouse 					/*
1336*53b381b3SDavid Woodhouse 					 * Just the P stripe has failed, without
1337*53b381b3SDavid Woodhouse 					 * a bad data or Q stripe.
1338*53b381b3SDavid Woodhouse 					 * TODO, we should redo the xor here.
1339*53b381b3SDavid Woodhouse 					 */
1340*53b381b3SDavid Woodhouse 					err = -EIO;
1341*53b381b3SDavid Woodhouse 					goto cleanup;
1342*53b381b3SDavid Woodhouse 				}
1343*53b381b3SDavid Woodhouse 				/*
1344*53b381b3SDavid Woodhouse 				 * a single failure in raid6 is rebuilt
1345*53b381b3SDavid Woodhouse 				 * in the pstripe code below
1346*53b381b3SDavid Woodhouse 				 */
1347*53b381b3SDavid Woodhouse 				goto pstripe;
1348*53b381b3SDavid Woodhouse 			}
1349*53b381b3SDavid Woodhouse 
1350*53b381b3SDavid Woodhouse 			/* make sure our ps and qs are in order */
1351*53b381b3SDavid Woodhouse 			if (faila > failb) {
1352*53b381b3SDavid Woodhouse 				int tmp = failb;
1353*53b381b3SDavid Woodhouse 				failb = faila;
1354*53b381b3SDavid Woodhouse 				faila = tmp;
1355*53b381b3SDavid Woodhouse 			}
1356*53b381b3SDavid Woodhouse 
1357*53b381b3SDavid Woodhouse 			/* if the q stripe has failed, do a pstripe reconstruction
1358*53b381b3SDavid Woodhouse 			 * from the xors.
1359*53b381b3SDavid Woodhouse 			 * If both the q stripe and the P stripe have failed, we're
1360*53b381b3SDavid Woodhouse 			 * here due to a crc mismatch and we can't give them the
1361*53b381b3SDavid Woodhouse 			 * data they want
1362*53b381b3SDavid Woodhouse 			 */
1363*53b381b3SDavid Woodhouse 			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1364*53b381b3SDavid Woodhouse 				if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1365*53b381b3SDavid Woodhouse 					err = -EIO;
1366*53b381b3SDavid Woodhouse 					goto cleanup;
1367*53b381b3SDavid Woodhouse 				}
1368*53b381b3SDavid Woodhouse 				/*
1369*53b381b3SDavid Woodhouse 				 * otherwise we have one bad data stripe and
1370*53b381b3SDavid Woodhouse 				 * a good P stripe.  raid5!
1371*53b381b3SDavid Woodhouse 				 */
1372*53b381b3SDavid Woodhouse 				goto pstripe;
1373*53b381b3SDavid Woodhouse 			}
1374*53b381b3SDavid Woodhouse 
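			/*
			 * Descriptive note: failb is not the Q stripe here.
			 * If failb is the P stripe, one data stripe plus P
			 * are gone, so raid6_datap_recov() rebuilds the data
			 * block from Q (and regenerates P).  Otherwise both
			 * failures are data stripes and raid6_2data_recov()
			 * rebuilds them from P and Q.
			 */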
1375*53b381b3SDavid Woodhouse 			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1376*53b381b3SDavid Woodhouse 				raid6_datap_recov(rbio->bbio->num_stripes,
1377*53b381b3SDavid Woodhouse 						  PAGE_SIZE, faila, pointers);
1378*53b381b3SDavid Woodhouse 			} else {
1379*53b381b3SDavid Woodhouse 				raid6_2data_recov(rbio->bbio->num_stripes,
1380*53b381b3SDavid Woodhouse 						  PAGE_SIZE, faila, failb,
1381*53b381b3SDavid Woodhouse 						  pointers);
1382*53b381b3SDavid Woodhouse 			}
1383*53b381b3SDavid Woodhouse 		} else {
1384*53b381b3SDavid Woodhouse 			void *p;
1385*53b381b3SDavid Woodhouse 
1386*53b381b3SDavid Woodhouse 			/* rebuild from P stripe here (raid5 or raid6) */
1387*53b381b3SDavid Woodhouse 			BUG_ON(failb != -1);
1388*53b381b3SDavid Woodhouse pstripe:
1389*53b381b3SDavid Woodhouse 			/* Copy parity block into failed block to start with */
1390*53b381b3SDavid Woodhouse 			memcpy(pointers[faila],
1391*53b381b3SDavid Woodhouse 			       pointers[rbio->nr_data],
1392*53b381b3SDavid Woodhouse 			       PAGE_CACHE_SIZE);
1393*53b381b3SDavid Woodhouse 
1394*53b381b3SDavid Woodhouse 			/* rearrange the pointer array */
1395*53b381b3SDavid Woodhouse 			p = pointers[faila];
1396*53b381b3SDavid Woodhouse 			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1397*53b381b3SDavid Woodhouse 				pointers[stripe] = pointers[stripe + 1];
1398*53b381b3SDavid Woodhouse 			pointers[rbio->nr_data - 1] = p;
1399*53b381b3SDavid Woodhouse 
1400*53b381b3SDavid Woodhouse 			/* xor in the rest */
1401*53b381b3SDavid Woodhouse 			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
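			/*
			 * Worked example (illustration only): with data
			 * stripes D0 D1 D2 and parity P = D0 ^ D1 ^ D2,
			 * losing D1 means D1 = P ^ D0 ^ D2.  The memcpy
			 * above seeded the failed page with P, and the
			 * pointer rotation moved that page to the end of
			 * the array, so run_xor() xors the surviving data
			 * pages into it in place.
			 */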
1402*53b381b3SDavid Woodhouse 		}
1403*53b381b3SDavid Woodhouse 		/* if we're doing this rebuild as part of an rmw, go through
1404*53b381b3SDavid Woodhouse 		 * and set all of our private rbio pages in the
1405*53b381b3SDavid Woodhouse 		 * failed stripes as uptodate.  This way finish_rmw will
1406*53b381b3SDavid Woodhouse 		 * know they can be trusted.  If this was a read reconstruction,
1407*53b381b3SDavid Woodhouse 		 * other endio functions will fiddle the uptodate bits
1408*53b381b3SDavid Woodhouse 		 */
1409*53b381b3SDavid Woodhouse 		if (!rbio->read_rebuild) {
1410*53b381b3SDavid Woodhouse 			for (i = 0;  i < nr_pages; i++) {
1411*53b381b3SDavid Woodhouse 				if (faila != -1) {
1412*53b381b3SDavid Woodhouse 					page = rbio_stripe_page(rbio, faila, i);
1413*53b381b3SDavid Woodhouse 					SetPageUptodate(page);
1414*53b381b3SDavid Woodhouse 				}
1415*53b381b3SDavid Woodhouse 				if (failb != -1) {
1416*53b381b3SDavid Woodhouse 					page = rbio_stripe_page(rbio, failb, i);
1417*53b381b3SDavid Woodhouse 					SetPageUptodate(page);
1418*53b381b3SDavid Woodhouse 				}
1419*53b381b3SDavid Woodhouse 			}
1420*53b381b3SDavid Woodhouse 		}
1421*53b381b3SDavid Woodhouse 		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1422*53b381b3SDavid Woodhouse 			/*
1423*53b381b3SDavid Woodhouse 			 * if we're rebuilding a read, we have to use
1424*53b381b3SDavid Woodhouse 			 * pages from the bio list
1425*53b381b3SDavid Woodhouse 			 */
1426*53b381b3SDavid Woodhouse 			if (rbio->read_rebuild &&
1427*53b381b3SDavid Woodhouse 			    (stripe == faila || stripe == failb)) {
1428*53b381b3SDavid Woodhouse 				page = page_in_rbio(rbio, stripe, pagenr, 0);
1429*53b381b3SDavid Woodhouse 			} else {
1430*53b381b3SDavid Woodhouse 				page = rbio_stripe_page(rbio, stripe, pagenr);
1431*53b381b3SDavid Woodhouse 			}
1432*53b381b3SDavid Woodhouse 			kunmap(page);
1433*53b381b3SDavid Woodhouse 		}
1434*53b381b3SDavid Woodhouse 	}
1435*53b381b3SDavid Woodhouse 
1436*53b381b3SDavid Woodhouse 	err = 0;
1437*53b381b3SDavid Woodhouse cleanup:
1438*53b381b3SDavid Woodhouse 	kfree(pointers);
1439*53b381b3SDavid Woodhouse 
1440*53b381b3SDavid Woodhouse cleanup_io:
1441*53b381b3SDavid Woodhouse 
1442*53b381b3SDavid Woodhouse 	if (rbio->read_rebuild) {
1443*53b381b3SDavid Woodhouse 		rbio_orig_end_io(rbio, err, err == 0);
1444*53b381b3SDavid Woodhouse 	} else if (err == 0) {
1445*53b381b3SDavid Woodhouse 		rbio->faila = -1;
1446*53b381b3SDavid Woodhouse 		rbio->failb = -1;
1447*53b381b3SDavid Woodhouse 		finish_rmw(rbio);
1448*53b381b3SDavid Woodhouse 	} else {
1449*53b381b3SDavid Woodhouse 		rbio_orig_end_io(rbio, err, 0);
1450*53b381b3SDavid Woodhouse 	}
1451*53b381b3SDavid Woodhouse }
1452*53b381b3SDavid Woodhouse 
1453*53b381b3SDavid Woodhouse /*
1454*53b381b3SDavid Woodhouse  * This is called only for stripes we've read from disk to
1455*53b381b3SDavid Woodhouse  * reconstruct the parity.
1456*53b381b3SDavid Woodhouse  */
1457*53b381b3SDavid Woodhouse static void raid_recover_end_io(struct bio *bio, int err)
1458*53b381b3SDavid Woodhouse {
1459*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio = bio->bi_private;
1460*53b381b3SDavid Woodhouse 
1461*53b381b3SDavid Woodhouse 	/*
1462*53b381b3SDavid Woodhouse 	 * we only read stripe pages off the disk, so set them
1463*53b381b3SDavid Woodhouse 	 * up to date if there were no errors
1464*53b381b3SDavid Woodhouse 	 */
1465*53b381b3SDavid Woodhouse 	if (err)
1466*53b381b3SDavid Woodhouse 		fail_bio_stripe(rbio, bio);
1467*53b381b3SDavid Woodhouse 	else
1468*53b381b3SDavid Woodhouse 		set_bio_pages_uptodate(bio);
1469*53b381b3SDavid Woodhouse 	bio_put(bio);
1470*53b381b3SDavid Woodhouse 
1471*53b381b3SDavid Woodhouse 	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1472*53b381b3SDavid Woodhouse 		return;
1473*53b381b3SDavid Woodhouse 
1474*53b381b3SDavid Woodhouse 	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1475*53b381b3SDavid Woodhouse 		rbio_orig_end_io(rbio, -EIO, 0);
1476*53b381b3SDavid Woodhouse 	else
1477*53b381b3SDavid Woodhouse 		__raid_recover_end_io(rbio);
1478*53b381b3SDavid Woodhouse }
1479*53b381b3SDavid Woodhouse 
1480*53b381b3SDavid Woodhouse /*
1481*53b381b3SDavid Woodhouse  * reads everything we need off the disk to reconstruct
1482*53b381b3SDavid Woodhouse  * the parity. endio handlers trigger final reconstruction
1483*53b381b3SDavid Woodhouse  * when the IO is done.
1484*53b381b3SDavid Woodhouse  *
1485*53b381b3SDavid Woodhouse  * This is used both for reads from the higher layers and for
1486*53b381b3SDavid Woodhouse  * parity construction required to finish a rmw cycle.
1487*53b381b3SDavid Woodhouse  */
1488*53b381b3SDavid Woodhouse static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1489*53b381b3SDavid Woodhouse {
1490*53b381b3SDavid Woodhouse 	int bios_to_read = 0;
1491*53b381b3SDavid Woodhouse 	struct btrfs_bio *bbio = rbio->bbio;
1492*53b381b3SDavid Woodhouse 	struct bio_list bio_list;
1493*53b381b3SDavid Woodhouse 	int ret;
1494*53b381b3SDavid Woodhouse 	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1495*53b381b3SDavid Woodhouse 	int pagenr;
1496*53b381b3SDavid Woodhouse 	int stripe;
1497*53b381b3SDavid Woodhouse 	struct bio *bio;
1498*53b381b3SDavid Woodhouse 
1499*53b381b3SDavid Woodhouse 	bio_list_init(&bio_list);
1500*53b381b3SDavid Woodhouse 
1501*53b381b3SDavid Woodhouse 	ret = alloc_rbio_pages(rbio);
1502*53b381b3SDavid Woodhouse 	if (ret)
1503*53b381b3SDavid Woodhouse 		goto cleanup;
1504*53b381b3SDavid Woodhouse 
1505*53b381b3SDavid Woodhouse 	atomic_set(&rbio->bbio->error, 0);
1506*53b381b3SDavid Woodhouse 
1507*53b381b3SDavid Woodhouse 	/*
1508*53b381b3SDavid Woodhouse 	 * read everything that hasn't failed.
1509*53b381b3SDavid Woodhouse 	 */
1510*53b381b3SDavid Woodhouse 	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1511*53b381b3SDavid Woodhouse 		if (rbio->faila == stripe ||
1512*53b381b3SDavid Woodhouse 		    rbio->failb == stripe)
1513*53b381b3SDavid Woodhouse 			continue;
1514*53b381b3SDavid Woodhouse 
1515*53b381b3SDavid Woodhouse 		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1516*53b381b3SDavid Woodhouse 			struct page *p;
1517*53b381b3SDavid Woodhouse 
1518*53b381b3SDavid Woodhouse 			/*
1519*53b381b3SDavid Woodhouse 			 * the rmw code may have already read this
1520*53b381b3SDavid Woodhouse 			 * page in
1521*53b381b3SDavid Woodhouse 			 */
1522*53b381b3SDavid Woodhouse 			p = rbio_stripe_page(rbio, stripe, pagenr);
1523*53b381b3SDavid Woodhouse 			if (PageUptodate(p))
1524*53b381b3SDavid Woodhouse 				continue;
1525*53b381b3SDavid Woodhouse 
1526*53b381b3SDavid Woodhouse 			ret = rbio_add_io_page(rbio, &bio_list,
1527*53b381b3SDavid Woodhouse 				       rbio_stripe_page(rbio, stripe, pagenr),
1528*53b381b3SDavid Woodhouse 				       stripe, pagenr, rbio->stripe_len);
1529*53b381b3SDavid Woodhouse 			if (ret < 0)
1530*53b381b3SDavid Woodhouse 				goto cleanup;
1531*53b381b3SDavid Woodhouse 		}
1532*53b381b3SDavid Woodhouse 	}
1533*53b381b3SDavid Woodhouse 
1534*53b381b3SDavid Woodhouse 	bios_to_read = bio_list_size(&bio_list);
1535*53b381b3SDavid Woodhouse 	if (!bios_to_read) {
1536*53b381b3SDavid Woodhouse 		/*
1537*53b381b3SDavid Woodhouse 		 * we might have no bios to read just because the pages
1538*53b381b3SDavid Woodhouse 		 * were already up to date, or because the devices
1539*53b381b3SDavid Woodhouse 		 * were gone.
1540*53b381b3SDavid Woodhouse 		 */
1541*53b381b3SDavid Woodhouse 		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1542*53b381b3SDavid Woodhouse 			__raid_recover_end_io(rbio);
1543*53b381b3SDavid Woodhouse 			goto out;
1544*53b381b3SDavid Woodhouse 		} else {
1545*53b381b3SDavid Woodhouse 			goto cleanup;
1546*53b381b3SDavid Woodhouse 		}
1547*53b381b3SDavid Woodhouse 	}
1548*53b381b3SDavid Woodhouse 
1549*53b381b3SDavid Woodhouse 	/*
1550*53b381b3SDavid Woodhouse 	 * the bbio may be freed once we submit the last bio.  Make sure
1551*53b381b3SDavid Woodhouse 	 * not to touch it after that
1552*53b381b3SDavid Woodhouse 	 */
1553*53b381b3SDavid Woodhouse 	atomic_set(&bbio->stripes_pending, bios_to_read);
1554*53b381b3SDavid Woodhouse 	while (1) {
1555*53b381b3SDavid Woodhouse 		bio = bio_list_pop(&bio_list);
1556*53b381b3SDavid Woodhouse 		if (!bio)
1557*53b381b3SDavid Woodhouse 			break;
1558*53b381b3SDavid Woodhouse 
1559*53b381b3SDavid Woodhouse 		bio->bi_private = rbio;
1560*53b381b3SDavid Woodhouse 		bio->bi_end_io = raid_recover_end_io;
1561*53b381b3SDavid Woodhouse 
1562*53b381b3SDavid Woodhouse 		btrfs_bio_wq_end_io(rbio->fs_info, bio,
1563*53b381b3SDavid Woodhouse 				    BTRFS_WQ_ENDIO_RAID56);
1564*53b381b3SDavid Woodhouse 
1565*53b381b3SDavid Woodhouse 		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1566*53b381b3SDavid Woodhouse 		submit_bio(READ, bio);
1567*53b381b3SDavid Woodhouse 	}
1568*53b381b3SDavid Woodhouse out:
1569*53b381b3SDavid Woodhouse 	return 0;
1570*53b381b3SDavid Woodhouse 
1571*53b381b3SDavid Woodhouse cleanup:
1572*53b381b3SDavid Woodhouse 	if (rbio->read_rebuild)
1573*53b381b3SDavid Woodhouse 		rbio_orig_end_io(rbio, -EIO, 0);
1574*53b381b3SDavid Woodhouse 	return -EIO;
1575*53b381b3SDavid Woodhouse }
1576*53b381b3SDavid Woodhouse 
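/*
 * Read-recovery overview (descriptive summary): raid56_parity_recover()
 * below marks the stripe the failed read came from (and, for mirror 3,
 * the P stripe) as failed, takes the stripe lock and calls
 * __raid56_parity_recover() above, which reads every surviving stripe.
 * raid_recover_end_io() then hands the completed reads to
 * __raid_recover_end_io() to rebuild the missing data and finish the
 * original bio.
 */
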
1577*53b381b3SDavid Woodhouse /*
1578*53b381b3SDavid Woodhouse  * the main entry point for reads from the higher layers.  This
1579*53b381b3SDavid Woodhouse  * is really only called when the normal read path had a failure,
1580*53b381b3SDavid Woodhouse  * so we assume the bio they send down corresponds to a failed part
1581*53b381b3SDavid Woodhouse  * of the drive.
1582*53b381b3SDavid Woodhouse  */
1583*53b381b3SDavid Woodhouse int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
1584*53b381b3SDavid Woodhouse 			  struct btrfs_bio *bbio, u64 *raid_map,
1585*53b381b3SDavid Woodhouse 			  u64 stripe_len, int mirror_num)
1586*53b381b3SDavid Woodhouse {
1587*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio;
1588*53b381b3SDavid Woodhouse 	int ret;
1589*53b381b3SDavid Woodhouse 
1590*53b381b3SDavid Woodhouse 	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1591*53b381b3SDavid Woodhouse 	if (IS_ERR(rbio)) {
1592*53b381b3SDavid Woodhouse 		return PTR_ERR(rbio);
1593*53b381b3SDavid Woodhouse 	}
1594*53b381b3SDavid Woodhouse 
1595*53b381b3SDavid Woodhouse 	rbio->read_rebuild = 1;
1596*53b381b3SDavid Woodhouse 	bio_list_add(&rbio->bio_list, bio);
1597*53b381b3SDavid Woodhouse 	rbio->bio_list_bytes = bio->bi_size;
1598*53b381b3SDavid Woodhouse 
1599*53b381b3SDavid Woodhouse 	rbio->faila = find_logical_bio_stripe(rbio, bio);
1600*53b381b3SDavid Woodhouse 	if (rbio->faila == -1) {
1601*53b381b3SDavid Woodhouse 		BUG();
1602*53b381b3SDavid Woodhouse 		kfree(rbio);
1603*53b381b3SDavid Woodhouse 		return -EIO;
1604*53b381b3SDavid Woodhouse 	}
1605*53b381b3SDavid Woodhouse 
1606*53b381b3SDavid Woodhouse 	/*
1607*53b381b3SDavid Woodhouse 	 * reconstruct from the q stripe if they are asking for mirror 3;
1608*53b381b3SDavid Woodhouse 	 * marking the P stripe (num_stripes - 2) as failed forces a rebuild from Q
1609*53b381b3SDavid Woodhouse 	 */
1610*53b381b3SDavid Woodhouse 	if (mirror_num == 3)
1611*53b381b3SDavid Woodhouse 		rbio->failb = bbio->num_stripes - 2;
1612*53b381b3SDavid Woodhouse 
1613*53b381b3SDavid Woodhouse 	ret = lock_stripe_add(rbio);
1614*53b381b3SDavid Woodhouse 
1615*53b381b3SDavid Woodhouse 	/*
1616*53b381b3SDavid Woodhouse 	 * __raid56_parity_recover will end the bio with
1617*53b381b3SDavid Woodhouse 	 * any errors it hits.  We don't want to return
1618*53b381b3SDavid Woodhouse 	 * its error value up the stack because our caller
1619*53b381b3SDavid Woodhouse 	 * will end up calling bio_endio with any nonzero
1620*53b381b3SDavid Woodhouse 	 * return
1621*53b381b3SDavid Woodhouse 	 */
1622*53b381b3SDavid Woodhouse 	if (ret == 0)
1623*53b381b3SDavid Woodhouse 		__raid56_parity_recover(rbio);
1624*53b381b3SDavid Woodhouse 	/*
1625*53b381b3SDavid Woodhouse 	 * otherwise our rbio has been added to the list of
1626*53b381b3SDavid Woodhouse 	 * rbios that will be handled after the
1627*53b381b3SDavid Woodhouse 	 * current lock owner is done
1628*53b381b3SDavid Woodhouse 	 */
1629*53b381b3SDavid Woodhouse 	return 0;
1630*53b381b3SDavid Woodhouse 
1631*53b381b3SDavid Woodhouse }
1632*53b381b3SDavid Woodhouse 
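/*
 * Work-queue callbacks installed by async_rmw_stripe() and
 * async_read_rebuild() above; they simply resume the rmw or
 * read-rebuild path from worker context.
 */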
1633*53b381b3SDavid Woodhouse static void rmw_work(struct btrfs_work *work)
1634*53b381b3SDavid Woodhouse {
1635*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio;
1636*53b381b3SDavid Woodhouse 
1637*53b381b3SDavid Woodhouse 	rbio = container_of(work, struct btrfs_raid_bio, work);
1638*53b381b3SDavid Woodhouse 	raid56_rmw_stripe(rbio);
1639*53b381b3SDavid Woodhouse }
1640*53b381b3SDavid Woodhouse 
1641*53b381b3SDavid Woodhouse static void read_rebuild_work(struct btrfs_work *work)
1642*53b381b3SDavid Woodhouse {
1643*53b381b3SDavid Woodhouse 	struct btrfs_raid_bio *rbio;
1644*53b381b3SDavid Woodhouse 
1645*53b381b3SDavid Woodhouse 	rbio = container_of(work, struct btrfs_raid_bio, work);
1646*53b381b3SDavid Woodhouse 	__raid56_parity_recover(rbio);
1647*53b381b3SDavid Woodhouse }
1648