xref: /linux/fs/bcachefs/ec.c (revision 0564b16782b39d6f59e06f427f32826db69e75a2)
1cd575ddfSKent Overstreet // SPDX-License-Identifier: GPL-2.0
2cd575ddfSKent Overstreet 
3cd575ddfSKent Overstreet /* erasure coding */
4cd575ddfSKent Overstreet 
5cd575ddfSKent Overstreet #include "bcachefs.h"
6cd575ddfSKent Overstreet #include "alloc_foreground.h"
7cd575ddfSKent Overstreet #include "bset.h"
8cd575ddfSKent Overstreet #include "btree_gc.h"
9cd575ddfSKent Overstreet #include "btree_update.h"
10cd575ddfSKent Overstreet #include "buckets.h"
11cd575ddfSKent Overstreet #include "disk_groups.h"
12cd575ddfSKent Overstreet #include "ec.h"
13cd575ddfSKent Overstreet #include "error.h"
14cd575ddfSKent Overstreet #include "io.h"
1561c8d7c8SKent Overstreet #include "journal_io.h"
16cd575ddfSKent Overstreet #include "keylist.h"
17cd575ddfSKent Overstreet #include "super-io.h"
18cd575ddfSKent Overstreet #include "util.h"
19cd575ddfSKent Overstreet 
20de5bb710SKent Overstreet #include <linux/sort.h>
21de5bb710SKent Overstreet 
22de5bb710SKent Overstreet #ifdef __KERNEL__
23de5bb710SKent Overstreet 
24cd575ddfSKent Overstreet #include <linux/raid/pq.h>
25cd575ddfSKent Overstreet #include <linux/raid/xor.h>
26de5bb710SKent Overstreet 
27de5bb710SKent Overstreet static void raid5_recov(unsigned disks, unsigned failed_idx,
28de5bb710SKent Overstreet 			size_t size, void **data)
29de5bb710SKent Overstreet {
30de5bb710SKent Overstreet 	unsigned i = 2, nr;
31de5bb710SKent Overstreet 
32de5bb710SKent Overstreet 	BUG_ON(failed_idx >= disks);
33de5bb710SKent Overstreet 
34de5bb710SKent Overstreet 	swap(data[0], data[failed_idx]);
35de5bb710SKent Overstreet 	memcpy(data[0], data[1], size);
36de5bb710SKent Overstreet 
37de5bb710SKent Overstreet 	while (i < disks) {
38de5bb710SKent Overstreet 		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
39de5bb710SKent Overstreet 		xor_blocks(nr, size, data[0], data + i);
40de5bb710SKent Overstreet 		i += nr;
41de5bb710SKent Overstreet 	}
42de5bb710SKent Overstreet 
43de5bb710SKent Overstreet 	swap(data[0], data[failed_idx]);
44de5bb710SKent Overstreet }
45de5bb710SKent Overstreet 
46de5bb710SKent Overstreet static void raid_gen(int nd, int np, size_t size, void **v)
47de5bb710SKent Overstreet {
48de5bb710SKent Overstreet 	if (np >= 1)
49de5bb710SKent Overstreet 		raid5_recov(nd + np, nd, size, v);
50de5bb710SKent Overstreet 	if (np >= 2)
51de5bb710SKent Overstreet 		raid6_call.gen_syndrome(nd + np, size, v);
52de5bb710SKent Overstreet 	BUG_ON(np > 2);
53de5bb710SKent Overstreet }
54de5bb710SKent Overstreet 
55de5bb710SKent Overstreet static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
56de5bb710SKent Overstreet {
57de5bb710SKent Overstreet 	switch (nr) {
58de5bb710SKent Overstreet 	case 0:
59de5bb710SKent Overstreet 		break;
60de5bb710SKent Overstreet 	case 1:
61de5bb710SKent Overstreet 		if (ir[0] < nd + 1)
62de5bb710SKent Overstreet 			raid5_recov(nd + 1, ir[0], size, v);
63de5bb710SKent Overstreet 		else
64de5bb710SKent Overstreet 			raid6_call.gen_syndrome(nd + np, size, v);
65de5bb710SKent Overstreet 		break;
66de5bb710SKent Overstreet 	case 2:
67de5bb710SKent Overstreet 		if (ir[1] < nd) {
68de5bb710SKent Overstreet 			/* data+data failure. */
69de5bb710SKent Overstreet 			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
70de5bb710SKent Overstreet 		} else if (ir[0] < nd) {
71de5bb710SKent Overstreet 			/* data + p/q failure */
72de5bb710SKent Overstreet 
73de5bb710SKent Overstreet 			if (ir[1] == nd) /* data + p failure */
74de5bb710SKent Overstreet 				raid6_datap_recov(nd + np, size, ir[0], v);
75de5bb710SKent Overstreet 			else { /* data + q failure */
76de5bb710SKent Overstreet 				raid5_recov(nd + 1, ir[0], size, v);
77de5bb710SKent Overstreet 				raid6_call.gen_syndrome(nd + np, size, v);
78de5bb710SKent Overstreet 			}
79de5bb710SKent Overstreet 		} else {
80de5bb710SKent Overstreet 			raid_gen(nd, np, size, v);
81de5bb710SKent Overstreet 		}
82de5bb710SKent Overstreet 		break;
83de5bb710SKent Overstreet 	default:
84de5bb710SKent Overstreet 		BUG();
85de5bb710SKent Overstreet 	}
86de5bb710SKent Overstreet }
87de5bb710SKent Overstreet 
88de5bb710SKent Overstreet #else
89de5bb710SKent Overstreet 
90de5bb710SKent Overstreet #include <raid/raid.h>
91de5bb710SKent Overstreet 
92de5bb710SKent Overstreet #endif
93cd575ddfSKent Overstreet 
94cd575ddfSKent Overstreet struct ec_bio {
95cd575ddfSKent Overstreet 	struct bch_dev		*ca;
96cd575ddfSKent Overstreet 	struct ec_stripe_buf	*buf;
97cd575ddfSKent Overstreet 	size_t			idx;
98cd575ddfSKent Overstreet 	struct bio		bio;
99cd575ddfSKent Overstreet };
100cd575ddfSKent Overstreet 
101cd575ddfSKent Overstreet /* Stripes btree keys: */
102cd575ddfSKent Overstreet 
10326609b61SKent Overstreet const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
104cd575ddfSKent Overstreet {
10526609b61SKent Overstreet 	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
10626609b61SKent Overstreet 
107cd575ddfSKent Overstreet 	if (k.k->p.inode)
108cd575ddfSKent Overstreet 		return "invalid stripe key";
109cd575ddfSKent Overstreet 
110cd575ddfSKent Overstreet 	if (bkey_val_bytes(k.k) < sizeof(*s))
111cd575ddfSKent Overstreet 		return "incorrect value size";
112cd575ddfSKent Overstreet 
11376640280SKent Overstreet 	if (bkey_val_bytes(k.k) < sizeof(*s) ||
11476640280SKent Overstreet 	    bkey_val_u64s(k.k) < stripe_val_u64s(s))
115cd575ddfSKent Overstreet 		return "incorrect value size";
116cd575ddfSKent Overstreet 
117cd575ddfSKent Overstreet 	return NULL;
118cd575ddfSKent Overstreet }
119cd575ddfSKent Overstreet 
12026609b61SKent Overstreet void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
121cd575ddfSKent Overstreet 			 struct bkey_s_c k)
122cd575ddfSKent Overstreet {
123cd575ddfSKent Overstreet 	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
124cd575ddfSKent Overstreet 	unsigned i;
125cd575ddfSKent Overstreet 
126cd575ddfSKent Overstreet 	pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
127cd575ddfSKent Overstreet 	       s->algorithm,
128cd575ddfSKent Overstreet 	       le16_to_cpu(s->sectors),
129cd575ddfSKent Overstreet 	       s->nr_blocks - s->nr_redundant,
130cd575ddfSKent Overstreet 	       s->nr_redundant,
131cd575ddfSKent Overstreet 	       s->csum_type,
132cd575ddfSKent Overstreet 	       1U << s->csum_granularity_bits);
133cd575ddfSKent Overstreet 
134cd575ddfSKent Overstreet 	for (i = 0; i < s->nr_blocks; i++)
13561c8d7c8SKent Overstreet 		pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev,
13661c8d7c8SKent Overstreet 		       (u64) s->ptrs[i].offset,
13761c8d7c8SKent Overstreet 		       stripe_blockcount_get(s, i));
138cd575ddfSKent Overstreet }
139cd575ddfSKent Overstreet 
140cd575ddfSKent Overstreet static int ptr_matches_stripe(struct bch_fs *c,
141cd575ddfSKent Overstreet 			      struct bch_stripe *v,
142cd575ddfSKent Overstreet 			      const struct bch_extent_ptr *ptr)
143cd575ddfSKent Overstreet {
144cd575ddfSKent Overstreet 	unsigned i;
145cd575ddfSKent Overstreet 
146cd575ddfSKent Overstreet 	for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
147cd575ddfSKent Overstreet 		const struct bch_extent_ptr *ptr2 = v->ptrs + i;
148cd575ddfSKent Overstreet 
149cd575ddfSKent Overstreet 		if (ptr->dev == ptr2->dev &&
150cd575ddfSKent Overstreet 		    ptr->gen == ptr2->gen &&
151cd575ddfSKent Overstreet 		    ptr->offset >= ptr2->offset &&
152cd575ddfSKent Overstreet 		    ptr->offset <  ptr2->offset + le16_to_cpu(v->sectors))
153cd575ddfSKent Overstreet 			return i;
154cd575ddfSKent Overstreet 	}
155cd575ddfSKent Overstreet 
156cd575ddfSKent Overstreet 	return -1;
157cd575ddfSKent Overstreet }
158cd575ddfSKent Overstreet 
159cd575ddfSKent Overstreet static int extent_matches_stripe(struct bch_fs *c,
160cd575ddfSKent Overstreet 				 struct bch_stripe *v,
161cd575ddfSKent Overstreet 				 struct bkey_s_c k)
162cd575ddfSKent Overstreet {
163cd575ddfSKent Overstreet 	struct bkey_s_c_extent e;
164cd575ddfSKent Overstreet 	const struct bch_extent_ptr *ptr;
165cd575ddfSKent Overstreet 	int idx;
166cd575ddfSKent Overstreet 
167cd575ddfSKent Overstreet 	if (!bkey_extent_is_data(k.k))
168cd575ddfSKent Overstreet 		return -1;
169cd575ddfSKent Overstreet 
170cd575ddfSKent Overstreet 	e = bkey_s_c_to_extent(k);
171cd575ddfSKent Overstreet 
172cd575ddfSKent Overstreet 	extent_for_each_ptr(e, ptr) {
173cd575ddfSKent Overstreet 		idx = ptr_matches_stripe(c, v, ptr);
174cd575ddfSKent Overstreet 		if (idx >= 0)
175cd575ddfSKent Overstreet 			return idx;
176cd575ddfSKent Overstreet 	}
177cd575ddfSKent Overstreet 
178cd575ddfSKent Overstreet 	return -1;
179cd575ddfSKent Overstreet }
180cd575ddfSKent Overstreet 
181cd575ddfSKent Overstreet static void ec_stripe_key_init(struct bch_fs *c,
182cd575ddfSKent Overstreet 			       struct bkey_i_stripe *s,
183cd575ddfSKent Overstreet 			       struct open_buckets *blocks,
184cd575ddfSKent Overstreet 			       struct open_buckets *parity,
185cd575ddfSKent Overstreet 			       unsigned stripe_size)
186cd575ddfSKent Overstreet {
187cd575ddfSKent Overstreet 	struct open_bucket *ob;
188cd575ddfSKent Overstreet 	unsigned i, u64s;
189cd575ddfSKent Overstreet 
190cd575ddfSKent Overstreet 	bkey_stripe_init(&s->k_i);
191cd575ddfSKent Overstreet 	s->v.sectors			= cpu_to_le16(stripe_size);
192cd575ddfSKent Overstreet 	s->v.algorithm			= 0;
193cd575ddfSKent Overstreet 	s->v.nr_blocks			= parity->nr + blocks->nr;
194cd575ddfSKent Overstreet 	s->v.nr_redundant		= parity->nr;
195cd575ddfSKent Overstreet 	s->v.csum_granularity_bits	= ilog2(c->sb.encoded_extent_max);
196cd575ddfSKent Overstreet 	s->v.csum_type			= BCH_CSUM_CRC32C;
197cd575ddfSKent Overstreet 	s->v.pad			= 0;
198cd575ddfSKent Overstreet 
199cd575ddfSKent Overstreet 	open_bucket_for_each(c, blocks, ob, i)
200cd575ddfSKent Overstreet 		s->v.ptrs[i]			= ob->ptr;
201cd575ddfSKent Overstreet 
202cd575ddfSKent Overstreet 	open_bucket_for_each(c, parity, ob, i)
203cd575ddfSKent Overstreet 		s->v.ptrs[blocks->nr + i]	= ob->ptr;
204cd575ddfSKent Overstreet 
205cd575ddfSKent Overstreet 	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
206cd575ddfSKent Overstreet 		BUG_ON(1 << s->v.csum_granularity_bits >=
207cd575ddfSKent Overstreet 		       le16_to_cpu(s->v.sectors) ||
208cd575ddfSKent Overstreet 		       s->v.csum_granularity_bits == U8_MAX);
209cd575ddfSKent Overstreet 		s->v.csum_granularity_bits++;
210cd575ddfSKent Overstreet 	}
211cd575ddfSKent Overstreet 
212cd575ddfSKent Overstreet 	set_bkey_val_u64s(&s->k, u64s);
213cd575ddfSKent Overstreet }
214cd575ddfSKent Overstreet 
215cd575ddfSKent Overstreet /* Checksumming: */
216cd575ddfSKent Overstreet 
217cd575ddfSKent Overstreet static void ec_generate_checksums(struct ec_stripe_buf *buf)
218cd575ddfSKent Overstreet {
219cd575ddfSKent Overstreet 	struct bch_stripe *v = &buf->key.v;
220cd575ddfSKent Overstreet 	unsigned csum_granularity = 1 << v->csum_granularity_bits;
221cd575ddfSKent Overstreet 	unsigned csums_per_device = stripe_csums_per_device(v);
222cd575ddfSKent Overstreet 	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
223cd575ddfSKent Overstreet 	unsigned i, j;
224cd575ddfSKent Overstreet 
225cd575ddfSKent Overstreet 	if (!csum_bytes)
226cd575ddfSKent Overstreet 		return;
227cd575ddfSKent Overstreet 
228cd575ddfSKent Overstreet 	BUG_ON(buf->offset);
229cd575ddfSKent Overstreet 	BUG_ON(buf->size != le16_to_cpu(v->sectors));
230cd575ddfSKent Overstreet 
231cd575ddfSKent Overstreet 	for (i = 0; i < v->nr_blocks; i++) {
232cd575ddfSKent Overstreet 		for (j = 0; j < csums_per_device; j++) {
233cd575ddfSKent Overstreet 			unsigned offset = j << v->csum_granularity_bits;
234cd575ddfSKent Overstreet 			unsigned len = min(csum_granularity, buf->size - offset);
235cd575ddfSKent Overstreet 
236cd575ddfSKent Overstreet 			struct bch_csum csum =
237cd575ddfSKent Overstreet 				bch2_checksum(NULL, v->csum_type,
238cd575ddfSKent Overstreet 					      null_nonce(),
239cd575ddfSKent Overstreet 					      buf->data[i] + (offset << 9),
240cd575ddfSKent Overstreet 					      len << 9);
241cd575ddfSKent Overstreet 
242cd575ddfSKent Overstreet 			memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
243cd575ddfSKent Overstreet 		}
244cd575ddfSKent Overstreet 	}
245cd575ddfSKent Overstreet }
246cd575ddfSKent Overstreet 
247cd575ddfSKent Overstreet static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
248cd575ddfSKent Overstreet {
249cd575ddfSKent Overstreet 	struct bch_stripe *v = &buf->key.v;
250cd575ddfSKent Overstreet 	unsigned csum_granularity = 1 << v->csum_granularity_bits;
251cd575ddfSKent Overstreet 	unsigned csum_bytes = bch_crc_bytes[v->csum_type];
252cd575ddfSKent Overstreet 	unsigned i;
253cd575ddfSKent Overstreet 
254cd575ddfSKent Overstreet 	if (!csum_bytes)
255cd575ddfSKent Overstreet 		return;
256cd575ddfSKent Overstreet 
257cd575ddfSKent Overstreet 	for (i = 0; i < v->nr_blocks; i++) {
258cd575ddfSKent Overstreet 		unsigned offset = buf->offset;
259cd575ddfSKent Overstreet 		unsigned end = buf->offset + buf->size;
260cd575ddfSKent Overstreet 
261cd575ddfSKent Overstreet 		if (!test_bit(i, buf->valid))
262cd575ddfSKent Overstreet 			continue;
263cd575ddfSKent Overstreet 
264cd575ddfSKent Overstreet 		while (offset < end) {
265cd575ddfSKent Overstreet 			unsigned j = offset >> v->csum_granularity_bits;
266cd575ddfSKent Overstreet 			unsigned len = min(csum_granularity, end - offset);
267cd575ddfSKent Overstreet 			struct bch_csum csum;
268cd575ddfSKent Overstreet 
269cd575ddfSKent Overstreet 			BUG_ON(offset & (csum_granularity - 1));
270cd575ddfSKent Overstreet 			BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
271cd575ddfSKent Overstreet 			       ((offset + len) & (csum_granularity - 1)));
272cd575ddfSKent Overstreet 
273cd575ddfSKent Overstreet 			csum = bch2_checksum(NULL, v->csum_type,
274cd575ddfSKent Overstreet 					     null_nonce(),
275cd575ddfSKent Overstreet 					     buf->data[i] + ((offset - buf->offset) << 9),
276cd575ddfSKent Overstreet 					     len << 9);
277cd575ddfSKent Overstreet 
278cd575ddfSKent Overstreet 			if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
279cd575ddfSKent Overstreet 				__bcache_io_error(c,
280cd575ddfSKent Overstreet 					"checksum error while doing reconstruct read (%u:%u)",
281cd575ddfSKent Overstreet 					i, j);
282cd575ddfSKent Overstreet 				clear_bit(i, buf->valid);
283cd575ddfSKent Overstreet 				break;
284cd575ddfSKent Overstreet 			}
285cd575ddfSKent Overstreet 
286cd575ddfSKent Overstreet 			offset += len;
287cd575ddfSKent Overstreet 		}
288cd575ddfSKent Overstreet 	}
289cd575ddfSKent Overstreet }
290cd575ddfSKent Overstreet 
291cd575ddfSKent Overstreet /* Erasure coding: */
292cd575ddfSKent Overstreet 
293cd575ddfSKent Overstreet static void ec_generate_ec(struct ec_stripe_buf *buf)
294cd575ddfSKent Overstreet {
295cd575ddfSKent Overstreet 	struct bch_stripe *v = &buf->key.v;
296cd575ddfSKent Overstreet 	unsigned nr_data = v->nr_blocks - v->nr_redundant;
297cd575ddfSKent Overstreet 	unsigned bytes = le16_to_cpu(v->sectors) << 9;
298cd575ddfSKent Overstreet 
299de5bb710SKent Overstreet 	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
300cd575ddfSKent Overstreet }
301cd575ddfSKent Overstreet 
302cd575ddfSKent Overstreet static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
303cd575ddfSKent Overstreet {
304cd575ddfSKent Overstreet 	return nr - bitmap_weight(buf->valid, nr);
305cd575ddfSKent Overstreet }
306cd575ddfSKent Overstreet 
307cd575ddfSKent Overstreet static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
308cd575ddfSKent Overstreet {
309cd575ddfSKent Overstreet 	return __ec_nr_failed(buf, buf->key.v.nr_blocks);
310cd575ddfSKent Overstreet }
311cd575ddfSKent Overstreet 
312cd575ddfSKent Overstreet static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
313cd575ddfSKent Overstreet {
314cd575ddfSKent Overstreet 	struct bch_stripe *v = &buf->key.v;
315cd575ddfSKent Overstreet 	unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
316cd575ddfSKent Overstreet 	unsigned nr_data = v->nr_blocks - v->nr_redundant;
317cd575ddfSKent Overstreet 	unsigned bytes = buf->size << 9;
318cd575ddfSKent Overstreet 
319cd575ddfSKent Overstreet 	if (ec_nr_failed(buf) > v->nr_redundant) {
320cd575ddfSKent Overstreet 		__bcache_io_error(c,
321cd575ddfSKent Overstreet 			"error doing reconstruct read: unable to read enough blocks");
322cd575ddfSKent Overstreet 		return -1;
323cd575ddfSKent Overstreet 	}
324cd575ddfSKent Overstreet 
325cd575ddfSKent Overstreet 	for (i = 0; i < nr_data; i++)
326cd575ddfSKent Overstreet 		if (!test_bit(i, buf->valid))
327cd575ddfSKent Overstreet 			failed[nr_failed++] = i;
328cd575ddfSKent Overstreet 
329de5bb710SKent Overstreet 	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
330cd575ddfSKent Overstreet 	return 0;
331cd575ddfSKent Overstreet }
332cd575ddfSKent Overstreet 
333cd575ddfSKent Overstreet /* IO: */
334cd575ddfSKent Overstreet 
335cd575ddfSKent Overstreet static void ec_block_endio(struct bio *bio)
336cd575ddfSKent Overstreet {
337cd575ddfSKent Overstreet 	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
338cd575ddfSKent Overstreet 	struct bch_dev *ca = ec_bio->ca;
339cd575ddfSKent Overstreet 	struct closure *cl = bio->bi_private;
340cd575ddfSKent Overstreet 
341cd575ddfSKent Overstreet 	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding"))
342cd575ddfSKent Overstreet 		clear_bit(ec_bio->idx, ec_bio->buf->valid);
343cd575ddfSKent Overstreet 
344cd575ddfSKent Overstreet 	bio_put(&ec_bio->bio);
345cd575ddfSKent Overstreet 	percpu_ref_put(&ca->io_ref);
346cd575ddfSKent Overstreet 	closure_put(cl);
347cd575ddfSKent Overstreet }
348cd575ddfSKent Overstreet 
349cd575ddfSKent Overstreet static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
350cd575ddfSKent Overstreet 			unsigned rw, unsigned idx, struct closure *cl)
351cd575ddfSKent Overstreet {
352cd575ddfSKent Overstreet 	struct bch_stripe *v = &buf->key.v;
353cd575ddfSKent Overstreet 	unsigned offset = 0, bytes = buf->size << 9;
354cd575ddfSKent Overstreet 	struct bch_extent_ptr *ptr = &v->ptrs[idx];
355cd575ddfSKent Overstreet 	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
356cd575ddfSKent Overstreet 
357cd575ddfSKent Overstreet 	if (!bch2_dev_get_ioref(ca, rw)) {
358cd575ddfSKent Overstreet 		clear_bit(idx, buf->valid);
359cd575ddfSKent Overstreet 		return;
360cd575ddfSKent Overstreet 	}
361cd575ddfSKent Overstreet 
362cd575ddfSKent Overstreet 	while (offset < bytes) {
363cd575ddfSKent Overstreet 		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
364cd575ddfSKent Overstreet 					   DIV_ROUND_UP(bytes, PAGE_SIZE));
365cd575ddfSKent Overstreet 		unsigned b = min_t(size_t, bytes - offset,
366cd575ddfSKent Overstreet 				   nr_iovecs << PAGE_SHIFT);
367cd575ddfSKent Overstreet 		struct ec_bio *ec_bio;
368cd575ddfSKent Overstreet 
369cd575ddfSKent Overstreet 		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
370cd575ddfSKent Overstreet 						       nr_iovecs,
371cd575ddfSKent Overstreet 						       rw,
372cd575ddfSKent Overstreet 						       GFP_KERNEL,
373cd575ddfSKent Overstreet 						       &c->ec_bioset),
374cd575ddfSKent Overstreet 				      struct ec_bio, bio);
375cd575ddfSKent Overstreet 
376cd575ddfSKent Overstreet 		ec_bio->ca			= ca;
377cd575ddfSKent Overstreet 		ec_bio->buf			= buf;
378cd575ddfSKent Overstreet 		ec_bio->idx			= idx;
379cd575ddfSKent Overstreet 
380cd575ddfSKent Overstreet 		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
381cd575ddfSKent Overstreet 		ec_bio->bio.bi_iter.bi_size	= b;
382cd575ddfSKent Overstreet 		ec_bio->bio.bi_end_io		= ec_block_endio;
383cd575ddfSKent Overstreet 		ec_bio->bio.bi_private		= cl;
384cd575ddfSKent Overstreet 
385cd575ddfSKent Overstreet 		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset);
386cd575ddfSKent Overstreet 
387cd575ddfSKent Overstreet 		closure_get(cl);
388cd575ddfSKent Overstreet 		percpu_ref_get(&ca->io_ref);
389cd575ddfSKent Overstreet 
390cd575ddfSKent Overstreet 		submit_bio(&ec_bio->bio);
391cd575ddfSKent Overstreet 
392cd575ddfSKent Overstreet 		offset += b;
393cd575ddfSKent Overstreet 	}
394cd575ddfSKent Overstreet 
395cd575ddfSKent Overstreet 	percpu_ref_put(&ca->io_ref);
396cd575ddfSKent Overstreet }
397cd575ddfSKent Overstreet 
398cd575ddfSKent Overstreet /* recovery read path: */
399cd575ddfSKent Overstreet int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
400cd575ddfSKent Overstreet {
401cd575ddfSKent Overstreet 	struct btree_iter iter;
402cd575ddfSKent Overstreet 	struct ec_stripe_buf *buf;
403cd575ddfSKent Overstreet 	struct closure cl;
404cd575ddfSKent Overstreet 	struct bkey_s_c k;
405cd575ddfSKent Overstreet 	struct bch_stripe *v;
406cd575ddfSKent Overstreet 	unsigned stripe_idx;
407cd575ddfSKent Overstreet 	unsigned offset, end;
408cd575ddfSKent Overstreet 	unsigned i, nr_data, csum_granularity;
409cd575ddfSKent Overstreet 	int ret = 0, idx;
410cd575ddfSKent Overstreet 
411cd575ddfSKent Overstreet 	closure_init_stack(&cl);
412cd575ddfSKent Overstreet 
413cd575ddfSKent Overstreet 	BUG_ON(!rbio->pick.idx ||
414cd575ddfSKent Overstreet 	       rbio->pick.idx - 1 >= rbio->pick.ec_nr);
415cd575ddfSKent Overstreet 
416cd575ddfSKent Overstreet 	stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx;
417cd575ddfSKent Overstreet 
418cd575ddfSKent Overstreet 	buf = kzalloc(sizeof(*buf), GFP_NOIO);
419cd575ddfSKent Overstreet 	if (!buf)
420cd575ddfSKent Overstreet 		return -ENOMEM;
421cd575ddfSKent Overstreet 
422cd575ddfSKent Overstreet 	bch2_btree_iter_init(&iter, c, BTREE_ID_EC,
423cd575ddfSKent Overstreet 			     POS(0, stripe_idx),
424cd575ddfSKent Overstreet 			     BTREE_ITER_SLOTS);
425cd575ddfSKent Overstreet 	k = bch2_btree_iter_peek_slot(&iter);
42626609b61SKent Overstreet 	if (btree_iter_err(k) || k.k->type != KEY_TYPE_stripe) {
427cd575ddfSKent Overstreet 		__bcache_io_error(c,
428cd575ddfSKent Overstreet 			"error doing reconstruct read: stripe not found");
429cd575ddfSKent Overstreet 		kfree(buf);
430cd575ddfSKent Overstreet 		return bch2_btree_iter_unlock(&iter) ?: -EIO;
431cd575ddfSKent Overstreet 	}
432cd575ddfSKent Overstreet 
433cd575ddfSKent Overstreet 	bkey_reassemble(&buf->key.k_i, k);
434cd575ddfSKent Overstreet 	bch2_btree_iter_unlock(&iter);
435cd575ddfSKent Overstreet 
436cd575ddfSKent Overstreet 	v = &buf->key.v;
437cd575ddfSKent Overstreet 
438cd575ddfSKent Overstreet 	nr_data = v->nr_blocks - v->nr_redundant;
439cd575ddfSKent Overstreet 
440cd575ddfSKent Overstreet 	idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
441cd575ddfSKent Overstreet 	BUG_ON(idx < 0);
442cd575ddfSKent Overstreet 
443cd575ddfSKent Overstreet 	csum_granularity = 1U << v->csum_granularity_bits;
444cd575ddfSKent Overstreet 
445cd575ddfSKent Overstreet 	offset	= rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
446cd575ddfSKent Overstreet 	end	= offset + bio_sectors(&rbio->bio);
447cd575ddfSKent Overstreet 
448cd575ddfSKent Overstreet 	BUG_ON(end > le16_to_cpu(v->sectors));
449cd575ddfSKent Overstreet 
450cd575ddfSKent Overstreet 	buf->offset	= round_down(offset, csum_granularity);
451cd575ddfSKent Overstreet 	buf->size	= min_t(unsigned, le16_to_cpu(v->sectors),
452cd575ddfSKent Overstreet 				round_up(end, csum_granularity)) - buf->offset;
453cd575ddfSKent Overstreet 
454cd575ddfSKent Overstreet 	for (i = 0; i < v->nr_blocks; i++) {
455cd575ddfSKent Overstreet 		buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
456cd575ddfSKent Overstreet 		if (!buf->data[i]) {
457cd575ddfSKent Overstreet 			ret = -ENOMEM;
458cd575ddfSKent Overstreet 			goto err;
459cd575ddfSKent Overstreet 		}
460cd575ddfSKent Overstreet 	}
461cd575ddfSKent Overstreet 
462cd575ddfSKent Overstreet 	memset(buf->valid, 0xFF, sizeof(buf->valid));
463cd575ddfSKent Overstreet 
464cd575ddfSKent Overstreet 	for (i = 0; i < v->nr_blocks; i++) {
465cd575ddfSKent Overstreet 		struct bch_extent_ptr *ptr = v->ptrs + i;
466cd575ddfSKent Overstreet 		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
467cd575ddfSKent Overstreet 
468cd575ddfSKent Overstreet 		if (ptr_stale(ca, ptr)) {
469cd575ddfSKent Overstreet 			__bcache_io_error(c,
470cd575ddfSKent Overstreet 					  "error doing reconstruct read: stale pointer");
471cd575ddfSKent Overstreet 			clear_bit(i, buf->valid);
472cd575ddfSKent Overstreet 			continue;
473cd575ddfSKent Overstreet 		}
474cd575ddfSKent Overstreet 
475cd575ddfSKent Overstreet 		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
476cd575ddfSKent Overstreet 	}
477cd575ddfSKent Overstreet 
478cd575ddfSKent Overstreet 	closure_sync(&cl);
479cd575ddfSKent Overstreet 
480cd575ddfSKent Overstreet 	if (ec_nr_failed(buf) > v->nr_redundant) {
481cd575ddfSKent Overstreet 		__bcache_io_error(c,
482cd575ddfSKent Overstreet 			"error doing reconstruct read: unable to read enough blocks");
483cd575ddfSKent Overstreet 		ret = -EIO;
484cd575ddfSKent Overstreet 		goto err;
485cd575ddfSKent Overstreet 	}
486cd575ddfSKent Overstreet 
487cd575ddfSKent Overstreet 	ec_validate_checksums(c, buf);
488cd575ddfSKent Overstreet 
489cd575ddfSKent Overstreet 	ret = ec_do_recov(c, buf);
490cd575ddfSKent Overstreet 	if (ret)
491cd575ddfSKent Overstreet 		goto err;
492cd575ddfSKent Overstreet 
493cd575ddfSKent Overstreet 	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
494cd575ddfSKent Overstreet 		      buf->data[idx] + ((offset - buf->offset) << 9));
495cd575ddfSKent Overstreet err:
496cd575ddfSKent Overstreet 	for (i = 0; i < v->nr_blocks; i++)
497cd575ddfSKent Overstreet 		kfree(buf->data[i]);
498cd575ddfSKent Overstreet 	kfree(buf);
499cd575ddfSKent Overstreet 	return ret;
500cd575ddfSKent Overstreet }
501cd575ddfSKent Overstreet 
502dfe9bfb3SKent Overstreet /* stripe bucket accounting: */
503cd575ddfSKent Overstreet 
504cd575ddfSKent Overstreet static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
505cd575ddfSKent Overstreet {
506cd575ddfSKent Overstreet 	ec_stripes_heap n, *h = &c->ec_stripes_heap;
507cd575ddfSKent Overstreet 
508cd575ddfSKent Overstreet 	if (idx >= h->size) {
509cd575ddfSKent Overstreet 		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
510cd575ddfSKent Overstreet 			return -ENOMEM;
511cd575ddfSKent Overstreet 
512cd575ddfSKent Overstreet 		spin_lock(&c->ec_stripes_heap_lock);
513cd575ddfSKent Overstreet 		if (n.size > h->size) {
514cd575ddfSKent Overstreet 			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
515cd575ddfSKent Overstreet 			n.used = h->used;
516cd575ddfSKent Overstreet 			swap(*h, n);
517cd575ddfSKent Overstreet 		}
518cd575ddfSKent Overstreet 		spin_unlock(&c->ec_stripes_heap_lock);
519cd575ddfSKent Overstreet 
520cd575ddfSKent Overstreet 		free_heap(&n);
521cd575ddfSKent Overstreet 	}
522cd575ddfSKent Overstreet 
523dfe9bfb3SKent Overstreet 	if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
524dfe9bfb3SKent Overstreet 		return -ENOMEM;
525dfe9bfb3SKent Overstreet 
526dfe9bfb3SKent Overstreet 	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
527dfe9bfb3SKent Overstreet 	    !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
528cd575ddfSKent Overstreet 		return -ENOMEM;
529cd575ddfSKent Overstreet 
530cd575ddfSKent Overstreet 	return 0;
531cd575ddfSKent Overstreet }
532cd575ddfSKent Overstreet 
533cd575ddfSKent Overstreet static int ec_stripe_mem_alloc(struct bch_fs *c,
534cd575ddfSKent Overstreet 			       struct btree_iter *iter)
535cd575ddfSKent Overstreet {
536cd575ddfSKent Overstreet 	size_t idx = iter->pos.offset;
537cd575ddfSKent Overstreet 
538cd575ddfSKent Overstreet 	if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
539cd575ddfSKent Overstreet 		return 0;
540cd575ddfSKent Overstreet 
541cd575ddfSKent Overstreet 	bch2_btree_iter_unlock(iter);
542cd575ddfSKent Overstreet 
543cd575ddfSKent Overstreet 	if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
544cd575ddfSKent Overstreet 		return -EINTR;
545cd575ddfSKent Overstreet 	return -ENOMEM;
546cd575ddfSKent Overstreet }
547cd575ddfSKent Overstreet 
548cd575ddfSKent Overstreet static ssize_t stripe_idx_to_delete(struct bch_fs *c)
549cd575ddfSKent Overstreet {
550cd575ddfSKent Overstreet 	ec_stripes_heap *h = &c->ec_stripes_heap;
551cd575ddfSKent Overstreet 
552cd575ddfSKent Overstreet 	return h->data[0].blocks_nonempty == 0 ? h->data[0].idx : -1;
553cd575ddfSKent Overstreet }
554cd575ddfSKent Overstreet 
555cd575ddfSKent Overstreet static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
556cd575ddfSKent Overstreet 				      struct ec_stripe_heap_entry l,
557cd575ddfSKent Overstreet 				      struct ec_stripe_heap_entry r)
558cd575ddfSKent Overstreet {
559cd575ddfSKent Overstreet 	return ((l.blocks_nonempty > r.blocks_nonempty) -
560cd575ddfSKent Overstreet 		(l.blocks_nonempty < r.blocks_nonempty));
561cd575ddfSKent Overstreet }
562cd575ddfSKent Overstreet 
563cd575ddfSKent Overstreet static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
564cd575ddfSKent Overstreet 						   size_t i)
565cd575ddfSKent Overstreet {
566cd575ddfSKent Overstreet 	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
567cd575ddfSKent Overstreet 
568dfe9bfb3SKent Overstreet 	genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
569cd575ddfSKent Overstreet }
570cd575ddfSKent Overstreet 
571cd575ddfSKent Overstreet static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
572cd575ddfSKent Overstreet {
573cd575ddfSKent Overstreet 	ec_stripes_heap *h = &c->ec_stripes_heap;
574dfe9bfb3SKent Overstreet 	struct stripe *m = genradix_ptr(&c->stripes[0], idx);
575cd575ddfSKent Overstreet 
576cd575ddfSKent Overstreet 	BUG_ON(!m->alive);
577cd575ddfSKent Overstreet 	BUG_ON(m->heap_idx >= h->used);
578cd575ddfSKent Overstreet 	BUG_ON(h->data[m->heap_idx].idx != idx);
579cd575ddfSKent Overstreet }
580cd575ddfSKent Overstreet 
581cd575ddfSKent Overstreet void bch2_stripes_heap_update(struct bch_fs *c,
582dfe9bfb3SKent Overstreet 			      struct stripe *m, size_t idx)
583cd575ddfSKent Overstreet {
584cd575ddfSKent Overstreet 	ec_stripes_heap *h = &c->ec_stripes_heap;
585cd575ddfSKent Overstreet 	size_t i;
586cd575ddfSKent Overstreet 
587cd575ddfSKent Overstreet 	heap_verify_backpointer(c, idx);
588cd575ddfSKent Overstreet 
58961c8d7c8SKent Overstreet 	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
590cd575ddfSKent Overstreet 
591cd575ddfSKent Overstreet 	i = m->heap_idx;
592cd575ddfSKent Overstreet 	heap_sift_up(h,	  i, ec_stripes_heap_cmp,
593cd575ddfSKent Overstreet 		     ec_stripes_heap_set_backpointer);
594cd575ddfSKent Overstreet 	heap_sift_down(h, i, ec_stripes_heap_cmp,
595cd575ddfSKent Overstreet 		       ec_stripes_heap_set_backpointer);
596cd575ddfSKent Overstreet 
597cd575ddfSKent Overstreet 	heap_verify_backpointer(c, idx);
598cd575ddfSKent Overstreet 
59961c8d7c8SKent Overstreet 	if (stripe_idx_to_delete(c) >= 0)
600cd575ddfSKent Overstreet 		schedule_work(&c->ec_stripe_delete_work);
601cd575ddfSKent Overstreet }
602cd575ddfSKent Overstreet 
603cd575ddfSKent Overstreet void bch2_stripes_heap_del(struct bch_fs *c,
604dfe9bfb3SKent Overstreet 			   struct stripe *m, size_t idx)
605cd575ddfSKent Overstreet {
606cd575ddfSKent Overstreet 	heap_verify_backpointer(c, idx);
607cd575ddfSKent Overstreet 
608cd575ddfSKent Overstreet 	m->alive = false;
609cd575ddfSKent Overstreet 	heap_del(&c->ec_stripes_heap, m->heap_idx,
610cd575ddfSKent Overstreet 		 ec_stripes_heap_cmp,
611cd575ddfSKent Overstreet 		 ec_stripes_heap_set_backpointer);
612cd575ddfSKent Overstreet }
613cd575ddfSKent Overstreet 
614cd575ddfSKent Overstreet void bch2_stripes_heap_insert(struct bch_fs *c,
615dfe9bfb3SKent Overstreet 			      struct stripe *m, size_t idx)
616cd575ddfSKent Overstreet {
617cd575ddfSKent Overstreet 	BUG_ON(heap_full(&c->ec_stripes_heap));
618cd575ddfSKent Overstreet 
619cd575ddfSKent Overstreet 	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
620cd575ddfSKent Overstreet 			.idx = idx,
62161c8d7c8SKent Overstreet 			.blocks_nonempty = m->blocks_nonempty,
622cd575ddfSKent Overstreet 		}),
623cd575ddfSKent Overstreet 		 ec_stripes_heap_cmp,
624cd575ddfSKent Overstreet 		 ec_stripes_heap_set_backpointer);
625cd575ddfSKent Overstreet 	m->alive = true;
626cd575ddfSKent Overstreet 
627cd575ddfSKent Overstreet 	heap_verify_backpointer(c, idx);
628cd575ddfSKent Overstreet }
629cd575ddfSKent Overstreet 
630dfe9bfb3SKent Overstreet /* stripe deletion */
631dfe9bfb3SKent Overstreet 
632*0564b167SKent Overstreet static int ec_stripe_delete(struct bch_fs *c, size_t idx)
633cd575ddfSKent Overstreet {
634*0564b167SKent Overstreet 	return bch2_btree_delete_range(c, BTREE_ID_EC,
635cd575ddfSKent Overstreet 				       POS(0, idx),
636*0564b167SKent Overstreet 				       POS(0, idx + 1),
637*0564b167SKent Overstreet 				       NULL);
638cd575ddfSKent Overstreet }
639cd575ddfSKent Overstreet 
640cd575ddfSKent Overstreet static void ec_stripe_delete_work(struct work_struct *work)
641cd575ddfSKent Overstreet {
642cd575ddfSKent Overstreet 	struct bch_fs *c =
643cd575ddfSKent Overstreet 		container_of(work, struct bch_fs, ec_stripe_delete_work);
644cd575ddfSKent Overstreet 	ssize_t idx;
645cd575ddfSKent Overstreet 
646cd575ddfSKent Overstreet 	down_read(&c->gc_lock);
647dfe9bfb3SKent Overstreet 	mutex_lock(&c->ec_stripe_create_lock);
648cd575ddfSKent Overstreet 
649cd575ddfSKent Overstreet 	while (1) {
650cd575ddfSKent Overstreet 		spin_lock(&c->ec_stripes_heap_lock);
651cd575ddfSKent Overstreet 		idx = stripe_idx_to_delete(c);
652cd575ddfSKent Overstreet 		spin_unlock(&c->ec_stripes_heap_lock);
653cd575ddfSKent Overstreet 
654cd575ddfSKent Overstreet 		if (idx < 0)
655cd575ddfSKent Overstreet 			break;
656cd575ddfSKent Overstreet 
657cd575ddfSKent Overstreet 		ec_stripe_delete(c, idx);
658cd575ddfSKent Overstreet 	}
659cd575ddfSKent Overstreet 
660dfe9bfb3SKent Overstreet 	mutex_unlock(&c->ec_stripe_create_lock);
661cd575ddfSKent Overstreet 	up_read(&c->gc_lock);
662cd575ddfSKent Overstreet }
663cd575ddfSKent Overstreet 
664dfe9bfb3SKent Overstreet /* stripe creation: */
665dfe9bfb3SKent Overstreet 
666cd575ddfSKent Overstreet static int ec_stripe_bkey_insert(struct bch_fs *c,
667cd575ddfSKent Overstreet 				 struct bkey_i_stripe *stripe)
668cd575ddfSKent Overstreet {
669*0564b167SKent Overstreet 	struct btree_trans trans;
670*0564b167SKent Overstreet 	struct btree_iter *iter;
671cd575ddfSKent Overstreet 	struct bkey_s_c k;
672cd575ddfSKent Overstreet 	int ret;
673cd575ddfSKent Overstreet 
674*0564b167SKent Overstreet 	bch2_trans_init(&trans, c);
675cd575ddfSKent Overstreet retry:
676*0564b167SKent Overstreet 	bch2_trans_begin(&trans);
677*0564b167SKent Overstreet 
678*0564b167SKent Overstreet 	/* XXX: start pos hint */
679*0564b167SKent Overstreet 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
680*0564b167SKent Overstreet 				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
681*0564b167SKent Overstreet 
682*0564b167SKent Overstreet 	for_each_btree_key_continue(iter, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
683*0564b167SKent Overstreet 		if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
684*0564b167SKent Overstreet 			break;
685cd575ddfSKent Overstreet 
686cd575ddfSKent Overstreet 		if (bkey_deleted(k.k))
687cd575ddfSKent Overstreet 			goto found_slot;
688cd575ddfSKent Overstreet 	}
689cd575ddfSKent Overstreet 
690*0564b167SKent Overstreet 	ret = -ENOSPC;
691*0564b167SKent Overstreet 	goto out;
692cd575ddfSKent Overstreet found_slot:
693*0564b167SKent Overstreet 	ret = ec_stripe_mem_alloc(c, iter);
694cd575ddfSKent Overstreet 
695cd575ddfSKent Overstreet 	if (ret == -EINTR)
696cd575ddfSKent Overstreet 		goto retry;
697cd575ddfSKent Overstreet 	if (ret)
698cd575ddfSKent Overstreet 		return ret;
699cd575ddfSKent Overstreet 
700*0564b167SKent Overstreet 	stripe->k.p = iter->pos;
701cd575ddfSKent Overstreet 
702*0564b167SKent Overstreet 	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i));
703*0564b167SKent Overstreet 
704*0564b167SKent Overstreet 	ret = bch2_trans_commit(&trans, NULL, NULL,
705cd575ddfSKent Overstreet 				BTREE_INSERT_NOFAIL|
706*0564b167SKent Overstreet 				BTREE_INSERT_USE_RESERVE);
707*0564b167SKent Overstreet out:
708*0564b167SKent Overstreet 	bch2_trans_exit(&trans);
709cd575ddfSKent Overstreet 
710cd575ddfSKent Overstreet 	return ret;
711cd575ddfSKent Overstreet }
712cd575ddfSKent Overstreet 
713cd575ddfSKent Overstreet static void extent_stripe_ptr_add(struct bkey_s_extent e,
714cd575ddfSKent Overstreet 				  struct ec_stripe_buf *s,
715cd575ddfSKent Overstreet 				  struct bch_extent_ptr *ptr,
716cd575ddfSKent Overstreet 				  unsigned block)
717cd575ddfSKent Overstreet {
718cd575ddfSKent Overstreet 	struct bch_extent_stripe_ptr *dst = (void *) ptr;
719cd575ddfSKent Overstreet 	union bch_extent_entry *end = extent_entry_last(e);
720cd575ddfSKent Overstreet 
721cd575ddfSKent Overstreet 	memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
722cd575ddfSKent Overstreet 	e.k->u64s += sizeof(*dst) / sizeof(u64);
723cd575ddfSKent Overstreet 
724cd575ddfSKent Overstreet 	*dst = (struct bch_extent_stripe_ptr) {
725cd575ddfSKent Overstreet 		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
726cd575ddfSKent Overstreet 		.block		= block,
727cd575ddfSKent Overstreet 		.idx		= s->key.k.p.offset,
728cd575ddfSKent Overstreet 	};
729cd575ddfSKent Overstreet }
730cd575ddfSKent Overstreet 
731cd575ddfSKent Overstreet static int ec_stripe_update_ptrs(struct bch_fs *c,
732cd575ddfSKent Overstreet 				 struct ec_stripe_buf *s,
733cd575ddfSKent Overstreet 				 struct bkey *pos)
734cd575ddfSKent Overstreet {
735*0564b167SKent Overstreet 	struct btree_trans trans;
736*0564b167SKent Overstreet 	struct btree_iter *iter;
737cd575ddfSKent Overstreet 	struct bkey_s_c k;
738cd575ddfSKent Overstreet 	struct bkey_s_extent e;
739cd575ddfSKent Overstreet 	struct bch_extent_ptr *ptr;
740cd575ddfSKent Overstreet 	BKEY_PADDED(k) tmp;
741cd575ddfSKent Overstreet 	int ret = 0, dev, idx;
742cd575ddfSKent Overstreet 
743*0564b167SKent Overstreet 	bch2_trans_init(&trans, c);
744*0564b167SKent Overstreet 
745*0564b167SKent Overstreet 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
746cd575ddfSKent Overstreet 				   bkey_start_pos(pos),
747cd575ddfSKent Overstreet 				   BTREE_ITER_INTENT);
748cd575ddfSKent Overstreet 
749*0564b167SKent Overstreet 	while ((k = bch2_btree_iter_peek(iter)).k &&
750*0564b167SKent Overstreet 	       !(ret = btree_iter_err(k)) &&
751cd575ddfSKent Overstreet 	       bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
752cd575ddfSKent Overstreet 		idx = extent_matches_stripe(c, &s->key.v, k);
753cd575ddfSKent Overstreet 		if (idx < 0) {
754*0564b167SKent Overstreet 			bch2_btree_iter_next(iter);
755cd575ddfSKent Overstreet 			continue;
756cd575ddfSKent Overstreet 		}
757cd575ddfSKent Overstreet 
758cd575ddfSKent Overstreet 		dev = s->key.v.ptrs[idx].dev;
759cd575ddfSKent Overstreet 
760cd575ddfSKent Overstreet 		bkey_reassemble(&tmp.k, k);
761cd575ddfSKent Overstreet 		e = bkey_i_to_s_extent(&tmp.k);
762cd575ddfSKent Overstreet 
763cd575ddfSKent Overstreet 		extent_for_each_ptr(e, ptr)
764cd575ddfSKent Overstreet 			if (ptr->dev != dev)
765cd575ddfSKent Overstreet 				ptr->cached = true;
766cd575ddfSKent Overstreet 
767cd575ddfSKent Overstreet 		ptr = (void *) bch2_extent_has_device(e.c, dev);
768cd575ddfSKent Overstreet 		BUG_ON(!ptr);
769cd575ddfSKent Overstreet 
770cd575ddfSKent Overstreet 		extent_stripe_ptr_add(e, s, ptr, idx);
771cd575ddfSKent Overstreet 
772*0564b167SKent Overstreet 		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k));
773*0564b167SKent Overstreet 
774*0564b167SKent Overstreet 		ret = bch2_trans_commit(&trans, NULL, NULL,
775cd575ddfSKent Overstreet 					BTREE_INSERT_ATOMIC|
776cd575ddfSKent Overstreet 					BTREE_INSERT_NOFAIL|
777*0564b167SKent Overstreet 					BTREE_INSERT_USE_RESERVE);
778cd575ddfSKent Overstreet 		if (ret == -EINTR)
779cd575ddfSKent Overstreet 			ret = 0;
780cd575ddfSKent Overstreet 		if (ret)
781cd575ddfSKent Overstreet 			break;
782cd575ddfSKent Overstreet 	}
783cd575ddfSKent Overstreet 
784*0564b167SKent Overstreet 	bch2_trans_exit(&trans);
785*0564b167SKent Overstreet 
786*0564b167SKent Overstreet 	return ret;
787cd575ddfSKent Overstreet }
788cd575ddfSKent Overstreet 
789cd575ddfSKent Overstreet /*
790cd575ddfSKent Overstreet  * data buckets of new stripe all written: create the stripe
791cd575ddfSKent Overstreet  */
792cd575ddfSKent Overstreet static void ec_stripe_create(struct ec_stripe_new *s)
793cd575ddfSKent Overstreet {
794cd575ddfSKent Overstreet 	struct bch_fs *c = s->c;
795cd575ddfSKent Overstreet 	struct open_bucket *ob;
796cd575ddfSKent Overstreet 	struct bkey_i *k;
797cd575ddfSKent Overstreet 	struct bch_stripe *v = &s->stripe.key.v;
798cd575ddfSKent Overstreet 	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
799cd575ddfSKent Overstreet 	struct closure cl;
800cd575ddfSKent Overstreet 	int ret;
801cd575ddfSKent Overstreet 
802cd575ddfSKent Overstreet 	BUG_ON(s->h->s == s);
803cd575ddfSKent Overstreet 
804cd575ddfSKent Overstreet 	closure_init_stack(&cl);
805cd575ddfSKent Overstreet 
806cd575ddfSKent Overstreet 	if (s->err) {
807cd575ddfSKent Overstreet 		bch_err(c, "error creating stripe: error writing data buckets");
808cd575ddfSKent Overstreet 		goto err;
809cd575ddfSKent Overstreet 	}
810cd575ddfSKent Overstreet 
811cd575ddfSKent Overstreet 	if (!percpu_ref_tryget(&c->writes))
812cd575ddfSKent Overstreet 		goto err;
813cd575ddfSKent Overstreet 
814cd575ddfSKent Overstreet 	BUG_ON(bitmap_weight(s->blocks_allocated,
815cd575ddfSKent Overstreet 			     s->blocks.nr) != s->blocks.nr);
816cd575ddfSKent Overstreet 
817cd575ddfSKent Overstreet 	ec_generate_ec(&s->stripe);
818cd575ddfSKent Overstreet 
819cd575ddfSKent Overstreet 	ec_generate_checksums(&s->stripe);
820cd575ddfSKent Overstreet 
821cd575ddfSKent Overstreet 	/* write p/q: */
822cd575ddfSKent Overstreet 	for (i = nr_data; i < v->nr_blocks; i++)
823cd575ddfSKent Overstreet 		ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);
824cd575ddfSKent Overstreet 
825cd575ddfSKent Overstreet 	closure_sync(&cl);
826cd575ddfSKent Overstreet 
827cd575ddfSKent Overstreet 	for (i = nr_data; i < v->nr_blocks; i++)
828cd575ddfSKent Overstreet 		if (!test_bit(i, s->stripe.valid)) {
829cd575ddfSKent Overstreet 			bch_err(c, "error creating stripe: error writing redundancy buckets");
830cd575ddfSKent Overstreet 			goto err_put_writes;
831cd575ddfSKent Overstreet 		}
832cd575ddfSKent Overstreet 
833dfe9bfb3SKent Overstreet 	mutex_lock(&c->ec_stripe_create_lock);
834dfe9bfb3SKent Overstreet 
835cd575ddfSKent Overstreet 	ret = ec_stripe_bkey_insert(c, &s->stripe.key);
836cd575ddfSKent Overstreet 	if (ret) {
837cd575ddfSKent Overstreet 		bch_err(c, "error creating stripe: error creating stripe key");
838dfe9bfb3SKent Overstreet 		goto err_unlock;
839cd575ddfSKent Overstreet 	}
840cd575ddfSKent Overstreet 
841cd575ddfSKent Overstreet 	for_each_keylist_key(&s->keys, k) {
842cd575ddfSKent Overstreet 		ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
843cd575ddfSKent Overstreet 		if (ret)
844cd575ddfSKent Overstreet 			break;
845cd575ddfSKent Overstreet 	}
846cd575ddfSKent Overstreet 
847dfe9bfb3SKent Overstreet err_unlock:
848dfe9bfb3SKent Overstreet 	mutex_unlock(&c->ec_stripe_create_lock);
849cd575ddfSKent Overstreet err_put_writes:
850cd575ddfSKent Overstreet 	percpu_ref_put(&c->writes);
851cd575ddfSKent Overstreet err:
852cd575ddfSKent Overstreet 	open_bucket_for_each(c, &s->blocks, ob, i) {
853cd575ddfSKent Overstreet 		ob->ec = NULL;
854cd575ddfSKent Overstreet 		__bch2_open_bucket_put(c, ob);
855cd575ddfSKent Overstreet 	}
856cd575ddfSKent Overstreet 
857cd575ddfSKent Overstreet 	bch2_open_buckets_put(c, &s->parity);
858cd575ddfSKent Overstreet 
859cd575ddfSKent Overstreet 	bch2_keylist_free(&s->keys, s->inline_keys);
860cd575ddfSKent Overstreet 
861cd575ddfSKent Overstreet 	mutex_lock(&s->h->lock);
862cd575ddfSKent Overstreet 	list_del(&s->list);
863cd575ddfSKent Overstreet 	mutex_unlock(&s->h->lock);
864cd575ddfSKent Overstreet 
865cd575ddfSKent Overstreet 	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
866cd575ddfSKent Overstreet 		kvpfree(s->stripe.data[i], s->stripe.size << 9);
867cd575ddfSKent Overstreet 	kfree(s);
868cd575ddfSKent Overstreet }
869cd575ddfSKent Overstreet 
870cd575ddfSKent Overstreet static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h)
871cd575ddfSKent Overstreet {
872cd575ddfSKent Overstreet 	struct ec_stripe_new *s = h->s;
873cd575ddfSKent Overstreet 
874cd575ddfSKent Overstreet 	list_add(&s->list, &h->stripes);
875cd575ddfSKent Overstreet 	h->s = NULL;
876cd575ddfSKent Overstreet 
877cd575ddfSKent Overstreet 	return s;
878cd575ddfSKent Overstreet }
879cd575ddfSKent Overstreet 
880cd575ddfSKent Overstreet static void ec_stripe_new_put(struct ec_stripe_new *s)
881cd575ddfSKent Overstreet {
882cd575ddfSKent Overstreet 	BUG_ON(atomic_read(&s->pin) <= 0);
883cd575ddfSKent Overstreet 	if (atomic_dec_and_test(&s->pin))
884cd575ddfSKent Overstreet 		ec_stripe_create(s);
885cd575ddfSKent Overstreet }
886cd575ddfSKent Overstreet 
887cd575ddfSKent Overstreet /* have a full bucket - hand it off to be erasure coded: */
888cd575ddfSKent Overstreet void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
889cd575ddfSKent Overstreet {
890cd575ddfSKent Overstreet 	struct ec_stripe_new *s = ob->ec;
891cd575ddfSKent Overstreet 
892cd575ddfSKent Overstreet 	if (ob->sectors_free)
893cd575ddfSKent Overstreet 		s->err = -1;
894cd575ddfSKent Overstreet 
895cd575ddfSKent Overstreet 	ec_stripe_new_put(s);
896cd575ddfSKent Overstreet }
897cd575ddfSKent Overstreet 
898cd575ddfSKent Overstreet void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
899cd575ddfSKent Overstreet {
900cd575ddfSKent Overstreet 	struct ec_stripe_new *s = ob->ec;
901cd575ddfSKent Overstreet 
902cd575ddfSKent Overstreet 	s->err = -EIO;
903cd575ddfSKent Overstreet }
904cd575ddfSKent Overstreet 
905cd575ddfSKent Overstreet void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
906cd575ddfSKent Overstreet {
907cd575ddfSKent Overstreet 	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
908cd575ddfSKent Overstreet 	struct bch_dev *ca;
909cd575ddfSKent Overstreet 	unsigned offset;
910cd575ddfSKent Overstreet 
911cd575ddfSKent Overstreet 	if (!ob)
912cd575ddfSKent Overstreet 		return NULL;
913cd575ddfSKent Overstreet 
914cd575ddfSKent Overstreet 	ca	= bch_dev_bkey_exists(c, ob->ptr.dev);
915cd575ddfSKent Overstreet 	offset	= ca->mi.bucket_size - ob->sectors_free;
916cd575ddfSKent Overstreet 
917cd575ddfSKent Overstreet 	return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
918cd575ddfSKent Overstreet }
919cd575ddfSKent Overstreet 
920cd575ddfSKent Overstreet void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
921cd575ddfSKent Overstreet 			     struct bpos pos, unsigned sectors)
922cd575ddfSKent Overstreet {
923cd575ddfSKent Overstreet 	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
924cd575ddfSKent Overstreet 	struct ec_stripe_new *ec;
925cd575ddfSKent Overstreet 
926cd575ddfSKent Overstreet 	if (!ob)
927cd575ddfSKent Overstreet 		return;
928cd575ddfSKent Overstreet 
929cd575ddfSKent Overstreet 	ec = ob->ec;
930cd575ddfSKent Overstreet 	mutex_lock(&ec->lock);
931cd575ddfSKent Overstreet 
932cd575ddfSKent Overstreet 	if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
933cd575ddfSKent Overstreet 				 ARRAY_SIZE(ec->inline_keys),
934cd575ddfSKent Overstreet 				 BKEY_U64s)) {
935cd575ddfSKent Overstreet 		BUG();
936cd575ddfSKent Overstreet 	}
937cd575ddfSKent Overstreet 
938cd575ddfSKent Overstreet 	bkey_init(&ec->keys.top->k);
939cd575ddfSKent Overstreet 	ec->keys.top->k.p	= pos;
940cd575ddfSKent Overstreet 	bch2_key_resize(&ec->keys.top->k, sectors);
941cd575ddfSKent Overstreet 	bch2_keylist_push(&ec->keys);
942cd575ddfSKent Overstreet 
943cd575ddfSKent Overstreet 	mutex_unlock(&ec->lock);
944cd575ddfSKent Overstreet }
945cd575ddfSKent Overstreet 
/*
 * sort() comparison callback for arrays of unsigned: returns -1, 0 or 1
 * when *_l is respectively less than, equal to or greater than *_r.
 */
static int unsigned_cmp(const void *_l, const void *_r)
{
	const unsigned *lhs = _l;
	const unsigned *rhs = _r;

	if (*lhs < *rhs)
		return -1;

	return *lhs > *rhs;
}
953cd575ddfSKent Overstreet 
954cd575ddfSKent Overstreet /* pick most common bucket size: */
955cd575ddfSKent Overstreet static unsigned pick_blocksize(struct bch_fs *c,
956cd575ddfSKent Overstreet 			       struct bch_devs_mask *devs)
957cd575ddfSKent Overstreet {
958cd575ddfSKent Overstreet 	struct bch_dev *ca;
959cd575ddfSKent Overstreet 	unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
960cd575ddfSKent Overstreet 	struct {
961cd575ddfSKent Overstreet 		unsigned nr, size;
962cd575ddfSKent Overstreet 	} cur = { 0, 0 }, best = { 0, 0 };
963cd575ddfSKent Overstreet 
964cd575ddfSKent Overstreet 	for_each_member_device_rcu(ca, c, i, devs)
965cd575ddfSKent Overstreet 		sizes[nr++] = ca->mi.bucket_size;
966cd575ddfSKent Overstreet 
967cd575ddfSKent Overstreet 	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
968cd575ddfSKent Overstreet 
969cd575ddfSKent Overstreet 	for (i = 0; i < nr; i++) {
970cd575ddfSKent Overstreet 		if (sizes[i] != cur.size) {
971cd575ddfSKent Overstreet 			if (cur.nr > best.nr)
972cd575ddfSKent Overstreet 				best = cur;
973cd575ddfSKent Overstreet 
974cd575ddfSKent Overstreet 			cur.nr = 0;
975cd575ddfSKent Overstreet 			cur.size = sizes[i];
976cd575ddfSKent Overstreet 		}
977cd575ddfSKent Overstreet 
978cd575ddfSKent Overstreet 		cur.nr++;
979cd575ddfSKent Overstreet 	}
980cd575ddfSKent Overstreet 
981cd575ddfSKent Overstreet 	if (cur.nr > best.nr)
982cd575ddfSKent Overstreet 		best = cur;
983cd575ddfSKent Overstreet 
984cd575ddfSKent Overstreet 	return best.size;
985cd575ddfSKent Overstreet }
986cd575ddfSKent Overstreet 
987cd575ddfSKent Overstreet int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
988cd575ddfSKent Overstreet {
989cd575ddfSKent Overstreet 	struct ec_stripe_new *s;
990cd575ddfSKent Overstreet 	unsigned i;
991cd575ddfSKent Overstreet 
992cd575ddfSKent Overstreet 	BUG_ON(h->parity.nr != h->redundancy);
993cd575ddfSKent Overstreet 	BUG_ON(!h->blocks.nr);
994cd575ddfSKent Overstreet 	BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
995cd575ddfSKent Overstreet 	lockdep_assert_held(&h->lock);
996cd575ddfSKent Overstreet 
997cd575ddfSKent Overstreet 	s = kzalloc(sizeof(*s), GFP_KERNEL);
998cd575ddfSKent Overstreet 	if (!s)
999cd575ddfSKent Overstreet 		return -ENOMEM;
1000cd575ddfSKent Overstreet 
1001cd575ddfSKent Overstreet 	mutex_init(&s->lock);
1002cd575ddfSKent Overstreet 	atomic_set(&s->pin, 1);
1003cd575ddfSKent Overstreet 	s->c		= c;
1004cd575ddfSKent Overstreet 	s->h		= h;
1005cd575ddfSKent Overstreet 	s->blocks	= h->blocks;
1006cd575ddfSKent Overstreet 	s->parity	= h->parity;
1007cd575ddfSKent Overstreet 
1008cd575ddfSKent Overstreet 	memset(&h->blocks, 0, sizeof(h->blocks));
1009cd575ddfSKent Overstreet 	memset(&h->parity, 0, sizeof(h->parity));
1010cd575ddfSKent Overstreet 
1011cd575ddfSKent Overstreet 	bch2_keylist_init(&s->keys, s->inline_keys);
1012cd575ddfSKent Overstreet 
1013cd575ddfSKent Overstreet 	s->stripe.offset	= 0;
1014cd575ddfSKent Overstreet 	s->stripe.size		= h->blocksize;
1015cd575ddfSKent Overstreet 	memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));
1016cd575ddfSKent Overstreet 
1017cd575ddfSKent Overstreet 	ec_stripe_key_init(c, &s->stripe.key,
1018cd575ddfSKent Overstreet 			   &s->blocks, &s->parity,
1019cd575ddfSKent Overstreet 			   h->blocksize);
1020cd575ddfSKent Overstreet 
1021cd575ddfSKent Overstreet 	for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
1022cd575ddfSKent Overstreet 		s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
1023cd575ddfSKent Overstreet 		if (!s->stripe.data[i])
1024cd575ddfSKent Overstreet 			goto err;
1025cd575ddfSKent Overstreet 	}
1026cd575ddfSKent Overstreet 
1027cd575ddfSKent Overstreet 	h->s = s;
1028cd575ddfSKent Overstreet 
1029cd575ddfSKent Overstreet 	return 0;
1030cd575ddfSKent Overstreet err:
1031cd575ddfSKent Overstreet 	for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
1032cd575ddfSKent Overstreet 		kvpfree(s->stripe.data[i], s->stripe.size << 9);
1033cd575ddfSKent Overstreet 	kfree(s);
1034cd575ddfSKent Overstreet 	return -ENOMEM;
1035cd575ddfSKent Overstreet }
1036cd575ddfSKent Overstreet 
1037cd575ddfSKent Overstreet static struct ec_stripe_head *
1038cd575ddfSKent Overstreet ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
1039cd575ddfSKent Overstreet 			 unsigned algo, unsigned redundancy)
1040cd575ddfSKent Overstreet {
1041cd575ddfSKent Overstreet 	struct ec_stripe_head *h;
1042cd575ddfSKent Overstreet 	struct bch_dev *ca;
1043cd575ddfSKent Overstreet 	unsigned i;
1044cd575ddfSKent Overstreet 
1045cd575ddfSKent Overstreet 	h = kzalloc(sizeof(*h), GFP_KERNEL);
1046cd575ddfSKent Overstreet 	if (!h)
1047cd575ddfSKent Overstreet 		return NULL;
1048cd575ddfSKent Overstreet 
1049cd575ddfSKent Overstreet 	mutex_init(&h->lock);
1050cd575ddfSKent Overstreet 	mutex_lock(&h->lock);
1051cd575ddfSKent Overstreet 	INIT_LIST_HEAD(&h->stripes);
1052cd575ddfSKent Overstreet 
1053cd575ddfSKent Overstreet 	h->target	= target;
1054cd575ddfSKent Overstreet 	h->algo		= algo;
1055cd575ddfSKent Overstreet 	h->redundancy	= redundancy;
1056cd575ddfSKent Overstreet 
1057cd575ddfSKent Overstreet 	rcu_read_lock();
1058cd575ddfSKent Overstreet 	h->devs = target_rw_devs(c, BCH_DATA_USER, target);
1059cd575ddfSKent Overstreet 
1060cd575ddfSKent Overstreet 	for_each_member_device_rcu(ca, c, i, &h->devs)
1061cd575ddfSKent Overstreet 		if (!ca->mi.durability)
1062cd575ddfSKent Overstreet 			__clear_bit(i, h->devs.d);
1063cd575ddfSKent Overstreet 
1064cd575ddfSKent Overstreet 	h->blocksize = pick_blocksize(c, &h->devs);
1065cd575ddfSKent Overstreet 
1066cd575ddfSKent Overstreet 	for_each_member_device_rcu(ca, c, i, &h->devs)
1067cd575ddfSKent Overstreet 		if (ca->mi.bucket_size == h->blocksize)
1068cd575ddfSKent Overstreet 			h->nr_active_devs++;
1069cd575ddfSKent Overstreet 
1070cd575ddfSKent Overstreet 	rcu_read_unlock();
1071cd575ddfSKent Overstreet 	list_add(&h->list, &c->ec_new_stripe_list);
1072cd575ddfSKent Overstreet 	return h;
1073cd575ddfSKent Overstreet }
1074cd575ddfSKent Overstreet 
1075cd575ddfSKent Overstreet void bch2_ec_stripe_head_put(struct ec_stripe_head *h)
1076cd575ddfSKent Overstreet {
1077cd575ddfSKent Overstreet 	struct ec_stripe_new *s = NULL;
1078cd575ddfSKent Overstreet 
1079cd575ddfSKent Overstreet 	if (h->s &&
1080cd575ddfSKent Overstreet 	    bitmap_weight(h->s->blocks_allocated,
1081cd575ddfSKent Overstreet 			  h->s->blocks.nr) == h->s->blocks.nr)
1082cd575ddfSKent Overstreet 		s = ec_stripe_set_pending(h);
1083cd575ddfSKent Overstreet 
1084cd575ddfSKent Overstreet 	mutex_unlock(&h->lock);
1085cd575ddfSKent Overstreet 
1086cd575ddfSKent Overstreet 	if (s)
1087cd575ddfSKent Overstreet 		ec_stripe_new_put(s);
1088cd575ddfSKent Overstreet }
1089cd575ddfSKent Overstreet 
1090cd575ddfSKent Overstreet struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
1091cd575ddfSKent Overstreet 					       unsigned target,
1092cd575ddfSKent Overstreet 					       unsigned algo,
1093cd575ddfSKent Overstreet 					       unsigned redundancy)
1094cd575ddfSKent Overstreet {
1095cd575ddfSKent Overstreet 	struct ec_stripe_head *h;
1096cd575ddfSKent Overstreet 
1097cd575ddfSKent Overstreet 	if (!redundancy)
1098cd575ddfSKent Overstreet 		return NULL;
1099cd575ddfSKent Overstreet 
1100cd575ddfSKent Overstreet 	mutex_lock(&c->ec_new_stripe_lock);
1101cd575ddfSKent Overstreet 	list_for_each_entry(h, &c->ec_new_stripe_list, list)
1102cd575ddfSKent Overstreet 		if (h->target		== target &&
1103cd575ddfSKent Overstreet 		    h->algo		== algo &&
1104cd575ddfSKent Overstreet 		    h->redundancy	== redundancy) {
1105cd575ddfSKent Overstreet 			mutex_lock(&h->lock);
1106cd575ddfSKent Overstreet 			goto found;
1107cd575ddfSKent Overstreet 		}
1108cd575ddfSKent Overstreet 
1109cd575ddfSKent Overstreet 	h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
1110cd575ddfSKent Overstreet found:
1111cd575ddfSKent Overstreet 	mutex_unlock(&c->ec_new_stripe_lock);
1112cd575ddfSKent Overstreet 	return h;
1113cd575ddfSKent Overstreet }
1114cd575ddfSKent Overstreet 
1115cd575ddfSKent Overstreet void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
1116cd575ddfSKent Overstreet {
1117cd575ddfSKent Overstreet 	struct ec_stripe_head *h;
1118cd575ddfSKent Overstreet 	struct open_bucket *ob;
1119cd575ddfSKent Overstreet 	unsigned i;
1120cd575ddfSKent Overstreet 
1121cd575ddfSKent Overstreet 	mutex_lock(&c->ec_new_stripe_lock);
1122cd575ddfSKent Overstreet 	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
1123cd575ddfSKent Overstreet 		struct ec_stripe_new *s = NULL;
1124cd575ddfSKent Overstreet 
1125cd575ddfSKent Overstreet 		mutex_lock(&h->lock);
1126cd575ddfSKent Overstreet 		bch2_open_buckets_stop_dev(c, ca,
1127cd575ddfSKent Overstreet 					   &h->blocks,
1128cd575ddfSKent Overstreet 					   BCH_DATA_USER);
1129cd575ddfSKent Overstreet 		bch2_open_buckets_stop_dev(c, ca,
1130cd575ddfSKent Overstreet 					   &h->parity,
1131cd575ddfSKent Overstreet 					   BCH_DATA_USER);
1132cd575ddfSKent Overstreet 
1133cd575ddfSKent Overstreet 		if (!h->s)
1134cd575ddfSKent Overstreet 			goto unlock;
1135cd575ddfSKent Overstreet 
1136cd575ddfSKent Overstreet 		open_bucket_for_each(c, &h->s->blocks, ob, i)
1137cd575ddfSKent Overstreet 			if (ob->ptr.dev == ca->dev_idx)
1138cd575ddfSKent Overstreet 				goto found;
1139cd575ddfSKent Overstreet 		open_bucket_for_each(c, &h->s->parity, ob, i)
1140cd575ddfSKent Overstreet 			if (ob->ptr.dev == ca->dev_idx)
1141cd575ddfSKent Overstreet 				goto found;
1142cd575ddfSKent Overstreet 		goto unlock;
1143cd575ddfSKent Overstreet found:
1144cd575ddfSKent Overstreet 		h->s->err = -1;
1145cd575ddfSKent Overstreet 		s = ec_stripe_set_pending(h);
1146cd575ddfSKent Overstreet unlock:
1147cd575ddfSKent Overstreet 		mutex_unlock(&h->lock);
1148cd575ddfSKent Overstreet 
1149cd575ddfSKent Overstreet 		if (s)
1150cd575ddfSKent Overstreet 			ec_stripe_new_put(s);
1151cd575ddfSKent Overstreet 	}
1152cd575ddfSKent Overstreet 	mutex_unlock(&c->ec_new_stripe_lock);
1153cd575ddfSKent Overstreet }
1154cd575ddfSKent Overstreet 
1155*0564b167SKent Overstreet static int __bch2_stripe_write_key(struct btree_trans *trans,
115661c8d7c8SKent Overstreet 				   struct btree_iter *iter,
115761c8d7c8SKent Overstreet 				   struct stripe *m,
115861c8d7c8SKent Overstreet 				   size_t idx,
115961c8d7c8SKent Overstreet 				   struct bkey_i_stripe *new_key,
116061c8d7c8SKent Overstreet 				   unsigned flags)
116161c8d7c8SKent Overstreet {
1162*0564b167SKent Overstreet 	struct bch_fs *c = trans->c;
116361c8d7c8SKent Overstreet 	struct bkey_s_c k;
116461c8d7c8SKent Overstreet 	unsigned i;
116561c8d7c8SKent Overstreet 	int ret;
116661c8d7c8SKent Overstreet 
116761c8d7c8SKent Overstreet 	bch2_btree_iter_set_pos(iter, POS(0, idx));
116861c8d7c8SKent Overstreet 
116961c8d7c8SKent Overstreet 	k = bch2_btree_iter_peek_slot(iter);
117061c8d7c8SKent Overstreet 	ret = btree_iter_err(k);
117161c8d7c8SKent Overstreet 	if (ret)
117261c8d7c8SKent Overstreet 		return ret;
117361c8d7c8SKent Overstreet 
117461c8d7c8SKent Overstreet 	if (k.k->type != KEY_TYPE_stripe)
117561c8d7c8SKent Overstreet 		return -EIO;
117661c8d7c8SKent Overstreet 
117761c8d7c8SKent Overstreet 	bkey_reassemble(&new_key->k_i, k);
117861c8d7c8SKent Overstreet 
117961c8d7c8SKent Overstreet 	spin_lock(&c->ec_stripes_heap_lock);
118061c8d7c8SKent Overstreet 
118161c8d7c8SKent Overstreet 	for (i = 0; i < new_key->v.nr_blocks; i++)
118261c8d7c8SKent Overstreet 		stripe_blockcount_set(&new_key->v, i,
118361c8d7c8SKent Overstreet 				      m->block_sectors[i]);
118461c8d7c8SKent Overstreet 	m->dirty = false;
118561c8d7c8SKent Overstreet 
118661c8d7c8SKent Overstreet 	spin_unlock(&c->ec_stripes_heap_lock);
118761c8d7c8SKent Overstreet 
1188*0564b167SKent Overstreet 	bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i));
1189*0564b167SKent Overstreet 
1190*0564b167SKent Overstreet 	return bch2_trans_commit(trans, NULL, NULL,
1191*0564b167SKent Overstreet 				 BTREE_INSERT_NOFAIL|flags);
119261c8d7c8SKent Overstreet }
119361c8d7c8SKent Overstreet 
119461c8d7c8SKent Overstreet int bch2_stripes_write(struct bch_fs *c, bool *wrote)
119561c8d7c8SKent Overstreet {
1196*0564b167SKent Overstreet 	struct btree_trans trans;
1197*0564b167SKent Overstreet 	struct btree_iter *iter;
119861c8d7c8SKent Overstreet 	struct genradix_iter giter;
119961c8d7c8SKent Overstreet 	struct bkey_i_stripe *new_key;
120061c8d7c8SKent Overstreet 	struct stripe *m;
120161c8d7c8SKent Overstreet 	int ret = 0;
120261c8d7c8SKent Overstreet 
120361c8d7c8SKent Overstreet 	new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
120461c8d7c8SKent Overstreet 	BUG_ON(!new_key);
120561c8d7c8SKent Overstreet 
1206*0564b167SKent Overstreet 	bch2_trans_init(&trans, c);
1207*0564b167SKent Overstreet 
1208*0564b167SKent Overstreet 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN,
120961c8d7c8SKent Overstreet 				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
121061c8d7c8SKent Overstreet 
121161c8d7c8SKent Overstreet 	genradix_for_each(&c->stripes[0], giter, m) {
121261c8d7c8SKent Overstreet 		if (!m->dirty)
121361c8d7c8SKent Overstreet 			continue;
121461c8d7c8SKent Overstreet 
1215*0564b167SKent Overstreet 		ret = __bch2_stripe_write_key(&trans, iter, m, giter.pos,
121661c8d7c8SKent Overstreet 					new_key, BTREE_INSERT_NOCHECK_RW);
121761c8d7c8SKent Overstreet 		if (ret)
121861c8d7c8SKent Overstreet 			break;
121961c8d7c8SKent Overstreet 
122061c8d7c8SKent Overstreet 		*wrote = true;
122161c8d7c8SKent Overstreet 	}
122261c8d7c8SKent Overstreet 
1223*0564b167SKent Overstreet 	bch2_trans_exit(&trans);
122461c8d7c8SKent Overstreet 
122561c8d7c8SKent Overstreet 	kfree(new_key);
122661c8d7c8SKent Overstreet 
122761c8d7c8SKent Overstreet 	return ret;
122861c8d7c8SKent Overstreet }
122961c8d7c8SKent Overstreet 
123061c8d7c8SKent Overstreet static void bch2_stripe_read_key(struct bch_fs *c, struct bkey_s_c k)
123161c8d7c8SKent Overstreet {
123261c8d7c8SKent Overstreet 
123361c8d7c8SKent Overstreet 	struct gc_pos pos = { 0 };
123461c8d7c8SKent Overstreet 
123561c8d7c8SKent Overstreet 	bch2_mark_key(c, k, true, 0, pos, NULL, 0, 0);
123661c8d7c8SKent Overstreet }
123761c8d7c8SKent Overstreet 
123861c8d7c8SKent Overstreet int bch2_stripes_read(struct bch_fs *c, struct list_head *journal_replay_list)
123961c8d7c8SKent Overstreet {
124061c8d7c8SKent Overstreet 	struct journal_replay *r;
124161c8d7c8SKent Overstreet 	struct btree_iter iter;
124261c8d7c8SKent Overstreet 	struct bkey_s_c k;
124361c8d7c8SKent Overstreet 	int ret;
124461c8d7c8SKent Overstreet 
124561c8d7c8SKent Overstreet 	ret = bch2_fs_ec_start(c);
124661c8d7c8SKent Overstreet 	if (ret)
124761c8d7c8SKent Overstreet 		return ret;
124861c8d7c8SKent Overstreet 
124961c8d7c8SKent Overstreet 	for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, 0, k) {
125061c8d7c8SKent Overstreet 		bch2_stripe_read_key(c, k);
125161c8d7c8SKent Overstreet 		bch2_btree_iter_cond_resched(&iter);
125261c8d7c8SKent Overstreet 	}
125361c8d7c8SKent Overstreet 
125461c8d7c8SKent Overstreet 	ret = bch2_btree_iter_unlock(&iter);
125561c8d7c8SKent Overstreet 	if (ret)
125661c8d7c8SKent Overstreet 		return ret;
125761c8d7c8SKent Overstreet 
125861c8d7c8SKent Overstreet 	list_for_each_entry(r, journal_replay_list, list) {
125961c8d7c8SKent Overstreet 		struct bkey_i *k, *n;
126061c8d7c8SKent Overstreet 		struct jset_entry *entry;
126161c8d7c8SKent Overstreet 
126261c8d7c8SKent Overstreet 		for_each_jset_key(k, n, entry, &r->j)
126361c8d7c8SKent Overstreet 			if (entry->btree_id == BTREE_ID_EC)
126461c8d7c8SKent Overstreet 				bch2_stripe_read_key(c, bkey_i_to_s_c(k));
126561c8d7c8SKent Overstreet 	}
126661c8d7c8SKent Overstreet 
126761c8d7c8SKent Overstreet 	return 0;
126861c8d7c8SKent Overstreet }
126961c8d7c8SKent Overstreet 
1270dfe9bfb3SKent Overstreet int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
1271cd575ddfSKent Overstreet {
1272cd575ddfSKent Overstreet 	struct btree_iter iter;
1273cd575ddfSKent Overstreet 	struct bkey_s_c k;
1274cd575ddfSKent Overstreet 	size_t i, idx = 0;
1275cd575ddfSKent Overstreet 	int ret = 0;
1276cd575ddfSKent Overstreet 
1277cd575ddfSKent Overstreet 	bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS(0, U64_MAX), 0);
1278cd575ddfSKent Overstreet 
1279cd575ddfSKent Overstreet 	k = bch2_btree_iter_prev(&iter);
1280cd575ddfSKent Overstreet 	if (!IS_ERR_OR_NULL(k.k))
1281cd575ddfSKent Overstreet 		idx = k.k->p.offset + 1;
1282cd575ddfSKent Overstreet 	ret = bch2_btree_iter_unlock(&iter);
1283cd575ddfSKent Overstreet 	if (ret)
1284cd575ddfSKent Overstreet 		return ret;
1285cd575ddfSKent Overstreet 
1286dfe9bfb3SKent Overstreet 	if (!gc &&
1287dfe9bfb3SKent Overstreet 	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
1288cd575ddfSKent Overstreet 		       GFP_KERNEL))
1289cd575ddfSKent Overstreet 		return -ENOMEM;
1290cd575ddfSKent Overstreet #if 0
1291dfe9bfb3SKent Overstreet 	ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
1292cd575ddfSKent Overstreet #else
1293cd575ddfSKent Overstreet 	for (i = 0; i < idx; i++)
1294dfe9bfb3SKent Overstreet 		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
1295cd575ddfSKent Overstreet 			return -ENOMEM;
1296cd575ddfSKent Overstreet #endif
1297cd575ddfSKent Overstreet 	return 0;
1298cd575ddfSKent Overstreet }
1299cd575ddfSKent Overstreet 
1300dfe9bfb3SKent Overstreet int bch2_fs_ec_start(struct bch_fs *c)
1301dfe9bfb3SKent Overstreet {
1302dfe9bfb3SKent Overstreet 	return bch2_ec_mem_alloc(c, false);
1303dfe9bfb3SKent Overstreet }
1304dfe9bfb3SKent Overstreet 
1305cd575ddfSKent Overstreet void bch2_fs_ec_exit(struct bch_fs *c)
1306cd575ddfSKent Overstreet {
1307cd575ddfSKent Overstreet 	struct ec_stripe_head *h;
1308cd575ddfSKent Overstreet 
1309cd575ddfSKent Overstreet 	while (1) {
1310cd575ddfSKent Overstreet 		mutex_lock(&c->ec_new_stripe_lock);
1311cd575ddfSKent Overstreet 		h = list_first_entry_or_null(&c->ec_new_stripe_list,
1312cd575ddfSKent Overstreet 					     struct ec_stripe_head, list);
1313cd575ddfSKent Overstreet 		if (h)
1314cd575ddfSKent Overstreet 			list_del(&h->list);
1315cd575ddfSKent Overstreet 		mutex_unlock(&c->ec_new_stripe_lock);
1316cd575ddfSKent Overstreet 		if (!h)
1317cd575ddfSKent Overstreet 			break;
1318cd575ddfSKent Overstreet 
1319cd575ddfSKent Overstreet 		BUG_ON(h->s);
1320cd575ddfSKent Overstreet 		BUG_ON(!list_empty(&h->stripes));
1321cd575ddfSKent Overstreet 		kfree(h);
1322cd575ddfSKent Overstreet 	}
1323cd575ddfSKent Overstreet 
1324cd575ddfSKent Overstreet 	free_heap(&c->ec_stripes_heap);
1325dfe9bfb3SKent Overstreet 	genradix_free(&c->stripes[0]);
1326cd575ddfSKent Overstreet 	bioset_exit(&c->ec_bioset);
1327cd575ddfSKent Overstreet }
1328cd575ddfSKent Overstreet 
1329cd575ddfSKent Overstreet int bch2_fs_ec_init(struct bch_fs *c)
1330cd575ddfSKent Overstreet {
1331cd575ddfSKent Overstreet 	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
1332cd575ddfSKent Overstreet 
1333cd575ddfSKent Overstreet 	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
1334cd575ddfSKent Overstreet 			   BIOSET_NEED_BVECS);
1335cd575ddfSKent Overstreet }
1336