xref: /linux/fs/bcachefs/ec.c (revision ab52c59103002b49f2455371e4b9c56ba3ef1781)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /* erasure coding */
4 
5 #include "bcachefs.h"
6 #include "alloc_background.h"
7 #include "alloc_foreground.h"
8 #include "backpointers.h"
9 #include "bkey_buf.h"
10 #include "bset.h"
11 #include "btree_gc.h"
12 #include "btree_update.h"
13 #include "btree_write_buffer.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "disk_groups.h"
17 #include "ec.h"
18 #include "error.h"
19 #include "io_read.h"
20 #include "keylist.h"
21 #include "recovery.h"
22 #include "replicas.h"
23 #include "super-io.h"
24 #include "util.h"
25 
26 #include <linux/sort.h>
27 
28 #ifdef __KERNEL__
29 
30 #include <linux/raid/pq.h>
31 #include <linux/raid/xor.h>
32 
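/*
 * Recover a single failed block (data or parity) by XORing together all the
 * other blocks; the failed block is temporarily swapped into slot 0 so that
 * xor_blocks() can accumulate into its buffer.
 */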
33 static void raid5_recov(unsigned disks, unsigned failed_idx,
34 			size_t size, void **data)
35 {
36 	unsigned i = 2, nr;
37 
38 	BUG_ON(failed_idx >= disks);
39 
40 	swap(data[0], data[failed_idx]);
41 	memcpy(data[0], data[1], size);
42 
43 	while (i < disks) {
44 		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
45 		xor_blocks(nr, size, data[0], data + i);
46 		i += nr;
47 	}
48 
49 	swap(data[0], data[failed_idx]);
50 }
51 
52 static void raid_gen(int nd, int np, size_t size, void **v)
53 {
54 	if (np >= 1)
55 		raid5_recov(nd + np, nd, size, v);
56 	if (np >= 2)
57 		raid6_call.gen_syndrome(nd + np, size, v);
58 	BUG_ON(np > 2);
59 }
60 
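/*
 * Reconstruct up to two failed blocks, dispatching on how many blocks failed
 * and whether the failures hit data, the P block or the Q block. @ir lists
 * the failed block indices (assumed to be in ascending order).
 */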
61 static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
62 {
63 	switch (nr) {
64 	case 0:
65 		break;
66 	case 1:
67 		if (ir[0] < nd + 1)
68 			raid5_recov(nd + 1, ir[0], size, v);
69 		else
70 			raid6_call.gen_syndrome(nd + np, size, v);
71 		break;
72 	case 2:
73 		if (ir[1] < nd) {
74 			/* data+data failure. */
75 			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
76 		} else if (ir[0] < nd) {
77 			/* data + p/q failure */
78 
79 			if (ir[1] == nd) /* data + p failure */
80 				raid6_datap_recov(nd + np, size, ir[0], v);
81 			else { /* data + q failure */
82 				raid5_recov(nd + 1, ir[0], size, v);
83 				raid6_call.gen_syndrome(nd + np, size, v);
84 			}
85 		} else {
86 			raid_gen(nd, np, size, v);
87 		}
88 		break;
89 	default:
90 		BUG();
91 	}
92 }
93 
94 #else
95 
96 #include <raid/raid.h>
97 
98 #endif
99 
100 struct ec_bio {
101 	struct bch_dev		*ca;
102 	struct ec_stripe_buf	*buf;
103 	size_t			idx;
104 	struct bio		bio;
105 };
106 
107 /* Stripes btree keys: */
108 
109 int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
110 			enum bch_validate_flags flags,
111 			struct printbuf *err)
112 {
113 	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
114 	int ret = 0;
115 
116 	bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
117 			 bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
118 			 stripe_pos_bad,
119 			 "stripe at bad pos");
120 
121 	bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
122 			 stripe_val_size_bad,
123 			 "incorrect value size (%zu < %u)",
124 			 bkey_val_u64s(k.k), stripe_val_u64s(s));
125 
126 	ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
127 fsck_err:
128 	return ret;
129 }
130 
131 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
132 			 struct bkey_s_c k)
133 {
134 	const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
135 	struct bch_stripe s = {};
136 
137 	memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
138 
139 	unsigned nr_data = s.nr_blocks - s.nr_redundant;
140 
141 	prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
142 		   s.algorithm,
143 		   le16_to_cpu(s.sectors),
144 		   nr_data,
145 		   s.nr_redundant);
146 	bch2_prt_csum_type(out, s.csum_type);
147 	prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
148 
149 	for (unsigned i = 0; i < s.nr_blocks; i++) {
150 		const struct bch_extent_ptr *ptr = sp->ptrs + i;
151 
152 		if ((void *) ptr >= bkey_val_end(k))
153 			break;
154 
155 		bch2_extent_ptr_to_text(out, c, ptr);
156 
157 		if (s.csum_type < BCH_CSUM_NR &&
158 		    i < nr_data &&
159 		    stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
160 			prt_printf(out,  "#%u", stripe_blockcount_get(sp, i));
161 	}
162 }
163 
164 /* Triggers: */
165 
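/*
 * Update a single bucket's alloc info for a stripe being created or deleted:
 * check that the bucket state is consistent with the stripe key, account the
 * parity sectors, and set or clear the bucket's stripe backpointer fields.
 */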
166 static int __mark_stripe_bucket(struct btree_trans *trans,
167 				struct bch_dev *ca,
168 				struct bkey_s_c_stripe s,
169 				unsigned ptr_idx, bool deleting,
170 				struct bpos bucket,
171 				struct bch_alloc_v4 *a,
172 				enum btree_iter_update_trigger_flags flags)
173 {
174 	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
175 	unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
176 	bool parity = ptr_idx >= nr_data;
177 	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
178 	s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
179 	struct printbuf buf = PRINTBUF;
180 	int ret = 0;
181 
182 	struct bch_fs *c = trans->c;
183 	if (deleting)
184 		sectors = -sectors;
185 
186 	if (!deleting) {
187 		if (bch2_trans_inconsistent_on(a->stripe ||
188 					       a->stripe_redundancy, trans,
189 				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
190 				bucket.inode, bucket.offset, a->gen,
191 				bch2_data_type_str(a->data_type),
192 				a->dirty_sectors,
193 				a->stripe, s.k->p.offset,
194 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
195 			ret = -EIO;
196 			goto err;
197 		}
198 
199 		if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
200 				"bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
201 				bucket.inode, bucket.offset, a->gen,
202 				bch2_data_type_str(a->data_type),
203 				a->dirty_sectors,
204 				a->cached_sectors,
205 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
206 			ret = -EIO;
207 			goto err;
208 		}
209 	} else {
210 		if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
211 					       a->stripe_redundancy != s.v->nr_redundant, trans,
212 				"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
213 				bucket.inode, bucket.offset, a->gen,
214 				a->stripe,
215 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
216 			ret = -EIO;
217 			goto err;
218 		}
219 
220 		if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
221 				"bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
222 				bucket.inode, bucket.offset, a->gen,
223 				bch2_data_type_str(a->data_type),
224 				bch2_data_type_str(data_type),
225 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
226 			ret = -EIO;
227 			goto err;
228 		}
229 
230 		if (bch2_trans_inconsistent_on(parity &&
231 					       (a->dirty_sectors != -sectors ||
232 						a->cached_sectors), trans,
233 				"bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
234 				bucket.inode, bucket.offset, a->gen,
235 				a->dirty_sectors,
236 				a->cached_sectors,
237 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
238 			ret = -EIO;
239 			goto err;
240 		}
241 	}
242 
243 	if (sectors) {
244 		ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
245 					     a->gen, a->data_type, &a->dirty_sectors);
246 		if (ret)
247 			goto err;
248 	}
249 
250 	if (!deleting) {
251 		a->stripe		= s.k->p.offset;
252 		a->stripe_redundancy	= s.v->nr_redundant;
253 	} else {
254 		a->stripe		= 0;
255 		a->stripe_redundancy	= 0;
256 	}
257 
258 	alloc_data_type_set(a, data_type);
259 err:
260 	printbuf_exit(&buf);
261 	return ret;
262 }
263 
264 static int mark_stripe_bucket(struct btree_trans *trans,
265 			      struct bkey_s_c_stripe s,
266 			      unsigned ptr_idx, bool deleting,
267 			      enum btree_iter_update_trigger_flags flags)
268 {
269 	struct bch_fs *c = trans->c;
270 	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
271 	int ret = 0;
272 
273 	struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
274 	if (unlikely(!ca)) {
275 		if (!(flags & BTREE_TRIGGER_overwrite))
276 			ret = -EIO;
277 		goto err;
278 	}
279 
280 	struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
281 
282 	if (flags & BTREE_TRIGGER_transactional) {
283 		struct bkey_i_alloc_v4 *a =
284 			bch2_trans_start_alloc_update(trans, bucket);
285 		ret = PTR_ERR_OR_ZERO(a) ?:
286 			__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
287 	}
288 
289 	if (flags & BTREE_TRIGGER_gc) {
290 		percpu_down_read(&c->mark_lock);
291 		struct bucket *g = gc_bucket(ca, bucket.offset);
292 		bucket_lock(g);
293 		struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
294 		ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
295 		if (!ret) {
296 			alloc_to_bucket(g, new);
297 			bch2_dev_usage_update(c, ca, &old, &new, 0, true);
298 		}
299 		bucket_unlock(g);
300 		percpu_up_read(&c->mark_lock);
301 	}
302 err:
303 	bch2_dev_put(ca);
304 	return ret;
305 }
306 
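/*
 * Mark/unmark the buckets referenced by a stripe: for each block whose
 * pointer changed, mark the new stripe's bucket and unmark the old one.
 */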
307 static int mark_stripe_buckets(struct btree_trans *trans,
308 			       struct bkey_s_c old, struct bkey_s_c new,
309 			       enum btree_iter_update_trigger_flags flags)
310 {
311 	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
312 		? bkey_s_c_to_stripe(old).v : NULL;
313 	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
314 		? bkey_s_c_to_stripe(new).v : NULL;
315 
316 	BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
317 
318 	unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
319 
320 	for (unsigned i = 0; i < nr_blocks; i++) {
321 		if (new_s && old_s &&
322 		    !memcmp(&new_s->ptrs[i],
323 			    &old_s->ptrs[i],
324 			    sizeof(new_s->ptrs[i])))
325 			continue;
326 
327 		if (new_s) {
328 			int ret = mark_stripe_bucket(trans,
329 					bkey_s_c_to_stripe(new), i, false, flags);
330 			if (ret)
331 				return ret;
332 		}
333 
334 		if (old_s) {
335 			int ret = mark_stripe_bucket(trans,
336 					bkey_s_c_to_stripe(old), i, true, flags);
337 			if (ret)
338 				return ret;
339 		}
340 	}
341 
342 	return 0;
343 }
344 
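/*
 * Trigger for stripe keys: the transactional path updates replicas accounting
 * and the buckets the stripe points to; the atomic path keeps the in-memory
 * stripes radix tree and heap in sync; the gc path rebuilds the gc_stripes
 * table.
 */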
345 int bch2_trigger_stripe(struct btree_trans *trans,
346 			enum btree_id btree, unsigned level,
347 			struct bkey_s_c old, struct bkey_s _new,
348 			enum btree_iter_update_trigger_flags flags)
349 {
350 	struct bkey_s_c new = _new.s_c;
351 	struct bch_fs *c = trans->c;
352 	u64 idx = new.k->p.offset;
353 	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
354 		? bkey_s_c_to_stripe(old).v : NULL;
355 	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
356 		? bkey_s_c_to_stripe(new).v : NULL;
357 
358 	if (unlikely(flags & BTREE_TRIGGER_check_repair))
359 		return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
360 
361 	if (flags & BTREE_TRIGGER_transactional) {
362 		/*
363 		 * If the pointers aren't changing, we don't need to do anything:
364 		 */
365 		if (new_s && old_s &&
366 		    new_s->nr_blocks	== old_s->nr_blocks &&
367 		    new_s->nr_redundant	== old_s->nr_redundant &&
368 		    !memcmp(old_s->ptrs, new_s->ptrs,
369 			    new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
370 			return 0;
371 
372 		BUG_ON(new_s && old_s &&
373 		       (new_s->nr_blocks	!= old_s->nr_blocks ||
374 			new_s->nr_redundant	!= old_s->nr_redundant));
375 
376 		if (new_s) {
377 			s64 sectors = le16_to_cpu(new_s->sectors);
378 
379 			struct bch_replicas_padded r;
380 			bch2_bkey_to_replicas(&r.e, new);
381 			int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
382 			if (ret)
383 				return ret;
384 		}
385 
386 		if (old_s) {
387 			s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
388 
389 			struct bch_replicas_padded r;
390 			bch2_bkey_to_replicas(&r.e, old);
391 			int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
392 			if (ret)
393 				return ret;
394 		}
395 
396 		int ret = mark_stripe_buckets(trans, old, new, flags);
397 		if (ret)
398 			return ret;
399 	}
400 
401 	if (flags & BTREE_TRIGGER_atomic) {
402 		struct stripe *m = genradix_ptr(&c->stripes, idx);
403 
404 		if (!m) {
405 			struct printbuf buf1 = PRINTBUF;
406 			struct printbuf buf2 = PRINTBUF;
407 
408 			bch2_bkey_val_to_text(&buf1, c, old);
409 			bch2_bkey_val_to_text(&buf2, c, new);
410 			bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
411 					    "old %s\n"
412 					    "new %s", idx, buf1.buf, buf2.buf);
413 			printbuf_exit(&buf2);
414 			printbuf_exit(&buf1);
415 			bch2_inconsistent_error(c);
416 			return -1;
417 		}
418 
419 		if (!new_s) {
420 			bch2_stripes_heap_del(c, m, idx);
421 
422 			memset(m, 0, sizeof(*m));
423 		} else {
424 			m->sectors	= le16_to_cpu(new_s->sectors);
425 			m->algorithm	= new_s->algorithm;
426 			m->nr_blocks	= new_s->nr_blocks;
427 			m->nr_redundant	= new_s->nr_redundant;
428 			m->blocks_nonempty = 0;
429 
430 			for (unsigned i = 0; i < new_s->nr_blocks; i++)
431 				m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
432 
433 			if (!old_s)
434 				bch2_stripes_heap_insert(c, m, idx);
435 			else
436 				bch2_stripes_heap_update(c, m, idx);
437 		}
438 	}
439 
440 	if (flags & BTREE_TRIGGER_gc) {
441 		struct gc_stripe *m =
442 			genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
443 
444 		if (!m) {
445 			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
446 				idx);
447 			return -BCH_ERR_ENOMEM_mark_stripe;
448 		}
449 		/*
450 		 * This will be wrong when we bring back runtime gc: we should
451 		 * be unmarking the old key and then marking the new key
452 		 */
453 		m->alive	= true;
454 		m->sectors	= le16_to_cpu(new_s->sectors);
455 		m->nr_blocks	= new_s->nr_blocks;
456 		m->nr_redundant	= new_s->nr_redundant;
457 
458 		for (unsigned i = 0; i < new_s->nr_blocks; i++)
459 			m->ptrs[i] = new_s->ptrs[i];
460 
461 		bch2_bkey_to_replicas(&m->r.e, new);
462 
463 		/*
464 		 * gc recalculates this field from stripe ptr
465 		 * references:
466 		 */
467 		memset(m->block_sectors, 0, sizeof(m->block_sectors));
468 
469 		int ret = mark_stripe_buckets(trans, old, new, flags);
470 		if (ret)
471 			return ret;
472 
473 		ret = bch2_update_replicas(c, new, &m->r.e,
474 				      ((s64) m->sectors * m->nr_redundant),
475 				      0, true);
476 		if (ret) {
477 			struct printbuf buf = PRINTBUF;
478 
479 			bch2_bkey_val_to_text(&buf, c, new);
480 			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
481 			printbuf_exit(&buf);
482 			return ret;
483 		}
484 	}
485 
486 	return 0;
487 }
488 
489 /* returns the matching pointer, and sets *block to the stripe block it matched: */
490 static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
491 						struct bkey_s_c k, unsigned *block)
492 {
493 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
494 	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
495 
496 	bkey_for_each_ptr(ptrs, ptr)
497 		for (i = 0; i < nr_data; i++)
498 			if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
499 						      le16_to_cpu(s->sectors))) {
500 				*block = i;
501 				return ptr;
502 			}
503 
504 	return NULL;
505 }
506 
507 static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
508 {
509 	switch (k.k->type) {
510 	case KEY_TYPE_extent: {
511 		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
512 		const union bch_extent_entry *entry;
513 
514 		extent_for_each_entry(e, entry)
515 			if (extent_entry_type(entry) ==
516 			    BCH_EXTENT_ENTRY_stripe_ptr &&
517 			    entry->stripe_ptr.idx == idx)
518 				return true;
519 
520 		break;
521 	}
522 	}
523 
524 	return false;
525 }
526 
527 /* Stripe bufs: */
528 
529 static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
530 {
531 	if (buf->key.k.type == KEY_TYPE_stripe) {
532 		struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
533 		unsigned i;
534 
535 		for (i = 0; i < s->v.nr_blocks; i++) {
536 			kvfree(buf->data[i]);
537 			buf->data[i] = NULL;
538 		}
539 	}
540 }
541 
542 /* XXX: this is a non-mempoolified memory allocation: */
543 static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
544 			      unsigned offset, unsigned size)
545 {
546 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
547 	unsigned csum_granularity = 1U << v->csum_granularity_bits;
548 	unsigned end = offset + size;
549 	unsigned i;
550 
551 	BUG_ON(end > le16_to_cpu(v->sectors));
552 
553 	offset	= round_down(offset, csum_granularity);
554 	end	= min_t(unsigned, le16_to_cpu(v->sectors),
555 			round_up(end, csum_granularity));
556 
557 	buf->offset	= offset;
558 	buf->size	= end - offset;
559 
560 	memset(buf->valid, 0xFF, sizeof(buf->valid));
561 
562 	for (i = 0; i < v->nr_blocks; i++) {
563 		buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
564 		if (!buf->data[i])
565 			goto err;
566 	}
567 
568 	return 0;
569 err:
570 	ec_stripe_buf_exit(buf);
571 	return -BCH_ERR_ENOMEM_stripe_buf;
572 }
573 
574 /* Checksumming: */
575 
576 static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
577 					 unsigned block, unsigned offset)
578 {
579 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
580 	unsigned csum_granularity = 1 << v->csum_granularity_bits;
581 	unsigned end = buf->offset + buf->size;
582 	unsigned len = min(csum_granularity, end - offset);
583 
584 	BUG_ON(offset >= end);
585 	BUG_ON(offset <  buf->offset);
586 	BUG_ON(offset & (csum_granularity - 1));
587 	BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
588 	       (len & (csum_granularity - 1)));
589 
590 	return bch2_checksum(NULL, v->csum_type,
591 			     null_nonce(),
592 			     buf->data[block] + ((offset - buf->offset) << 9),
593 			     len << 9);
594 }
595 
596 static void ec_generate_checksums(struct ec_stripe_buf *buf)
597 {
598 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
599 	unsigned i, j, csums_per_device = stripe_csums_per_device(v);
600 
601 	if (!v->csum_type)
602 		return;
603 
604 	BUG_ON(buf->offset);
605 	BUG_ON(buf->size != le16_to_cpu(v->sectors));
606 
607 	for (i = 0; i < v->nr_blocks; i++)
608 		for (j = 0; j < csums_per_device; j++)
609 			stripe_csum_set(v, i, j,
610 				ec_block_checksum(buf, i, j << v->csum_granularity_bits));
611 }
612 
613 static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
614 {
615 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
616 	unsigned csum_granularity = 1 << v->csum_granularity_bits;
617 	unsigned i;
618 
619 	if (!v->csum_type)
620 		return;
621 
622 	for (i = 0; i < v->nr_blocks; i++) {
623 		unsigned offset = buf->offset;
624 		unsigned end = buf->offset + buf->size;
625 
626 		if (!test_bit(i, buf->valid))
627 			continue;
628 
629 		while (offset < end) {
630 			unsigned j = offset >> v->csum_granularity_bits;
631 			unsigned len = min(csum_granularity, end - offset);
632 			struct bch_csum want = stripe_csum_get(v, i, j);
633 			struct bch_csum got = ec_block_checksum(buf, i, offset);
634 
635 			if (bch2_crc_cmp(want, got)) {
636 				struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
637 				if (ca) {
638 					struct printbuf err = PRINTBUF;
639 
640 					prt_str(&err, "stripe ");
641 					bch2_csum_err_msg(&err, v->csum_type, want, got);
642 					prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
643 					bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
644 					bch_err_ratelimited(ca, "%s", err.buf);
645 					printbuf_exit(&err);
646 
647 					bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
648 				}
649 
650 				clear_bit(i, buf->valid);
651 				break;
652 			}
653 
654 			offset += len;
655 		}
656 	}
657 }
658 
659 /* Erasure coding: */
660 
661 static void ec_generate_ec(struct ec_stripe_buf *buf)
662 {
663 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
664 	unsigned nr_data = v->nr_blocks - v->nr_redundant;
665 	unsigned bytes = le16_to_cpu(v->sectors) << 9;
666 
667 	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
668 }
669 
670 static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
671 {
672 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
673 
674 	return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
675 }
676 
677 static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
678 {
679 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
680 	unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
681 	unsigned nr_data = v->nr_blocks - v->nr_redundant;
682 	unsigned bytes = buf->size << 9;
683 
684 	if (ec_nr_failed(buf) > v->nr_redundant) {
685 		bch_err_ratelimited(c,
686 			"error doing reconstruct read: unable to read enough blocks");
687 		return -1;
688 	}
689 
690 	for (i = 0; i < nr_data; i++)
691 		if (!test_bit(i, buf->valid))
692 			failed[nr_failed++] = i;
693 
694 	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
695 	return 0;
696 }
697 
698 /* IO: */
699 
700 static void ec_block_endio(struct bio *bio)
701 {
702 	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
703 	struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
704 	struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
705 	struct bch_dev *ca = ec_bio->ca;
706 	struct closure *cl = bio->bi_private;
707 
708 	if (bch2_dev_io_err_on(bio->bi_status, ca,
709 			       bio_data_dir(bio)
710 			       ? BCH_MEMBER_ERROR_write
711 			       : BCH_MEMBER_ERROR_read,
712 			       "erasure coding %s error: %s",
713 			       bio_data_dir(bio) ? "write" : "read",
714 			       bch2_blk_status_to_str(bio->bi_status)))
715 		clear_bit(ec_bio->idx, ec_bio->buf->valid);
716 
717 	if (dev_ptr_stale(ca, ptr)) {
718 		bch_err_ratelimited(ca->fs,
719 				    "error %s stripe: stale pointer after io",
720 				    bio_data_dir(bio) == READ ? "reading from" : "writing to");
721 		clear_bit(ec_bio->idx, ec_bio->buf->valid);
722 	}
723 
724 	bio_put(&ec_bio->bio);
725 	percpu_ref_put(&ca->io_ref);
726 	closure_put(cl);
727 }
728 
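/*
 * Read or write one block of a stripe buffer, splitting the transfer into
 * bios of at most BIO_MAX_VECS pages; completions are tracked via @cl.
 */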
729 static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
730 			blk_opf_t opf, unsigned idx, struct closure *cl)
731 {
732 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
733 	unsigned offset = 0, bytes = buf->size << 9;
734 	struct bch_extent_ptr *ptr = &v->ptrs[idx];
735 	enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
736 		? BCH_DATA_user
737 		: BCH_DATA_parity;
738 	int rw = op_is_write(opf);
739 
740 	struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
741 	if (!ca) {
742 		clear_bit(idx, buf->valid);
743 		return;
744 	}
745 
746 	if (dev_ptr_stale(ca, ptr)) {
747 		bch_err_ratelimited(c,
748 				    "error %s stripe: stale pointer",
749 				    rw == READ ? "reading from" : "writing to");
750 		clear_bit(idx, buf->valid);
		/* drop the ref taken by bch2_dev_get_ioref() */
		percpu_ref_put(&ca->io_ref);
751 		return;
752 	}
753 
755 	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
756 
757 	while (offset < bytes) {
758 		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
759 					   DIV_ROUND_UP(bytes, PAGE_SIZE));
760 		unsigned b = min_t(size_t, bytes - offset,
761 				   nr_iovecs << PAGE_SHIFT);
762 		struct ec_bio *ec_bio;
763 
764 		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
765 						       nr_iovecs,
766 						       opf,
767 						       GFP_KERNEL,
768 						       &c->ec_bioset),
769 				      struct ec_bio, bio);
770 
771 		ec_bio->ca			= ca;
772 		ec_bio->buf			= buf;
773 		ec_bio->idx			= idx;
774 
775 		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
776 		ec_bio->bio.bi_end_io		= ec_block_endio;
777 		ec_bio->bio.bi_private		= cl;
778 
779 		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
780 
781 		closure_get(cl);
782 		percpu_ref_get(&ca->io_ref);
783 
784 		submit_bio(&ec_bio->bio);
785 
786 		offset += b;
787 	}
788 
789 	percpu_ref_put(&ca->io_ref);
790 }
791 
792 static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
793 				struct ec_stripe_buf *stripe)
794 {
795 	struct btree_iter iter;
796 	struct bkey_s_c k;
797 	int ret;
798 
799 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
800 			       POS(0, idx), BTREE_ITER_slots);
801 	ret = bkey_err(k);
802 	if (ret)
803 		goto err;
804 	if (k.k->type != KEY_TYPE_stripe) {
805 		ret = -ENOENT;
806 		goto err;
807 	}
808 	bkey_reassemble(&stripe->key, k);
809 err:
810 	bch2_trans_iter_exit(trans, &iter);
811 	return ret;
812 }
813 
814 /* recovery read path: */
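/*
 * Reconstruct the data for a failed read: read the whole stripe, verify
 * checksums, rebuild the missing block(s) from parity, then copy the
 * requested range into the original read bio.
 */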
815 int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
816 {
817 	struct bch_fs *c = trans->c;
818 	struct ec_stripe_buf *buf;
819 	struct closure cl;
820 	struct bch_stripe *v;
821 	unsigned i, offset;
822 	int ret = 0;
823 
824 	closure_init_stack(&cl);
825 
826 	BUG_ON(!rbio->pick.has_ec);
827 
828 	buf = kzalloc(sizeof(*buf), GFP_NOFS);
829 	if (!buf)
830 		return -BCH_ERR_ENOMEM_ec_read_extent;
831 
832 	ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
833 	if (ret) {
834 		bch_err_ratelimited(c,
835 			"error doing reconstruct read: error %i looking up stripe", ret);
836 		kfree(buf);
837 		return -EIO;
838 	}
839 
840 	v = &bkey_i_to_stripe(&buf->key)->v;
841 
842 	if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
843 		bch_err_ratelimited(c,
844 			"error doing reconstruct read: pointer doesn't match stripe");
845 		ret = -EIO;
846 		goto err;
847 	}
848 
849 	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
850 	if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
851 		bch_err_ratelimited(c,
852 			"error doing reconstruct read: read is bigger than stripe");
853 		ret = -EIO;
854 		goto err;
855 	}
856 
857 	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
858 	if (ret)
859 		goto err;
860 
861 	for (i = 0; i < v->nr_blocks; i++)
862 		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
863 
864 	closure_sync(&cl);
865 
866 	if (ec_nr_failed(buf) > v->nr_redundant) {
867 		bch_err_ratelimited(c,
868 			"error doing reconstruct read: unable to read enough blocks");
869 		ret = -EIO;
870 		goto err;
871 	}
872 
873 	ec_validate_checksums(c, buf);
874 
875 	ret = ec_do_recov(c, buf);
876 	if (ret)
877 		goto err;
878 
879 	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
880 		      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
881 err:
882 	ec_stripe_buf_exit(buf);
883 	kfree(buf);
884 	return ret;
885 }
886 
887 /* stripe bucket accounting: */
888 
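/*
 * Ensure the stripes heap and the stripes radix trees have room for an entry
 * at @idx, growing the heap if necessary.
 */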
889 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
890 {
891 	ec_stripes_heap n, *h = &c->ec_stripes_heap;
892 
893 	if (idx >= h->size) {
894 		if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
895 			return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
896 
897 		mutex_lock(&c->ec_stripes_heap_lock);
898 		if (n.size > h->size) {
899 			memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
900 			n.used = h->used;
901 			swap(*h, n);
902 		}
903 		mutex_unlock(&c->ec_stripes_heap_lock);
904 
905 		free_heap(&n);
906 	}
907 
908 	if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
909 		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
910 
911 	if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
912 	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
913 		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
914 
915 	return 0;
916 }
917 
918 static int ec_stripe_mem_alloc(struct btree_trans *trans,
919 			       struct btree_iter *iter)
920 {
921 	return allocate_dropping_locks_errcode(trans,
922 			__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
923 }
924 
925 /*
926  * Hash table of open stripes:
927  * Stripes that are being created or modified are kept in a hash table, so that
928  * stripe deletion can skip them.
929  */
930 
931 static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
932 {
933 	unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
934 	struct ec_stripe_new *s;
935 
936 	hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
937 		if (s->idx == idx)
938 			return true;
939 	return false;
940 }
941 
942 static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
943 {
944 	bool ret = false;
945 
946 	spin_lock(&c->ec_stripes_new_lock);
947 	ret = __bch2_stripe_is_open(c, idx);
948 	spin_unlock(&c->ec_stripes_new_lock);
949 
950 	return ret;
951 }
952 
953 static bool bch2_try_open_stripe(struct bch_fs *c,
954 				 struct ec_stripe_new *s,
955 				 u64 idx)
956 {
957 	bool ret;
958 
959 	spin_lock(&c->ec_stripes_new_lock);
960 	ret = !__bch2_stripe_is_open(c, idx);
961 	if (ret) {
962 		unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
963 
964 		s->idx = idx;
965 		hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
966 	}
967 	spin_unlock(&c->ec_stripes_new_lock);
968 
969 	return ret;
970 }
971 
972 static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
973 {
974 	BUG_ON(!s->idx);
975 
976 	spin_lock(&c->ec_stripes_new_lock);
977 	hlist_del_init(&s->hash);
978 	spin_unlock(&c->ec_stripes_new_lock);
979 
980 	s->idx = 0;
981 }
982 
983 /* Heap of all existing stripes, ordered by blocks_nonempty */
984 
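/*
 * Returns the index of an empty stripe at the top of the heap that isn't
 * currently open, or 0 if there's nothing to delete.
 */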
985 static u64 stripe_idx_to_delete(struct bch_fs *c)
986 {
987 	ec_stripes_heap *h = &c->ec_stripes_heap;
988 
989 	lockdep_assert_held(&c->ec_stripes_heap_lock);
990 
991 	if (h->used &&
992 	    h->data[0].blocks_nonempty == 0 &&
993 	    !bch2_stripe_is_open(c, h->data[0].idx))
994 		return h->data[0].idx;
995 
996 	return 0;
997 }
998 
999 static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
1000 				      struct ec_stripe_heap_entry l,
1001 				      struct ec_stripe_heap_entry r)
1002 {
1003 	return ((l.blocks_nonempty > r.blocks_nonempty) -
1004 		(l.blocks_nonempty < r.blocks_nonempty));
1005 }
1006 
1007 static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
1008 						   size_t i)
1009 {
1010 	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
1011 
1012 	genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
1013 }
1014 
1015 static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
1016 {
1017 	ec_stripes_heap *h = &c->ec_stripes_heap;
1018 	struct stripe *m = genradix_ptr(&c->stripes, idx);
1019 
1020 	BUG_ON(m->heap_idx >= h->used);
1021 	BUG_ON(h->data[m->heap_idx].idx != idx);
1022 }
1023 
1024 void bch2_stripes_heap_del(struct bch_fs *c,
1025 			   struct stripe *m, size_t idx)
1026 {
1027 	mutex_lock(&c->ec_stripes_heap_lock);
1028 	heap_verify_backpointer(c, idx);
1029 
1030 	heap_del(&c->ec_stripes_heap, m->heap_idx,
1031 		 ec_stripes_heap_cmp,
1032 		 ec_stripes_heap_set_backpointer);
1033 	mutex_unlock(&c->ec_stripes_heap_lock);
1034 }
1035 
1036 void bch2_stripes_heap_insert(struct bch_fs *c,
1037 			      struct stripe *m, size_t idx)
1038 {
1039 	mutex_lock(&c->ec_stripes_heap_lock);
1040 	BUG_ON(heap_full(&c->ec_stripes_heap));
1041 
1042 	heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
1043 			.idx = idx,
1044 			.blocks_nonempty = m->blocks_nonempty,
1045 		}),
1046 		 ec_stripes_heap_cmp,
1047 		 ec_stripes_heap_set_backpointer);
1048 
1049 	heap_verify_backpointer(c, idx);
1050 	mutex_unlock(&c->ec_stripes_heap_lock);
1051 }
1052 
1053 void bch2_stripes_heap_update(struct bch_fs *c,
1054 			      struct stripe *m, size_t idx)
1055 {
1056 	ec_stripes_heap *h = &c->ec_stripes_heap;
1057 	bool do_deletes;
1058 	size_t i;
1059 
1060 	mutex_lock(&c->ec_stripes_heap_lock);
1061 	heap_verify_backpointer(c, idx);
1062 
1063 	h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
1064 
1065 	i = m->heap_idx;
1066 	heap_sift_up(h,	  i, ec_stripes_heap_cmp,
1067 		     ec_stripes_heap_set_backpointer);
1068 	heap_sift_down(h, i, ec_stripes_heap_cmp,
1069 		       ec_stripes_heap_set_backpointer);
1070 
1071 	heap_verify_backpointer(c, idx);
1072 
1073 	do_deletes = stripe_idx_to_delete(c) != 0;
1074 	mutex_unlock(&c->ec_stripes_heap_lock);
1075 
1076 	if (do_deletes)
1077 		bch2_do_stripe_deletes(c);
1078 }
1079 
1080 /* stripe deletion */
1081 
1082 static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
1083 {
1084 	struct bch_fs *c = trans->c;
1085 	struct btree_iter iter;
1086 	struct bkey_s_c k;
1087 	struct bkey_s_c_stripe s;
1088 	int ret;
1089 
1090 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
1091 			       BTREE_ITER_intent);
1092 	ret = bkey_err(k);
1093 	if (ret)
1094 		goto err;
1095 
1096 	if (k.k->type != KEY_TYPE_stripe) {
1097 		bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
1098 		ret = -EINVAL;
1099 		goto err;
1100 	}
1101 
1102 	s = bkey_s_c_to_stripe(k);
1103 	for (unsigned i = 0; i < s.v->nr_blocks; i++)
1104 		if (stripe_blockcount_get(s.v, i)) {
1105 			struct printbuf buf = PRINTBUF;
1106 
1107 			bch2_bkey_val_to_text(&buf, c, k);
1108 			bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
1109 			printbuf_exit(&buf);
1110 			ret = -EINVAL;
1111 			goto err;
1112 		}
1113 
1114 	ret = bch2_btree_delete_at(trans, &iter, 0);
1115 err:
1116 	bch2_trans_iter_exit(trans, &iter);
1117 	return ret;
1118 }
1119 
1120 static void ec_stripe_delete_work(struct work_struct *work)
1121 {
1122 	struct bch_fs *c =
1123 		container_of(work, struct bch_fs, ec_stripe_delete_work);
1124 
1125 	while (1) {
1126 		mutex_lock(&c->ec_stripes_heap_lock);
1127 		u64 idx = stripe_idx_to_delete(c);
1128 		mutex_unlock(&c->ec_stripes_heap_lock);
1129 
1130 		if (!idx)
1131 			break;
1132 
1133 		int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1134 					ec_stripe_delete(trans, idx));
1135 		bch_err_fn(c, ret);
1136 		if (ret)
1137 			break;
1138 	}
1139 
1140 	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
1141 }
1142 
1143 void bch2_do_stripe_deletes(struct bch_fs *c)
1144 {
1145 	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
1146 	    !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
1147 		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
1148 }
1149 
1150 /* stripe creation: */
1151 
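/*
 * Create or update a stripe key: on update, carry over the existing per-block
 * sector counts and verify that the pointers for nonempty blocks haven't
 * changed.
 */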
1152 static int ec_stripe_key_update(struct btree_trans *trans,
1153 				struct bkey_i_stripe *new,
1154 				bool create)
1155 {
1156 	struct bch_fs *c = trans->c;
1157 	struct btree_iter iter;
1158 	struct bkey_s_c k;
1159 	int ret;
1160 
1161 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
1162 			       new->k.p, BTREE_ITER_intent);
1163 	ret = bkey_err(k);
1164 	if (ret)
1165 		goto err;
1166 
1167 	if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
1168 		bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
1169 				     create ? "creating" : "updating",
1170 				     bch2_bkey_types[k.k->type]);
1171 		ret = -EINVAL;
1172 		goto err;
1173 	}
1174 
1175 	if (k.k->type == KEY_TYPE_stripe) {
1176 		const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
1177 		unsigned i;
1178 
1179 		if (old->nr_blocks != new->v.nr_blocks) {
1180 			bch_err(c, "error updating stripe: nr_blocks does not match");
1181 			ret = -EINVAL;
1182 			goto err;
1183 		}
1184 
1185 		for (i = 0; i < new->v.nr_blocks; i++) {
1186 			unsigned v = stripe_blockcount_get(old, i);
1187 
1188 			BUG_ON(v &&
1189 			       (old->ptrs[i].dev != new->v.ptrs[i].dev ||
1190 				old->ptrs[i].gen != new->v.ptrs[i].gen ||
1191 				old->ptrs[i].offset != new->v.ptrs[i].offset));
1192 
1193 			stripe_blockcount_set(&new->v, i, v);
1194 		}
1195 	}
1196 
1197 	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
1198 err:
1199 	bch2_trans_iter_exit(trans, &iter);
1200 	return ret;
1201 }
1202 
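/*
 * Walk backpointers for @bucket to find the next extent pointing into it, and
 * rewrite that extent to add a stripe pointer referencing the new stripe.
 */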
1203 static int ec_stripe_update_extent(struct btree_trans *trans,
1204 				   struct bch_dev *ca,
1205 				   struct bpos bucket, u8 gen,
1206 				   struct ec_stripe_buf *s,
1207 				   struct bpos *bp_pos)
1208 {
1209 	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1210 	struct bch_fs *c = trans->c;
1211 	struct bch_backpointer bp;
1212 	struct btree_iter iter;
1213 	struct bkey_s_c k;
1214 	const struct bch_extent_ptr *ptr_c;
1215 	struct bch_extent_ptr *ec_ptr = NULL;
1216 	struct bch_extent_stripe_ptr stripe_ptr;
1217 	struct bkey_i *n;
1218 	int ret, dev, block;
1219 
1220 	ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
1221 				bp_pos, &bp, BTREE_ITER_cached);
1222 	if (ret)
1223 		return ret;
1224 	if (bpos_eq(*bp_pos, SPOS_MAX))
1225 		return 0;
1226 
1227 	if (bp.level) {
1228 		struct printbuf buf = PRINTBUF;
1229 		struct btree_iter node_iter;
1230 		struct btree *b;
1231 
1232 		b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
1233 		bch2_trans_iter_exit(trans, &node_iter);
1234 
1235 		if (!b)
1236 			return 0;
1237 
1238 		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
1239 		bch2_backpointer_to_text(&buf, &bp);
1240 
1241 		bch2_fs_inconsistent(c, "%s", buf.buf);
1242 		printbuf_exit(&buf);
1243 		return -EIO;
1244 	}
1245 
1246 	k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
1247 	ret = bkey_err(k);
1248 	if (ret)
1249 		return ret;
1250 	if (!k.k) {
1251 		/*
1252 		 * extent no longer exists - we could flush the btree
1253 		 * write buffer and retry to verify, but no need:
1254 		 */
1255 		return 0;
1256 	}
1257 
1258 	if (extent_has_stripe_ptr(k, s->key.k.p.offset))
1259 		goto out;
1260 
1261 	ptr_c = bkey_matches_stripe(v, k, &block);
1262 	/*
1263 	 * It doesn't generally make sense to erasure code cached ptrs:
1264 	 * XXX: should we be incrementing a counter?
1265 	 */
1266 	if (!ptr_c || ptr_c->cached)
1267 		goto out;
1268 
1269 	dev = v->ptrs[block].dev;
1270 
1271 	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
1272 	ret = PTR_ERR_OR_ZERO(n);
1273 	if (ret)
1274 		goto out;
1275 
1276 	bkey_reassemble(n, k);
1277 
1278 	bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev);
1279 	ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
1280 	BUG_ON(!ec_ptr);
1281 
1282 	stripe_ptr = (struct bch_extent_stripe_ptr) {
1283 		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
1284 		.block		= block,
1285 		.redundancy	= v->nr_redundant,
1286 		.idx		= s->key.k.p.offset,
1287 	};
1288 
1289 	__extent_entry_insert(n,
1290 			(union bch_extent_entry *) ec_ptr,
1291 			(union bch_extent_entry *) &stripe_ptr);
1292 
1293 	ret = bch2_trans_update(trans, &iter, n, 0);
1294 out:
1295 	bch2_trans_iter_exit(trans, &iter);
1296 	return ret;
1297 }
1298 
1299 static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
1300 				   unsigned block)
1301 {
1302 	struct bch_fs *c = trans->c;
1303 	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1304 	struct bch_extent_ptr ptr = v->ptrs[block];
1305 	struct bpos bp_pos = POS_MIN;
1306 	int ret = 0;
1307 
1308 	struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
1309 	if (!ca)
1310 		return -EIO;
1311 
1312 	struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
1313 
1314 	while (1) {
1315 		ret = commit_do(trans, NULL, NULL,
1316 				BCH_TRANS_COMMIT_no_check_rw|
1317 				BCH_TRANS_COMMIT_no_enospc,
1318 			ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
1319 		if (ret)
1320 			break;
1321 		if (bkey_eq(bp_pos, POS_MAX))
1322 			break;
1323 
1324 		bp_pos = bpos_nosnap_successor(bp_pos);
1325 	}
1326 
1327 	bch2_dev_put(ca);
1328 	return ret;
1329 }
1330 
1331 static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
1332 {
1333 	struct btree_trans *trans = bch2_trans_get(c);
1334 	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1335 	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
1336 	int ret = 0;
1337 
1338 	ret = bch2_btree_write_buffer_flush_sync(trans);
1339 	if (ret)
1340 		goto err;
1341 
1342 	for (i = 0; i < nr_data; i++) {
1343 		ret = ec_stripe_update_bucket(trans, s, i);
1344 		if (ret)
1345 			break;
1346 	}
1347 err:
1348 	bch2_trans_put(trans);
1349 
1350 	return ret;
1351 }
1352 
1353 static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
1354 				       struct ec_stripe_new *s,
1355 				       unsigned block,
1356 				       struct open_bucket *ob)
1357 {
1358 	struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
1359 	if (!ca) {
1360 		s->err = -BCH_ERR_erofs_no_writes;
1361 		return;
1362 	}
1363 
1364 	unsigned offset = ca->mi.bucket_size - ob->sectors_free;
1365 	memset(s->new_stripe.data[block] + (offset << 9),
1366 	       0,
1367 	       ob->sectors_free << 9);
1368 
1369 	int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
1370 			ob->bucket * ca->mi.bucket_size + offset,
1371 			ob->sectors_free,
1372 			GFP_KERNEL, 0);
1373 
1374 	percpu_ref_put(&ca->io_ref);
1375 
1376 	if (ret)
1377 		s->err = ret;
1378 }
1379 
1380 void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
1381 {
1382 	if (s->idx)
1383 		bch2_stripe_close(c, s);
1384 	kfree(s);
1385 }
1386 
1387 /*
1388  * data buckets of new stripe all written: create the stripe
1389  */
1390 static void ec_stripe_create(struct ec_stripe_new *s)
1391 {
1392 	struct bch_fs *c = s->c;
1393 	struct open_bucket *ob;
1394 	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
1395 	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
1396 	int ret;
1397 
1398 	BUG_ON(s->h->s == s);
1399 
1400 	closure_sync(&s->iodone);
1401 
1402 	if (!s->err) {
1403 		for (i = 0; i < nr_data; i++)
1404 			if (s->blocks[i]) {
1405 				ob = c->open_buckets + s->blocks[i];
1406 
1407 				if (ob->sectors_free)
1408 					zero_out_rest_of_ec_bucket(c, s, i, ob);
1409 			}
1410 	}
1411 
1412 	if (s->err) {
1413 		if (!bch2_err_matches(s->err, EROFS))
1414 			bch_err(c, "error creating stripe: error writing data buckets");
1415 		goto err;
1416 	}
1417 
1418 	if (s->have_existing_stripe) {
1419 		ec_validate_checksums(c, &s->existing_stripe);
1420 
1421 		if (ec_do_recov(c, &s->existing_stripe)) {
1422 			bch_err(c, "error creating stripe: error reading existing stripe");
1423 			goto err;
1424 		}
1425 
1426 		for (i = 0; i < nr_data; i++)
1427 			if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
1428 				swap(s->new_stripe.data[i],
1429 				     s->existing_stripe.data[i]);
1430 
1431 		ec_stripe_buf_exit(&s->existing_stripe);
1432 	}
1433 
1434 	BUG_ON(!s->allocated);
1435 	BUG_ON(!s->idx);
1436 
1437 	ec_generate_ec(&s->new_stripe);
1438 
1439 	ec_generate_checksums(&s->new_stripe);
1440 
1441 	/* write p/q: */
1442 	for (i = nr_data; i < v->nr_blocks; i++)
1443 		ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
1444 	closure_sync(&s->iodone);
1445 
1446 	if (ec_nr_failed(&s->new_stripe)) {
1447 		bch_err(c, "error creating stripe: error writing redundancy buckets");
1448 		goto err;
1449 	}
1450 
1451 	ret = bch2_trans_do(c, &s->res, NULL,
1452 			    BCH_TRANS_COMMIT_no_check_rw|
1453 			    BCH_TRANS_COMMIT_no_enospc,
1454 			    ec_stripe_key_update(trans,
1455 					bkey_i_to_stripe(&s->new_stripe.key),
1456 					!s->have_existing_stripe));
1457 	bch_err_msg(c, ret, "creating stripe key");
1458 	if (ret)
1459 		goto err;
1461 
1462 	ret = ec_stripe_update_extents(c, &s->new_stripe);
1463 	bch_err_msg(c, ret, "error updating extents");
1464 	if (ret)
1465 		goto err;
1466 err:
1467 	bch2_disk_reservation_put(c, &s->res);
1468 
1469 	for (i = 0; i < v->nr_blocks; i++)
1470 		if (s->blocks[i]) {
1471 			ob = c->open_buckets + s->blocks[i];
1472 
1473 			if (i < nr_data) {
1474 				ob->ec = NULL;
1475 				__bch2_open_bucket_put(c, ob);
1476 			} else {
1477 				bch2_open_bucket_put(c, ob);
1478 			}
1479 		}
1480 
1481 	mutex_lock(&c->ec_stripe_new_lock);
1482 	list_del(&s->list);
1483 	mutex_unlock(&c->ec_stripe_new_lock);
1484 	wake_up(&c->ec_stripe_new_wait);
1485 
1486 	ec_stripe_buf_exit(&s->existing_stripe);
1487 	ec_stripe_buf_exit(&s->new_stripe);
1488 	closure_debug_destroy(&s->iodone);
1489 
1490 	ec_stripe_new_put(c, s, STRIPE_REF_stripe);
1491 }
1492 
1493 static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
1494 {
1495 	struct ec_stripe_new *s;
1496 
1497 	mutex_lock(&c->ec_stripe_new_lock);
1498 	list_for_each_entry(s, &c->ec_stripe_new_list, list)
1499 		if (!atomic_read(&s->ref[STRIPE_REF_io]))
1500 			goto out;
1501 	s = NULL;
1502 out:
1503 	mutex_unlock(&c->ec_stripe_new_lock);
1504 
1505 	return s;
1506 }
1507 
1508 static void ec_stripe_create_work(struct work_struct *work)
1509 {
1510 	struct bch_fs *c = container_of(work,
1511 		struct bch_fs, ec_stripe_create_work);
1512 	struct ec_stripe_new *s;
1513 
1514 	while ((s = get_pending_stripe(c)))
1515 		ec_stripe_create(s);
1516 
1517 	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1518 }
1519 
1520 void bch2_ec_do_stripe_creates(struct bch_fs *c)
1521 {
1522 	bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
1523 
1524 	if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
1525 		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1526 }
1527 
1528 static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
1529 {
1530 	struct ec_stripe_new *s = h->s;
1531 
1532 	BUG_ON(!s->allocated && !s->err);
1533 
1534 	h->s		= NULL;
1535 	s->pending	= true;
1536 
1537 	mutex_lock(&c->ec_stripe_new_lock);
1538 	list_add(&s->list, &c->ec_stripe_new_list);
1539 	mutex_unlock(&c->ec_stripe_new_lock);
1540 
1541 	ec_stripe_new_put(c, s, STRIPE_REF_io);
1542 }
1543 
1544 void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
1545 {
1546 	struct ec_stripe_new *s = ob->ec;
1547 
1548 	s->err = -EIO;
1549 }
1550 
1551 void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
1552 {
1553 	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
1554 	if (!ob)
1555 		return NULL;
1556 
1557 	BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
1558 
1559 	struct bch_dev *ca	= ob_dev(c, ob);
1560 	unsigned offset		= ca->mi.bucket_size - ob->sectors_free;
1561 
1562 	return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
1563 }
1564 
1565 static int unsigned_cmp(const void *_l, const void *_r)
1566 {
1567 	unsigned l = *((const unsigned *) _l);
1568 	unsigned r = *((const unsigned *) _r);
1569 
1570 	return cmp_int(l, r);
1571 }
1572 
1573 /* pick most common bucket size: */
1574 static unsigned pick_blocksize(struct bch_fs *c,
1575 			       struct bch_devs_mask *devs)
1576 {
1577 	unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
1578 	struct {
1579 		unsigned nr, size;
1580 	} cur = { 0, 0 }, best = { 0, 0 };
1581 
1582 	for_each_member_device_rcu(c, ca, devs)
1583 		sizes[nr++] = ca->mi.bucket_size;
1584 
1585 	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
1586 
1587 	for (unsigned i = 0; i < nr; i++) {
1588 		if (sizes[i] != cur.size) {
1589 			if (cur.nr > best.nr)
1590 				best = cur;
1591 
1592 			cur.nr = 0;
1593 			cur.size = sizes[i];
1594 		}
1595 
1596 		cur.nr++;
1597 	}
1598 
1599 	if (cur.nr > best.nr)
1600 		best = cur;
1601 
1602 	return best.size;
1603 }
1604 
1605 static bool may_create_new_stripe(struct bch_fs *c)
1606 {
1607 	return false;
1608 }
1609 
1610 static void ec_stripe_key_init(struct bch_fs *c,
1611 			       struct bkey_i *k,
1612 			       unsigned nr_data,
1613 			       unsigned nr_parity,
1614 			       unsigned stripe_size)
1615 {
1616 	struct bkey_i_stripe *s = bkey_stripe_init(k);
1617 	unsigned u64s;
1618 
1619 	s->v.sectors			= cpu_to_le16(stripe_size);
1620 	s->v.algorithm			= 0;
1621 	s->v.nr_blocks			= nr_data + nr_parity;
1622 	s->v.nr_redundant		= nr_parity;
1623 	s->v.csum_granularity_bits	= ilog2(c->opts.encoded_extent_max >> 9);
1624 	s->v.csum_type			= BCH_CSUM_crc32c;
1625 	s->v.pad			= 0;
1626 
1627 	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
1628 		BUG_ON(1 << s->v.csum_granularity_bits >=
1629 		       le16_to_cpu(s->v.sectors) ||
1630 		       s->v.csum_granularity_bits == U8_MAX);
1631 		s->v.csum_granularity_bits++;
1632 	}
1633 
1634 	set_bkey_val_u64s(&s->k, u64s);
1635 }
1636 
1637 static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
1638 {
1639 	struct ec_stripe_new *s;
1640 
1641 	lockdep_assert_held(&h->lock);
1642 
1643 	s = kzalloc(sizeof(*s), GFP_KERNEL);
1644 	if (!s)
1645 		return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
1646 
1647 	mutex_init(&s->lock);
1648 	closure_init(&s->iodone, NULL);
1649 	atomic_set(&s->ref[STRIPE_REF_stripe], 1);
1650 	atomic_set(&s->ref[STRIPE_REF_io], 1);
1651 	s->c		= c;
1652 	s->h		= h;
1653 	s->nr_data	= min_t(unsigned, h->nr_active_devs,
1654 				BCH_BKEY_PTRS_MAX) - h->redundancy;
1655 	s->nr_parity	= h->redundancy;
1656 
1657 	ec_stripe_key_init(c, &s->new_stripe.key,
1658 			   s->nr_data, s->nr_parity, h->blocksize);
1659 
1660 	h->s = s;
1661 	return 0;
1662 }
1663 
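/*
 * Allocate a new stripe head for a (target, algo, redundancy, watermark)
 * tuple: build the device mask from rw devices in the target, drop
 * zero-durability devices, and pick the most common bucket size.
 */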
1664 static struct ec_stripe_head *
1665 ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
1666 			 unsigned algo, unsigned redundancy,
1667 			 enum bch_watermark watermark)
1668 {
1669 	struct ec_stripe_head *h;
1670 
1671 	h = kzalloc(sizeof(*h), GFP_KERNEL);
1672 	if (!h)
1673 		return NULL;
1674 
1675 	mutex_init(&h->lock);
1676 	BUG_ON(!mutex_trylock(&h->lock));
1677 
1678 	h->target	= target;
1679 	h->algo		= algo;
1680 	h->redundancy	= redundancy;
1681 	h->watermark	= watermark;
1682 
1683 	rcu_read_lock();
1684 	h->devs = target_rw_devs(c, BCH_DATA_user, target);
1685 
1686 	for_each_member_device_rcu(c, ca, &h->devs)
1687 		if (!ca->mi.durability)
1688 			__clear_bit(ca->dev_idx, h->devs.d);
1689 
1690 	h->blocksize = pick_blocksize(c, &h->devs);
1691 
1692 	for_each_member_device_rcu(c, ca, &h->devs)
1693 		if (ca->mi.bucket_size == h->blocksize)
1694 			h->nr_active_devs++;
1695 
1696 	rcu_read_unlock();
1697 
1698 	/*
1699 	 * If we only have redundancy + 1 devices, we're better off with just
1700 	 * replication:
1701 	 */
1702 	if (h->nr_active_devs < h->redundancy + 2)
1703 		bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
1704 			h->nr_active_devs, h->redundancy + 2);
1705 
1706 	list_add(&h->list, &c->ec_stripe_head_list);
1707 	return h;
1708 }
1709 
1710 void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
1711 {
1712 	if (h->s &&
1713 	    h->s->allocated &&
1714 	    bitmap_weight(h->s->blocks_allocated,
1715 			  h->s->nr_data) == h->s->nr_data)
1716 		ec_stripe_set_pending(c, h);
1717 
1718 	mutex_unlock(&h->lock);
1719 }
1720 
1721 static struct ec_stripe_head *
1722 __bch2_ec_stripe_head_get(struct btree_trans *trans,
1723 			  unsigned target,
1724 			  unsigned algo,
1725 			  unsigned redundancy,
1726 			  enum bch_watermark watermark)
1727 {
1728 	struct bch_fs *c = trans->c;
1729 	struct ec_stripe_head *h;
1730 	int ret;
1731 
1732 	if (!redundancy)
1733 		return NULL;
1734 
1735 	ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
1736 	if (ret)
1737 		return ERR_PTR(ret);
1738 
1739 	if (test_bit(BCH_FS_going_ro, &c->flags)) {
1740 		h = ERR_PTR(-BCH_ERR_erofs_no_writes);
1741 		goto found;
1742 	}
1743 
1744 	list_for_each_entry(h, &c->ec_stripe_head_list, list)
1745 		if (h->target		== target &&
1746 		    h->algo		== algo &&
1747 		    h->redundancy	== redundancy &&
1748 		    h->watermark	== watermark) {
1749 			ret = bch2_trans_mutex_lock(trans, &h->lock);
1750 			if (ret)
1751 				h = ERR_PTR(ret);
1752 			goto found;
1753 		}
1754 
1755 	h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
1756 found:
1757 	if (!IS_ERR_OR_NULL(h) &&
1758 	    h->nr_active_devs < h->redundancy + 2) {
1759 		mutex_unlock(&h->lock);
1760 		h = NULL;
1761 	}
1762 	mutex_unlock(&c->ec_stripe_head_lock);
1763 	return h;
1764 }
1765 
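/*
 * Allocate the buckets a new stripe still needs: parity buckets first, then
 * data buckets, avoiding devices already used by blocks we have.
 */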
1766 static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
1767 				    enum bch_watermark watermark, struct closure *cl)
1768 {
1769 	struct bch_fs *c = trans->c;
1770 	struct bch_devs_mask devs = h->devs;
1771 	struct open_bucket *ob;
1772 	struct open_buckets buckets;
1773 	struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
1774 	unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
1775 	bool have_cache = true;
1776 	int ret = 0;
1777 
1778 	BUG_ON(v->nr_blocks	!= h->s->nr_data + h->s->nr_parity);
1779 	BUG_ON(v->nr_redundant	!= h->s->nr_parity);
1780 
1781 	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
1782 		__clear_bit(v->ptrs[i].dev, devs.d);
1783 		if (i < h->s->nr_data)
1784 			nr_have_data++;
1785 		else
1786 			nr_have_parity++;
1787 	}
1788 
1789 	BUG_ON(nr_have_data	> h->s->nr_data);
1790 	BUG_ON(nr_have_parity	> h->s->nr_parity);
1791 
1792 	buckets.nr = 0;
1793 	if (nr_have_parity < h->s->nr_parity) {
1794 		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1795 					    &h->parity_stripe,
1796 					    &devs,
1797 					    h->s->nr_parity,
1798 					    &nr_have_parity,
1799 					    &have_cache, 0,
1800 					    BCH_DATA_parity,
1801 					    watermark,
1802 					    cl);
1803 
1804 		open_bucket_for_each(c, &buckets, ob, i) {
1805 			j = find_next_zero_bit(h->s->blocks_gotten,
1806 					       h->s->nr_data + h->s->nr_parity,
1807 					       h->s->nr_data);
1808 			BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
1809 
1810 			h->s->blocks[j] = buckets.v[i];
1811 			v->ptrs[j] = bch2_ob_ptr(c, ob);
1812 			__set_bit(j, h->s->blocks_gotten);
1813 		}
1814 
1815 		if (ret)
1816 			return ret;
1817 	}
1818 
1819 	buckets.nr = 0;
1820 	if (nr_have_data < h->s->nr_data) {
1821 		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1822 					    &h->block_stripe,
1823 					    &devs,
1824 					    h->s->nr_data,
1825 					    &nr_have_data,
1826 					    &have_cache, 0,
1827 					    BCH_DATA_user,
1828 					    watermark,
1829 					    cl);
1830 
1831 		open_bucket_for_each(c, &buckets, ob, i) {
1832 			j = find_next_zero_bit(h->s->blocks_gotten,
1833 					       h->s->nr_data, 0);
1834 			BUG_ON(j >= h->s->nr_data);
1835 
1836 			h->s->blocks[j] = buckets.v[i];
1837 			v->ptrs[j] = bch2_ob_ptr(c, ob);
1838 			__set_bit(j, h->s->blocks_gotten);
1839 		}
1840 
1841 		if (ret)
1842 			return ret;
1843 	}
1844 
1845 	return 0;
1846 }
1847 
1848 /* XXX: doesn't obey target: */
1849 static s64 get_existing_stripe(struct bch_fs *c,
1850 			       struct ec_stripe_head *head)
1851 {
1852 	ec_stripes_heap *h = &c->ec_stripes_heap;
1853 	struct stripe *m;
1854 	size_t heap_idx;
1855 	u64 stripe_idx;
1856 	s64 ret = -1;
1857 
1858 	if (may_create_new_stripe(c))
1859 		return -1;
1860 
1861 	mutex_lock(&c->ec_stripes_heap_lock);
1862 	for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
1863 		/* No blocks worth reusing, stripe will just be deleted: */
1864 		if (!h->data[heap_idx].blocks_nonempty)
1865 			continue;
1866 
1867 		stripe_idx = h->data[heap_idx].idx;
1868 
1869 		m = genradix_ptr(&c->stripes, stripe_idx);
1870 
1871 		if (m->algorithm	== head->algo &&
1872 		    m->nr_redundant	== head->redundancy &&
1873 		    m->sectors		== head->blocksize &&
1874 		    m->blocks_nonempty	< m->nr_blocks - m->nr_redundant &&
1875 		    bch2_try_open_stripe(c, head->s, stripe_idx)) {
1876 			ret = stripe_idx;
1877 			break;
1878 		}
1879 	}
1880 	mutex_unlock(&c->ec_stripes_heap_lock);
1881 	return ret;
1882 }
1883 
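/*
 * Try to reuse an existing stripe that has empty blocks: read it in, keep the
 * blocks that still contain data, and release the buckets we'd already
 * allocated that conflict with it.
 */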
1884 static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
1885 {
1886 	struct bch_fs *c = trans->c;
1887 	struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
1888 	struct bch_stripe *existing_v;
1889 	unsigned i;
1890 	s64 idx;
1891 	int ret;
1892 
1893 	/*
1894 	 * If we can't allocate a new stripe, and there are no stripes with empty
1895 	 * blocks for us to reuse, that means we have to wait on copygc:
1896 	 */
1897 	idx = get_existing_stripe(c, h);
1898 	if (idx < 0)
1899 		return -BCH_ERR_stripe_alloc_blocked;
1900 
1901 	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
1902 	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
1903 			     "reading stripe key: %s", bch2_err_str(ret));
1904 	if (ret) {
1905 		bch2_stripe_close(c, h->s);
1906 		return ret;
1907 	}
1908 
1909 	existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
1910 
1911 	BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
1912 	h->s->nr_data = existing_v->nr_blocks -
1913 		existing_v->nr_redundant;
1914 
1915 	ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
1916 	if (ret) {
1917 		bch2_stripe_close(c, h->s);
1918 		return ret;
1919 	}
1920 
1921 	BUG_ON(h->s->existing_stripe.size != h->blocksize);
1922 	BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
1923 
1924 	/*
1925 	 * Free buckets we initially allocated - they might conflict with
1926 	 * blocks from the stripe we're reusing:
1927 	 */
1928 	for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
1929 		bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
1930 		h->s->blocks[i] = 0;
1931 	}
1932 	memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
1933 	memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
1934 
1935 	for (i = 0; i < existing_v->nr_blocks; i++) {
1936 		if (stripe_blockcount_get(existing_v, i)) {
1937 			__set_bit(i, h->s->blocks_gotten);
1938 			__set_bit(i, h->s->blocks_allocated);
1939 		}
1940 
1941 		ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
1942 	}
1943 
1944 	bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
1945 	h->s->have_existing_stripe = true;
1946 
1947 	return 0;
1948 }
1949 
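/*
 * Reserve what a brand new stripe needs: a disk reservation for the parity
 * blocks, plus an empty slot in the stripes btree, searching forward from
 * c->ec_stripe_hint and wrapping back to the start if necessary:
 */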
1950 static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
1951 {
1952 	struct bch_fs *c = trans->c;
1953 	struct btree_iter iter;
1954 	struct bkey_s_c k;
1955 	struct bpos min_pos = POS(0, 1);
1956 	struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
1957 	int ret;
1958 
1959 	if (!h->s->res.sectors) {
1960 		ret = bch2_disk_reservation_get(c, &h->s->res,
1961 					h->blocksize,
1962 					h->s->nr_parity,
1963 					BCH_DISK_RESERVATION_NOFAIL);
1964 		if (ret)
1965 			return ret;
1966 	}
1967 
1968 	for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
1969 			   BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
1970 		if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
1971 			if (start_pos.offset) {
1972 				start_pos = min_pos;
1973 				bch2_btree_iter_set_pos(&iter, start_pos);
1974 				continue;
1975 			}
1976 
1977 			ret = -BCH_ERR_ENOSPC_stripe_create;
1978 			break;
1979 		}
1980 
1981 		if (bkey_deleted(k.k) &&
1982 		    bch2_try_open_stripe(c, h->s, k.k->p.offset))
1983 			break;
1984 	}
1985 
1986 	c->ec_stripe_hint = iter.pos.offset;
1987 
1988 	if (ret)
1989 		goto err;
1990 
1991 	ret = ec_stripe_mem_alloc(trans, &iter);
1992 	if (ret) {
1993 		bch2_stripe_close(c, h->s);
1994 		goto err;
1995 	}
1996 
1997 	h->s->new_stripe.key.k.p = iter.pos;
1998 out:
1999 	bch2_trans_iter_exit(trans, &iter);
2000 	return ret;
2001 err:
2002 	bch2_disk_reservation_put(c, &h->s->res);
2003 	goto out;
2004 }
2005 
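/*
 * Return a stripe head ready for writing to: preferably by allocating buckets
 * for a brand new stripe, otherwise by reusing an existing stripe that still
 * has empty blocks; if neither is currently possible, @cl is put on the
 * freelist waitqueue and an error is returned so the caller can retry:
 */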
2006 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
2007 					       unsigned target,
2008 					       unsigned algo,
2009 					       unsigned redundancy,
2010 					       enum bch_watermark watermark,
2011 					       struct closure *cl)
2012 {
2013 	struct bch_fs *c = trans->c;
2014 	struct ec_stripe_head *h;
2015 	bool waiting = false;
2016 	int ret;
2017 
2018 	h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
2019 	if (IS_ERR_OR_NULL(h))
2020 		return h;
2021 
2022 	if (!h->s) {
2023 		ret = ec_new_stripe_alloc(c, h);
2024 		if (ret) {
2025 			bch_err(c, "failed to allocate new stripe");
2026 			goto err;
2027 		}
2028 	}
2029 
2030 	if (h->s->allocated)
2031 		goto allocated;
2032 
2033 	if (h->s->have_existing_stripe)
2034 		goto alloc_existing;
2035 
2036 	/* First, try to allocate a full stripe: */
2037 	ret =   new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
2038 		__bch2_ec_stripe_head_reserve(trans, h);
2039 	if (!ret)
2040 		goto allocate_buf;
2041 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
2042 	    bch2_err_matches(ret, ENOMEM))
2043 		goto err;
2044 
2045 	/*
2046 	 * Not enough buckets available for a full stripe: we must reuse an
2047 	 * existing stripe:
2048 	 */
2049 	while (1) {
2050 		ret = __bch2_ec_stripe_head_reuse(trans, h);
2051 		if (!ret)
2052 			break;
2053 		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
2054 			goto err;
2055 
2056 		if (watermark == BCH_WATERMARK_copygc) {
2057 			ret =   new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
2058 				__bch2_ec_stripe_head_reserve(trans, h);
2059 			if (ret)
2060 				goto err;
2061 			goto allocate_buf;
2062 		}
2063 
2064 		/* XXX freelist_wait? */
2065 		closure_wait(&c->freelist_wait, cl);
2066 		waiting = true;
2067 	}
2068 
2069 	if (waiting)
2070 		closure_wake_up(&c->freelist_wait);
2071 alloc_existing:
2072 	/*
2073 	 * Retry allocating buckets, with the watermark for this
2074 	 * particular write:
2075 	 */
2076 	ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
2077 	if (ret)
2078 		goto err;
2079 
2080 allocate_buf:
2081 	ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
2082 	if (ret)
2083 		goto err;
2084 
2085 	h->s->allocated = true;
2086 allocated:
2087 	BUG_ON(!h->s->idx);
2088 	BUG_ON(!h->s->new_stripe.data[0]);
2089 	BUG_ON(trans->restarted);
2090 	return h;
2091 err:
2092 	bch2_ec_stripe_head_put(c, h);
2093 	return ERR_PTR(ret);
2094 }
2095 
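/*
 * Cancel in-progress stripes with open buckets on @ca (or all in-progress
 * stripes, if @ca is NULL): flag them with -BCH_ERR_erofs_no_writes and mark
 * them pending:
 */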
2096 static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
2097 {
2098 	struct ec_stripe_head *h;
2099 	struct open_bucket *ob;
2100 	unsigned i;
2101 
2102 	mutex_lock(&c->ec_stripe_head_lock);
2103 	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2104 		mutex_lock(&h->lock);
2105 		if (!h->s)
2106 			goto unlock;
2107 
2108 		if (!ca)
2109 			goto found;
2110 
2111 		for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
2112 			if (!h->s->blocks[i])
2113 				continue;
2114 
2115 			ob = c->open_buckets + h->s->blocks[i];
2116 			if (ob->dev == ca->dev_idx)
2117 				goto found;
2118 		}
2119 		goto unlock;
2120 found:
2121 		h->s->err = -BCH_ERR_erofs_no_writes;
2122 		ec_stripe_set_pending(c, h);
2123 unlock:
2124 		mutex_unlock(&h->lock);
2125 	}
2126 	mutex_unlock(&c->ec_stripe_head_lock);
2127 }
2128 
2129 void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
2130 {
2131 	__bch2_ec_stop(c, ca);
2132 }
2133 
2134 void bch2_fs_ec_stop(struct bch_fs *c)
2135 {
2136 	__bch2_ec_stop(c, NULL);
2137 }
2138 
2139 static bool bch2_fs_ec_flush_done(struct bch_fs *c)
2140 {
2141 	bool ret;
2142 
2143 	mutex_lock(&c->ec_stripe_new_lock);
2144 	ret = list_empty(&c->ec_stripe_new_list);
2145 	mutex_unlock(&c->ec_stripe_new_lock);
2146 
2147 	return ret;
2148 }
2149 
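/* Wait until the list of in-flight stripe creates has drained: */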
2150 void bch2_fs_ec_flush(struct bch_fs *c)
2151 {
2152 	wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
2153 }
2154 
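/*
 * Read all stripes into memory: populate the in-memory stripes radix tree and
 * rebuild the stripes heap from each stripe's block counts:
 */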
2155 int bch2_stripes_read(struct bch_fs *c)
2156 {
2157 	int ret = bch2_trans_run(c,
2158 		for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
2159 				   BTREE_ITER_prefetch, k, ({
2160 			if (k.k->type != KEY_TYPE_stripe)
2161 				continue;
2162 
2163 			ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
2164 			if (ret)
2165 				break;
2166 
2167 			const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
2168 
2169 			struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
2170 			m->sectors	= le16_to_cpu(s->sectors);
2171 			m->algorithm	= s->algorithm;
2172 			m->nr_blocks	= s->nr_blocks;
2173 			m->nr_redundant	= s->nr_redundant;
2174 			m->blocks_nonempty = 0;
2175 
2176 			for (unsigned i = 0; i < s->nr_blocks; i++)
2177 				m->blocks_nonempty += !!stripe_blockcount_get(s, i);
2178 
2179 			bch2_stripes_heap_insert(c, m, k.k->p.offset);
2180 			0;
2181 		})));
2182 	bch_err_fn(c, ret);
2183 	return ret;
2184 }
2185 
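/* Print (at most) the first 50 entries of the stripes heap: */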
2186 void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
2187 {
2188 	ec_stripes_heap *h = &c->ec_stripes_heap;
2189 	struct stripe *m;
2190 	size_t i;
2191 
2192 	mutex_lock(&c->ec_stripes_heap_lock);
2193 	for (i = 0; i < min_t(size_t, h->used, 50); i++) {
2194 		m = genradix_ptr(&c->stripes, h->data[i].idx);
2195 
2196 		prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
2197 		       h->data[i].blocks_nonempty,
2198 		       m->nr_blocks - m->nr_redundant,
2199 		       m->nr_redundant);
2200 		if (bch2_stripe_is_open(c, h->data[i].idx))
2201 			prt_str(out, " open");
2202 		prt_newline(out);
2203 	}
2204 	mutex_unlock(&c->ec_stripes_heap_lock);
2205 }
2206 
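/* Print each stripe head, plus the new stripes currently in flight: */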
2207 void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
2208 {
2209 	struct ec_stripe_head *h;
2210 	struct ec_stripe_new *s;
2211 
2212 	mutex_lock(&c->ec_stripe_head_lock);
2213 	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2214 		prt_printf(out, "target %u algo %u redundancy %u %s:\n",
2215 		       h->target, h->algo, h->redundancy,
2216 		       bch2_watermarks[h->watermark]);
2217 
2218 		if (h->s)
2219 			prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n",
2220 			       h->s->idx, h->s->nr_data, h->s->nr_parity,
2221 			       bitmap_weight(h->s->blocks_allocated,
2222 					     h->s->nr_data));
2223 	}
2224 	mutex_unlock(&c->ec_stripe_head_lock);
2225 
2226 	prt_printf(out, "in flight:\n");
2227 
2228 	mutex_lock(&c->ec_stripe_new_lock);
2229 	list_for_each_entry(s, &c->ec_stripe_new_list, list) {
2230 		prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n",
2231 			   s->idx, s->nr_data, s->nr_parity,
2232 			   atomic_read(&s->ref[STRIPE_REF_io]),
2233 			   atomic_read(&s->ref[STRIPE_REF_stripe]),
2234 			   bch2_watermarks[s->h->watermark]);
2235 	}
2236 	mutex_unlock(&c->ec_stripe_new_lock);
2237 }
2238 
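/*
 * Shutdown: free the stripe heads (which must no longer hold open buckets),
 * the stripes heap, the in-memory stripes radix tree and the ec bioset:
 */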
2239 void bch2_fs_ec_exit(struct bch_fs *c)
2240 {
2241 	struct ec_stripe_head *h;
2242 	unsigned i;
2243 
2244 	while (1) {
2245 		mutex_lock(&c->ec_stripe_head_lock);
2246 		h = list_first_entry_or_null(&c->ec_stripe_head_list,
2247 					     struct ec_stripe_head, list);
2248 		if (h)
2249 			list_del(&h->list);
2250 		mutex_unlock(&c->ec_stripe_head_lock);
2251 		if (!h)
2252 			break;
2253 
2254 		if (h->s) {
2255 			for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
2256 				BUG_ON(h->s->blocks[i]);
2257 
2258 			kfree(h->s);
2259 		}
2260 		kfree(h);
2261 	}
2262 
2263 	BUG_ON(!list_empty(&c->ec_stripe_new_list));
2264 
2265 	free_heap(&c->ec_stripes_heap);
2266 	genradix_free(&c->stripes);
2267 	bioset_exit(&c->ec_bioset);
2268 }
2269 
2270 void bch2_fs_ec_init_early(struct bch_fs *c)
2271 {
2272 	spin_lock_init(&c->ec_stripes_new_lock);
2273 	mutex_init(&c->ec_stripes_heap_lock);
2274 
2275 	INIT_LIST_HEAD(&c->ec_stripe_head_list);
2276 	mutex_init(&c->ec_stripe_head_lock);
2277 
2278 	INIT_LIST_HEAD(&c->ec_stripe_new_list);
2279 	mutex_init(&c->ec_stripe_new_lock);
2280 	init_waitqueue_head(&c->ec_stripe_new_wait);
2281 
2282 	INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
2283 	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
2284 }
2285 
2286 int bch2_fs_ec_init(struct bch_fs *c)
2287 {
2288 	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
2289 			   BIOSET_NEED_BVECS);
2290 }
2291