xref: /linux/fs/bcachefs/ec.c (revision f694f30e81c4ade358eb8c75273bac1a48f0cb8f)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /* erasure coding */
4 
5 #include "bcachefs.h"
6 #include "alloc_background.h"
7 #include "alloc_foreground.h"
8 #include "backpointers.h"
9 #include "bkey_buf.h"
10 #include "bset.h"
11 #include "btree_gc.h"
12 #include "btree_update.h"
13 #include "btree_write_buffer.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "disk_accounting.h"
17 #include "disk_groups.h"
18 #include "ec.h"
19 #include "error.h"
20 #include "io_read.h"
21 #include "io_write.h"
22 #include "keylist.h"
23 #include "lru.h"
24 #include "recovery.h"
25 #include "replicas.h"
26 #include "super-io.h"
27 #include "util.h"
28 
29 #include <linux/sort.h>
30 #include <linux/string_choices.h>
31 
32 #ifdef __KERNEL__
33 
34 #include <linux/raid/pq.h>
35 #include <linux/raid/xor.h>
36 
37 static void raid5_recov(unsigned disks, unsigned failed_idx,
38 			size_t size, void **data)
39 {
40 	unsigned i = 2, nr;
41 
42 	BUG_ON(failed_idx >= disks);
43 
44 	swap(data[0], data[failed_idx]);
45 	memcpy(data[0], data[1], size);
46 
47 	while (i < disks) {
48 		nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
49 		xor_blocks(nr, size, data[0], data + i);
50 		i += nr;
51 	}
52 
53 	swap(data[0], data[failed_idx]);
54 }
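
/*
 * A note on the recovery above: for single-parity stripes, parity is plain
 * XOR (P = D0 ^ D1 ^ ... ^ Dn-1), so any one missing block is the XOR of all
 * the surviving ones. A minimal standalone sketch of that idea, using plain
 * byte arrays instead of the kernel's batched xor_blocks() (illustrative
 * helper only, not part of this file):
 *
 *	static void xor_recover(u8 *out, u8 **blocks, unsigned nr, size_t size)
 *	{
 *		memset(out, 0, size);
 *		for (unsigned i = 0; i < nr; i++)
 *			for (size_t b = 0; b < size; b++)
 *				out[b] ^= blocks[i][b];
 *	}
 *
 * raid5_recov() computes the same result in place: it swaps the failed slot
 * into data[0], seeds it with data[1], xor-accumulates the remaining blocks
 * in MAX_XOR_BLOCKS-sized batches, then swaps the result back.
 */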
55 
56 static void raid_gen(int nd, int np, size_t size, void **v)
57 {
58 	if (np >= 1)
59 		raid5_recov(nd + np, nd, size, v);
60 	if (np >= 2)
61 		raid6_call.gen_syndrome(nd + np, size, v);
62 	BUG_ON(np > 2);
63 }
64 
65 static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
66 {
67 	switch (nr) {
68 	case 0:
69 		break;
70 	case 1:
71 		if (ir[0] < nd + 1)
72 			raid5_recov(nd + 1, ir[0], size, v);
73 		else
74 			raid6_call.gen_syndrome(nd + np, size, v);
75 		break;
76 	case 2:
77 		if (ir[1] < nd) {
78 			/* data+data failure. */
79 			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
80 		} else if (ir[0] < nd) {
81 			/* data + p/q failure */
82 
83 			if (ir[1] == nd) /* data + p failure */
84 				raid6_datap_recov(nd + np, size, ir[0], v);
85 			else { /* data + q failure */
86 				raid5_recov(nd + 1, ir[0], size, v);
87 				raid6_call.gen_syndrome(nd + np, size, v);
88 			}
89 		} else {
90 			raid_gen(nd, np, size, v);
91 		}
92 		break;
93 	default:
94 		BUG();
95 	}
96 }
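
/*
 * How the cases above map onto the md raid6 helpers, assuming the usual
 * definitions P = xor(D_i) and Q = xor(g^i * D_i) over GF(2^8):
 *
 *   one block lost, and it is data or P  -> rebuild it by XOR (raid5_recov())
 *   one block lost, and it is Q          -> just regenerate the syndrome
 *   two data blocks lost                 -> raid6_2data_recov()
 *   one data block + P lost              -> raid6_datap_recov()
 *   one data block + Q lost              -> rebuild data from P, recompute Q
 *   only P and/or Q lost                 -> regenerate parity via raid_gen()
 */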
97 
98 #else
99 
100 #include <raid/raid.h>
101 
102 #endif
103 
104 struct ec_bio {
105 	struct bch_dev		*ca;
106 	struct ec_stripe_buf	*buf;
107 	size_t			idx;
108 	int			rw;
109 	u64			submit_time;
110 	struct bio		bio;
111 };
112 
113 /* Stripes btree keys: */
114 
115 int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
116 			 struct bkey_validate_context from)
117 {
118 	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
119 	int ret = 0;
120 
121 	bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
122 			 bpos_gt(k.k->p, POS(0, U32_MAX)),
123 			 c, stripe_pos_bad,
124 			 "stripe at bad pos");
125 
126 	bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s),
127 			 c, stripe_val_size_bad,
128 			 "incorrect value size (%zu < %u)",
129 			 bkey_val_u64s(k.k), stripe_val_u64s(s));
130 
131 	bkey_fsck_err_on(s->csum_granularity_bits >= 64,
132 			 c, stripe_csum_granularity_bad,
133 			 "invalid csum granularity (%u >= 64)",
134 			 s->csum_granularity_bits);
135 
136 	ret = bch2_bkey_ptrs_validate(c, k, from);
137 fsck_err:
138 	return ret;
139 }
140 
141 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
142 			 struct bkey_s_c k)
143 {
144 	const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
145 	struct bch_stripe s = {};
146 
147 	memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
148 
149 	unsigned nr_data = s.nr_blocks - s.nr_redundant;
150 
151 	prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
152 		   s.algorithm,
153 		   le16_to_cpu(s.sectors),
154 		   nr_data,
155 		   s.nr_redundant);
156 	bch2_prt_csum_type(out, s.csum_type);
157 	prt_str(out, " gran ");
158 	if (s.csum_granularity_bits < 64)
159 		prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
160 	else
161 		prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
162 
163 	if (s.disk_label) {
164 		prt_str(out, " label");
165 		bch2_disk_path_to_text(out, c, s.disk_label - 1);
166 	}
167 
168 	for (unsigned i = 0; i < s.nr_blocks; i++) {
169 		const struct bch_extent_ptr *ptr = sp->ptrs + i;
170 
171 		if ((void *) ptr >= bkey_val_end(k))
172 			break;
173 
174 		prt_char(out, ' ');
175 		bch2_extent_ptr_to_text(out, c, ptr);
176 
177 		if (s.csum_type < BCH_CSUM_NR &&
178 		    i < nr_data &&
179 		    stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
180 			prt_printf(out,  "#%u", stripe_blockcount_get(sp, i));
181 	}
182 }
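
/*
 * Rough shape of the text the function above produces for a 2+1 stripe
 * (made-up values; the per-pointer formatting comes from
 * bch2_extent_ptr_to_text() and is abbreviated to <ptr> here):
 *
 *   algo 0 sectors 512 blocks 2:1 csum crc32c gran 128 <ptr 0>#500 <ptr 1>#512 <ptr 2>
 *
 * Data blocks get a trailing #<sectors used> from stripe_blockcount_get();
 * parity blocks do not.
 */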
183 
184 /* Triggers: */
185 
186 static int __mark_stripe_bucket(struct btree_trans *trans,
187 				struct bch_dev *ca,
188 				struct bkey_s_c_stripe s,
189 				unsigned ptr_idx, bool deleting,
190 				struct bpos bucket,
191 				struct bch_alloc_v4 *a,
192 				enum btree_iter_update_trigger_flags flags)
193 {
194 	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
195 	unsigned nr_data = s.v->nr_blocks - s.v->nr_redundant;
196 	bool parity = ptr_idx >= nr_data;
197 	enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
198 	s64 sectors = parity ? le16_to_cpu(s.v->sectors) : 0;
199 	struct printbuf buf = PRINTBUF;
200 	int ret = 0;
201 
202 	struct bch_fs *c = trans->c;
203 	if (deleting)
204 		sectors = -sectors;
205 
206 	if (!deleting) {
207 		if (bch2_trans_inconsistent_on(a->stripe ||
208 					       a->stripe_redundancy, trans,
209 				"bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)\n%s",
210 				bucket.inode, bucket.offset, a->gen,
211 				bch2_data_type_str(a->data_type),
212 				a->dirty_sectors,
213 				a->stripe, s.k->p.offset,
214 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
215 			ret = -BCH_ERR_mark_stripe;
216 			goto err;
217 		}
218 
219 		if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans,
220 				"bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s",
221 				bucket.inode, bucket.offset, a->gen,
222 				bch2_data_type_str(a->data_type),
223 				a->dirty_sectors,
224 				a->cached_sectors,
225 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
226 			ret = -BCH_ERR_mark_stripe;
227 			goto err;
228 		}
229 	} else {
230 		if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset ||
231 					       a->stripe_redundancy != s.v->nr_redundant, trans,
232 				"bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s",
233 				bucket.inode, bucket.offset, a->gen,
234 				a->stripe,
235 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
236 			ret = -BCH_ERR_mark_stripe;
237 			goto err;
238 		}
239 
240 		if (bch2_trans_inconsistent_on(a->data_type != data_type, trans,
241 				"bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s",
242 				bucket.inode, bucket.offset, a->gen,
243 				bch2_data_type_str(a->data_type),
244 				bch2_data_type_str(data_type),
245 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
246 			ret = -BCH_ERR_mark_stripe;
247 			goto err;
248 		}
249 
250 		if (bch2_trans_inconsistent_on(parity &&
251 					       (a->dirty_sectors != -sectors ||
252 						a->cached_sectors), trans,
253 				"bucket %llu:%llu gen %u dirty_sectors %u cached_sectors %u: wrong sectors when deleting parity block of stripe\n%s",
254 				bucket.inode, bucket.offset, a->gen,
255 				a->dirty_sectors,
256 				a->cached_sectors,
257 				(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
258 			ret = -BCH_ERR_mark_stripe;
259 			goto err;
260 		}
261 	}
262 
263 	if (sectors) {
264 		ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type,
265 					     a->gen, a->data_type, &a->dirty_sectors);
266 		if (ret)
267 			goto err;
268 	}
269 
270 	if (!deleting) {
271 		a->stripe		= s.k->p.offset;
272 		a->stripe_redundancy	= s.v->nr_redundant;
273 		alloc_data_type_set(a, data_type);
274 	} else {
275 		a->stripe		= 0;
276 		a->stripe_redundancy	= 0;
277 		alloc_data_type_set(a, BCH_DATA_user);
278 	}
279 err:
280 	printbuf_exit(&buf);
281 	return ret;
282 }
283 
284 static int mark_stripe_bucket(struct btree_trans *trans,
285 			      struct bkey_s_c_stripe s,
286 			      unsigned ptr_idx, bool deleting,
287 			      enum btree_iter_update_trigger_flags flags)
288 {
289 	struct bch_fs *c = trans->c;
290 	const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
291 	struct printbuf buf = PRINTBUF;
292 	int ret = 0;
293 
294 	struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
295 	if (unlikely(!ca)) {
296 		if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
297 			ret = -BCH_ERR_mark_stripe;
298 		goto err;
299 	}
300 
301 	struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
302 
303 	if (flags & BTREE_TRIGGER_transactional) {
304 		struct extent_ptr_decoded p = {
305 			.ptr = *ptr,
306 			.crc = bch2_extent_crc_unpack(s.k, NULL),
307 		};
308 		struct bkey_i_backpointer bp;
309 		bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
310 				      (const union bch_extent_entry *) ptr, &bp);
311 
312 		struct bkey_i_alloc_v4 *a =
313 			bch2_trans_start_alloc_update(trans, bucket, 0);
314 		ret   = PTR_ERR_OR_ZERO(a) ?:
315 			__mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
316 			bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
317 						    !(flags & BTREE_TRIGGER_overwrite));
318 		if (ret)
319 			goto err;
320 	}
321 
322 	if (flags & BTREE_TRIGGER_gc) {
323 		struct bucket *g = gc_bucket(ca, bucket.offset);
324 		if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
325 					    ptr->dev,
326 					    (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
327 			ret = -BCH_ERR_mark_stripe;
328 			goto err;
329 		}
330 
331 		bucket_lock(g);
332 		struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
333 		ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
334 		alloc_to_bucket(g, new);
335 		bucket_unlock(g);
336 
337 		if (!ret)
338 			ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
339 	}
340 err:
341 	bch2_dev_put(ca);
342 	printbuf_exit(&buf);
343 	return ret;
344 }
345 
346 static int mark_stripe_buckets(struct btree_trans *trans,
347 			       struct bkey_s_c old, struct bkey_s_c new,
348 			       enum btree_iter_update_trigger_flags flags)
349 {
350 	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
351 		? bkey_s_c_to_stripe(old).v : NULL;
352 	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
353 		? bkey_s_c_to_stripe(new).v : NULL;
354 
355 	BUG_ON(old_s && new_s && old_s->nr_blocks != new_s->nr_blocks);
356 
357 	unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
358 
359 	for (unsigned i = 0; i < nr_blocks; i++) {
360 		if (new_s && old_s &&
361 		    !memcmp(&new_s->ptrs[i],
362 			    &old_s->ptrs[i],
363 			    sizeof(new_s->ptrs[i])))
364 			continue;
365 
366 		if (new_s) {
367 			int ret = mark_stripe_bucket(trans,
368 					bkey_s_c_to_stripe(new), i, false, flags);
369 			if (ret)
370 				return ret;
371 		}
372 
373 		if (old_s) {
374 			int ret = mark_stripe_bucket(trans,
375 					bkey_s_c_to_stripe(old), i, true, flags);
376 			if (ret)
377 				return ret;
378 		}
379 	}
380 
381 	return 0;
382 }
383 
384 int bch2_trigger_stripe(struct btree_trans *trans,
385 			enum btree_id btree, unsigned level,
386 			struct bkey_s_c old, struct bkey_s _new,
387 			enum btree_iter_update_trigger_flags flags)
388 {
389 	struct bkey_s_c new = _new.s_c;
390 	struct bch_fs *c = trans->c;
391 	u64 idx = new.k->p.offset;
392 	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
393 		? bkey_s_c_to_stripe(old).v : NULL;
394 	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
395 		? bkey_s_c_to_stripe(new).v : NULL;
396 
397 	if (unlikely(flags & BTREE_TRIGGER_check_repair))
398 		return bch2_check_fix_ptrs(trans, btree, level, _new.s_c, flags);
399 
400 	BUG_ON(new_s && old_s &&
401 	       (new_s->nr_blocks	!= old_s->nr_blocks ||
402 		new_s->nr_redundant	!= old_s->nr_redundant));
403 
404 	if (flags & BTREE_TRIGGER_transactional) {
405 		int ret = bch2_lru_change(trans,
406 					  BCH_LRU_STRIPE_FRAGMENTATION,
407 					  idx,
408 					  stripe_lru_pos(old_s),
409 					  stripe_lru_pos(new_s));
410 		if (ret)
411 			return ret;
412 	}
413 
414 	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
415 		/*
416 		 * If the pointers aren't changing, we don't need to do anything:
417 		 */
418 		if (new_s && old_s &&
419 		    new_s->nr_blocks	== old_s->nr_blocks &&
420 		    new_s->nr_redundant	== old_s->nr_redundant &&
421 		    !memcmp(old_s->ptrs, new_s->ptrs,
422 			    new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
423 			return 0;
424 
425 		struct gc_stripe *gc = NULL;
426 		if (flags & BTREE_TRIGGER_gc) {
427 			gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
428 			if (!gc) {
429 				bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
430 				return -BCH_ERR_ENOMEM_mark_stripe;
431 			}
432 
433 			/*
434 			 * This will be wrong when we bring back runtime gc: we should
435 			 * be unmarking the old key and then marking the new key
436 			 *
437 			 * Also: when we bring back runtime gc, locking
438 			 */
439 			gc->alive	= true;
440 			gc->sectors	= le16_to_cpu(new_s->sectors);
441 			gc->nr_blocks	= new_s->nr_blocks;
442 			gc->nr_redundant	= new_s->nr_redundant;
443 
444 			for (unsigned i = 0; i < new_s->nr_blocks; i++)
445 				gc->ptrs[i] = new_s->ptrs[i];
446 
447 			/*
448 			 * gc recalculates this field from stripe ptr
449 			 * references:
450 			 */
451 			memset(gc->block_sectors, 0, sizeof(gc->block_sectors));
452 		}
453 
454 		if (new_s) {
455 			s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
456 
457 			struct disk_accounting_pos acc;
458 			memset(&acc, 0, sizeof(acc));
459 			acc.type = BCH_DISK_ACCOUNTING_replicas;
460 			bch2_bkey_to_replicas(&acc.replicas, new);
461 			int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
462 			if (ret)
463 				return ret;
464 
465 			if (gc)
466 				unsafe_memcpy(&gc->r.e, &acc.replicas,
467 					      replicas_entry_bytes(&acc.replicas), "VLA");
468 		}
469 
470 		if (old_s) {
471 			s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
472 
473 			struct disk_accounting_pos acc;
474 			memset(&acc, 0, sizeof(acc));
475 			acc.type = BCH_DISK_ACCOUNTING_replicas;
476 			bch2_bkey_to_replicas(&acc.replicas, old);
477 			int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
478 			if (ret)
479 				return ret;
480 		}
481 
482 		int ret = mark_stripe_buckets(trans, old, new, flags);
483 		if (ret)
484 			return ret;
485 	}
486 
487 	return 0;
488 }
489 
490 /* returns the ptr that matched the stripe; the matched block's index is returned via *block: */
491 static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
492 						struct bkey_s_c k, unsigned *block)
493 {
494 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
495 	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
496 
497 	bkey_for_each_ptr(ptrs, ptr)
498 		for (i = 0; i < nr_data; i++)
499 			if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
500 						      le16_to_cpu(s->sectors))) {
501 				*block = i;
502 				return ptr;
503 			}
504 
505 	return NULL;
506 }
507 
508 static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
509 {
510 	switch (k.k->type) {
511 	case KEY_TYPE_extent: {
512 		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
513 		const union bch_extent_entry *entry;
514 
515 		extent_for_each_entry(e, entry)
516 			if (extent_entry_type(entry) ==
517 			    BCH_EXTENT_ENTRY_stripe_ptr &&
518 			    entry->stripe_ptr.idx == idx)
519 				return true;
520 
521 		break;
522 	}
523 	}
524 
525 	return false;
526 }
527 
528 /* Stripe bufs: */
529 
530 static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
531 {
532 	if (buf->key.k.type == KEY_TYPE_stripe) {
533 		struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key);
534 		unsigned i;
535 
536 		for (i = 0; i < s->v.nr_blocks; i++) {
537 			kvfree(buf->data[i]);
538 			buf->data[i] = NULL;
539 		}
540 	}
541 }
542 
543 /* XXX: this is a non-mempoolified memory allocation: */
544 static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
545 			      unsigned offset, unsigned size)
546 {
547 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
548 	unsigned csum_granularity = 1U << v->csum_granularity_bits;
549 	unsigned end = offset + size;
550 	unsigned i;
551 
552 	BUG_ON(end > le16_to_cpu(v->sectors));
553 
554 	offset	= round_down(offset, csum_granularity);
555 	end	= min_t(unsigned, le16_to_cpu(v->sectors),
556 			round_up(end, csum_granularity));
557 
558 	buf->offset	= offset;
559 	buf->size	= end - offset;
560 
561 	memset(buf->valid, 0xFF, sizeof(buf->valid));
562 
563 	for (i = 0; i < v->nr_blocks; i++) {
564 		buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
565 		if (!buf->data[i])
566 			goto err;
567 	}
568 
569 	return 0;
570 err:
571 	ec_stripe_buf_exit(buf);
572 	return -BCH_ERR_ENOMEM_stripe_buf;
573 }
574 
575 /* Checksumming: */
576 
577 static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
578 					 unsigned block, unsigned offset)
579 {
580 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
581 	unsigned csum_granularity = 1 << v->csum_granularity_bits;
582 	unsigned end = buf->offset + buf->size;
583 	unsigned len = min(csum_granularity, end - offset);
584 
585 	BUG_ON(offset >= end);
586 	BUG_ON(offset <  buf->offset);
587 	BUG_ON(offset & (csum_granularity - 1));
588 	BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
589 	       (len & (csum_granularity - 1)));
590 
591 	return bch2_checksum(NULL, v->csum_type,
592 			     null_nonce(),
593 			     buf->data[block] + ((offset - buf->offset) << 9),
594 			     len << 9);
595 }
596 
597 static void ec_generate_checksums(struct ec_stripe_buf *buf)
598 {
599 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
600 	unsigned i, j, csums_per_device = stripe_csums_per_device(v);
601 
602 	if (!v->csum_type)
603 		return;
604 
605 	BUG_ON(buf->offset);
606 	BUG_ON(buf->size != le16_to_cpu(v->sectors));
607 
608 	for (i = 0; i < v->nr_blocks; i++)
609 		for (j = 0; j < csums_per_device; j++)
610 			stripe_csum_set(v, i, j,
611 				ec_block_checksum(buf, i, j << v->csum_granularity_bits));
612 }
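
/*
 * Checksum layout sketch: with sectors = 512 and csum_granularity_bits = 7,
 * stripe_csums_per_device() is DIV_ROUND_UP(512, 128) = 4, so each block
 * stores four checksums, and checksum j of block i covers that block's
 * sectors [j * 128, min((j + 1) * 128, 512)) - exactly the range
 * ec_block_checksum() hashes for offset = j << csum_granularity_bits.
 */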
613 
614 static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
615 {
616 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
617 	unsigned csum_granularity = 1 << v->csum_granularity_bits;
618 	unsigned i;
619 
620 	if (!v->csum_type)
621 		return;
622 
623 	for (i = 0; i < v->nr_blocks; i++) {
624 		unsigned offset = buf->offset;
625 		unsigned end = buf->offset + buf->size;
626 
627 		if (!test_bit(i, buf->valid))
628 			continue;
629 
630 		while (offset < end) {
631 			unsigned j = offset >> v->csum_granularity_bits;
632 			unsigned len = min(csum_granularity, end - offset);
633 			struct bch_csum want = stripe_csum_get(v, i, j);
634 			struct bch_csum got = ec_block_checksum(buf, i, offset);
635 
636 			if (bch2_crc_cmp(want, got)) {
637 				struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev);
638 				if (ca) {
639 					struct printbuf err = PRINTBUF;
640 
641 					prt_str(&err, "stripe ");
642 					bch2_csum_err_msg(&err, v->csum_type, want, got);
643 					prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
644 					bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
645 					bch_err_ratelimited(ca, "%s", err.buf);
646 					printbuf_exit(&err);
647 
648 					bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
649 				}
650 
651 				clear_bit(i, buf->valid);
652 				break;
653 			}
654 
655 			offset += len;
656 		}
657 	}
658 }
659 
660 /* Erasure coding: */
661 
662 static void ec_generate_ec(struct ec_stripe_buf *buf)
663 {
664 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
665 	unsigned nr_data = v->nr_blocks - v->nr_redundant;
666 	unsigned bytes = le16_to_cpu(v->sectors) << 9;
667 
668 	raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
669 }
670 
671 static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
672 {
673 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
674 
675 	return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks);
676 }
677 
678 static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
679 {
680 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
681 	unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
682 	unsigned nr_data = v->nr_blocks - v->nr_redundant;
683 	unsigned bytes = buf->size << 9;
684 
685 	if (ec_nr_failed(buf) > v->nr_redundant) {
686 		bch_err_ratelimited(c,
687 			"error doing reconstruct read: unable to read enough blocks");
688 		return -1;
689 	}
690 
691 	for (i = 0; i < nr_data; i++)
692 		if (!test_bit(i, buf->valid))
693 			failed[nr_failed++] = i;
694 
695 	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
696 	return 0;
697 }
698 
699 /* IO: */
700 
701 static void ec_block_endio(struct bio *bio)
702 {
703 	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
704 	struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v;
705 	struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
706 	struct bch_dev *ca = ec_bio->ca;
707 	struct closure *cl = bio->bi_private;
708 	int rw = ec_bio->rw;
709 
710 	bch2_account_io_completion(ca, bio_data_dir(bio),
711 				   ec_bio->submit_time, !bio->bi_status);
712 
713 	if (bio->bi_status) {
714 		bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
715 			       str_write_read(bio_data_dir(bio)),
716 			       bch2_blk_status_to_str(bio->bi_status));
717 		clear_bit(ec_bio->idx, ec_bio->buf->valid);
718 	}
719 
720 	int stale = dev_ptr_stale(ca, ptr);
721 	if (stale) {
722 		bch_err_ratelimited(ca->fs,
723 				    "error %s stripe: stale/invalid pointer (%i) after io",
724 				    bio_data_dir(bio) == READ ? "reading from" : "writing to",
725 				    stale);
726 		clear_bit(ec_bio->idx, ec_bio->buf->valid);
727 	}
728 
729 	bio_put(&ec_bio->bio);
730 	percpu_ref_put(&ca->io_ref[rw]);
731 	closure_put(cl);
732 }
733 
734 static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
735 			blk_opf_t opf, unsigned idx, struct closure *cl)
736 {
737 	struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
738 	unsigned offset = 0, bytes = buf->size << 9;
739 	struct bch_extent_ptr *ptr = &v->ptrs[idx];
740 	enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant
741 		? BCH_DATA_user
742 		: BCH_DATA_parity;
743 	int rw = op_is_write(opf);
744 
745 	struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
746 	if (!ca) {
747 		clear_bit(idx, buf->valid);
748 		return;
749 	}
750 
751 	int stale = dev_ptr_stale(ca, ptr);
752 	if (stale) {
753 		bch_err_ratelimited(c,
754 				    "error %s stripe: stale pointer (%i)",
755 				    rw == READ ? "reading from" : "writing to",
756 				    stale);
757 		clear_bit(idx, buf->valid);
758 		return;
759 	}
760 
761 
762 	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
763 
764 	while (offset < bytes) {
765 		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
766 					   DIV_ROUND_UP(bytes, PAGE_SIZE));
767 		unsigned b = min_t(size_t, bytes - offset,
768 				   nr_iovecs << PAGE_SHIFT);
769 		struct ec_bio *ec_bio;
770 
771 		ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev,
772 						       nr_iovecs,
773 						       opf,
774 						       GFP_KERNEL,
775 						       &c->ec_bioset),
776 				      struct ec_bio, bio);
777 
778 		ec_bio->ca			= ca;
779 		ec_bio->buf			= buf;
780 		ec_bio->idx			= idx;
781 		ec_bio->rw			= rw;
782 		ec_bio->submit_time		= local_clock();
783 
784 		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (offset >> 9);
785 		ec_bio->bio.bi_end_io		= ec_block_endio;
786 		ec_bio->bio.bi_private		= cl;
787 
788 		bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
789 
790 		closure_get(cl);
791 		percpu_ref_get(&ca->io_ref[rw]);
792 
793 		submit_bio(&ec_bio->bio);
794 
795 		offset += b;
796 	}
797 
798 	percpu_ref_put(&ca->io_ref[rw]);
799 }
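
/*
 * Why ec_block_io() loops: one stripe block spans a whole bucket, which can
 * be larger than a single bio can map (BIO_MAX_VECS pages). E.g. with 4 KiB
 * pages a bio covers at most 256 * 4 KiB = 1 MiB, so reading or writing a
 * 2 MiB bucket-sized block is split into two bios, each completing through
 * ec_block_endio() and dropping its own closure and io_ref references.
 */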
800 
801 static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
802 				struct ec_stripe_buf *stripe)
803 {
804 	struct btree_iter iter;
805 	struct bkey_s_c k;
806 	int ret;
807 
808 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
809 			       POS(0, idx), BTREE_ITER_slots);
810 	ret = bkey_err(k);
811 	if (ret)
812 		goto err;
813 	if (k.k->type != KEY_TYPE_stripe) {
814 		ret = -ENOENT;
815 		goto err;
816 	}
817 	bkey_reassemble(&stripe->key, k);
818 err:
819 	bch2_trans_iter_exit(trans, &iter);
820 	return ret;
821 }
822 
823 /* recovery read path: */
824 int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
825 			struct bkey_s_c orig_k)
826 {
827 	struct bch_fs *c = trans->c;
828 	struct ec_stripe_buf *buf = NULL;
829 	struct closure cl;
830 	struct bch_stripe *v;
831 	unsigned i, offset;
832 	const char *msg = NULL;
833 	struct printbuf msgbuf = PRINTBUF;
834 	int ret = 0;
835 
836 	closure_init_stack(&cl);
837 
838 	BUG_ON(!rbio->pick.has_ec);
839 
840 	buf = kzalloc(sizeof(*buf), GFP_NOFS);
841 	if (!buf)
842 		return -BCH_ERR_ENOMEM_ec_read_extent;
843 
844 	ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
845 	if (ret) {
846 		msg = "stripe not found";
847 		goto err;
848 	}
849 
850 	v = &bkey_i_to_stripe(&buf->key)->v;
851 
852 	if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
853 		msg = "pointer doesn't match stripe";
854 		goto err;
855 	}
856 
857 	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
858 	if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
859 		msg = "read is bigger than stripe";
860 		goto err;
861 	}
862 
863 	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
864 	if (ret) {
865 		msg = "-ENOMEM";
866 		goto err;
867 	}
868 
869 	for (i = 0; i < v->nr_blocks; i++)
870 		ec_block_io(c, buf, REQ_OP_READ, i, &cl);
871 
872 	closure_sync(&cl);
873 
874 	if (ec_nr_failed(buf) > v->nr_redundant) {
875 		msg = "unable to read enough blocks";
876 		goto err;
877 	}
878 
879 	ec_validate_checksums(c, buf);
880 
881 	ret = ec_do_recov(c, buf);
882 	if (ret)
883 		goto err;
884 
885 	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
886 		      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
887 out:
888 	ec_stripe_buf_exit(buf);
889 	kfree(buf);
890 	return ret;
891 err:
892 	bch2_bkey_val_to_text(&msgbuf, c, orig_k);
893 	bch_err_ratelimited(c,
894 			    "error doing reconstruct read: %s\n  %s", msg, msgbuf.buf);
895 	printbuf_exit(&msgbuf);
896 	ret = -BCH_ERR_stripe_reconstruct;
897 	goto out;
898 }
899 
900 /* stripe bucket accounting: */
901 
902 static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
903 {
904 	if (c->gc_pos.phase != GC_PHASE_not_running &&
905 	    !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
906 		return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
907 
908 	return 0;
909 }
910 
911 static int ec_stripe_mem_alloc(struct btree_trans *trans,
912 			       struct btree_iter *iter)
913 {
914 	return allocate_dropping_locks_errcode(trans,
915 			__ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp));
916 }
917 
918 /*
919  * Hash table of open stripes:
920  * Stripes that are being created or modified are kept in a hash table, so that
921  * stripe deletion can skip them.
922  */
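
/*
 * Usage sketch (simplified, error handling omitted): stripe creation and
 * reuse bracket their work with these helpers so the index stays "open"
 * while in flight:
 *
 *	if (bch2_try_open_stripe(c, s, idx)) {
 *		... allocate buckets, write blocks, update the stripe key ...
 *		bch2_stripe_close(c, s);
 *	}
 *
 * ec_stripe_delete() below checks bch2_stripe_is_open() with the stripe key
 * locked, so an empty-looking stripe that is still being rewritten is skipped.
 */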
923 
924 static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
925 {
926 	unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
927 	struct ec_stripe_new *s;
928 
929 	hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash)
930 		if (s->idx == idx)
931 			return true;
932 	return false;
933 }
934 
935 static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx)
936 {
937 	bool ret = false;
938 
939 	spin_lock(&c->ec_stripes_new_lock);
940 	ret = __bch2_stripe_is_open(c, idx);
941 	spin_unlock(&c->ec_stripes_new_lock);
942 
943 	return ret;
944 }
945 
946 static bool bch2_try_open_stripe(struct bch_fs *c,
947 				 struct ec_stripe_new *s,
948 				 u64 idx)
949 {
950 	bool ret;
951 
952 	spin_lock(&c->ec_stripes_new_lock);
953 	ret = !__bch2_stripe_is_open(c, idx);
954 	if (ret) {
955 		unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
956 
957 		s->idx = idx;
958 		hlist_add_head(&s->hash, &c->ec_stripes_new[hash]);
959 	}
960 	spin_unlock(&c->ec_stripes_new_lock);
961 
962 	return ret;
963 }
964 
965 static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
966 {
967 	BUG_ON(!s->idx);
968 
969 	spin_lock(&c->ec_stripes_new_lock);
970 	hlist_del_init(&s->hash);
971 	spin_unlock(&c->ec_stripes_new_lock);
972 
973 	s->idx = 0;
974 }
975 
976 /* stripe deletion */
977 
978 static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
979 {
980 	struct btree_iter iter;
981 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
982 					       BTREE_ID_stripes, POS(0, idx),
983 					       BTREE_ITER_intent);
984 	int ret = bkey_err(k);
985 	if (ret)
986 		goto err;
987 
988 	/*
989 	 * We expect write buffer races here
990 	 * Important: check stripe_is_open with stripe key locked:
991 	 */
992 	if (k.k->type == KEY_TYPE_stripe &&
993 	    !bch2_stripe_is_open(trans->c, idx) &&
994 	    stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
995 		ret = bch2_btree_delete_at(trans, &iter, 0);
996 err:
997 	bch2_trans_iter_exit(trans, &iter);
998 	return ret;
999 }
1000 
1001 /*
1002  * XXX
1003  * can we kill this and delete stripes from the trigger?
1004  */
1005 static void ec_stripe_delete_work(struct work_struct *work)
1006 {
1007 	struct bch_fs *c =
1008 		container_of(work, struct bch_fs, ec_stripe_delete_work);
1009 
1010 	bch2_trans_run(c,
1011 		bch2_btree_write_buffer_tryflush(trans) ?:
1012 		for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
1013 				lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
1014 				lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
1015 				0, lru_k,
1016 				NULL, NULL,
1017 				BCH_TRANS_COMMIT_no_enospc, ({
1018 			ec_stripe_delete(trans, lru_k.k->p.offset);
1019 		})));
1020 	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
1021 }
1022 
1023 void bch2_do_stripe_deletes(struct bch_fs *c)
1024 {
1025 	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
1026 	    !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
1027 		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
1028 }
1029 
1030 /* stripe creation: */
1031 
1032 static int ec_stripe_key_update(struct btree_trans *trans,
1033 				struct bkey_i_stripe *old,
1034 				struct bkey_i_stripe *new)
1035 {
1036 	struct bch_fs *c = trans->c;
1037 	bool create = !old;
1038 
1039 	struct btree_iter iter;
1040 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
1041 					       new->k.p, BTREE_ITER_intent);
1042 	int ret = bkey_err(k);
1043 	if (ret)
1044 		goto err;
1045 
1046 	if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
1047 				    c, "error %s stripe: got existing key type %s",
1048 				    create ? "creating" : "updating",
1049 				    bch2_bkey_types[k.k->type])) {
1050 		ret = -EINVAL;
1051 		goto err;
1052 	}
1053 
1054 	if (k.k->type == KEY_TYPE_stripe) {
1055 		const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
1056 
1057 		BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
1058 		BUG_ON(old->v.nr_blocks != v->nr_blocks);
1059 
1060 		for (unsigned i = 0; i < new->v.nr_blocks; i++) {
1061 			unsigned sectors = stripe_blockcount_get(v, i);
1062 
1063 			if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
1064 				struct printbuf buf = PRINTBUF;
1065 
1066 				prt_printf(&buf, "stripe changed nonempty block %u", i);
1067 				prt_str(&buf, "\nold: ");
1068 				bch2_bkey_val_to_text(&buf, c, k);
1069 				prt_str(&buf, "\nnew: ");
1070 				bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
1071 				bch2_fs_inconsistent(c, "%s", buf.buf);
1072 				printbuf_exit(&buf);
1073 				ret = -EINVAL;
1074 				goto err;
1075 			}
1076 
1077 			/*
1078 			 * If the stripe ptr changed underneath us, it must have
1079 			 * been dev_remove_stripes() -> invalidate_stripe_to_dev()
1080 			 */
1081 			if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
1082 				BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
1083 
1084 				if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
1085 					new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
1086 			}
1087 
1088 			stripe_blockcount_set(&new->v, i, sectors);
1089 		}
1090 	}
1091 
1092 	ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
1093 err:
1094 	bch2_trans_iter_exit(trans, &iter);
1095 	return ret;
1096 }
1097 
1098 static int ec_stripe_update_extent(struct btree_trans *trans,
1099 				   struct bch_dev *ca,
1100 				   struct bpos bucket, u8 gen,
1101 				   struct ec_stripe_buf *s,
1102 				   struct bkey_s_c_backpointer bp,
1103 				   struct bkey_buf *last_flushed)
1104 {
1105 	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1106 	struct bch_fs *c = trans->c;
1107 	struct btree_iter iter;
1108 	struct bkey_s_c k;
1109 	const struct bch_extent_ptr *ptr_c;
1110 	struct bch_extent_ptr *ec_ptr = NULL;
1111 	struct bch_extent_stripe_ptr stripe_ptr;
1112 	struct bkey_i *n;
1113 	int ret, dev, block;
1114 
1115 	if (bp.v->level) {
1116 		struct printbuf buf = PRINTBUF;
1117 		struct btree_iter node_iter;
1118 		struct btree *b;
1119 
1120 		b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed);
1121 		bch2_trans_iter_exit(trans, &node_iter);
1122 
1123 		if (!b)
1124 			return 0;
1125 
1126 		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
1127 		bch2_bkey_val_to_text(&buf, c, bp.s_c);
1128 
1129 		bch2_fs_inconsistent(c, "%s", buf.buf);
1130 		printbuf_exit(&buf);
1131 		return -BCH_ERR_erasure_coding_found_btree_node;
1132 	}
1133 
1134 	k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
1135 	ret = bkey_err(k);
1136 	if (ret)
1137 		return ret;
1138 	if (!k.k) {
1139 		/*
1140 		 * extent no longer exists - we could flush the btree
1141 		 * write buffer and retry to verify, but no need:
1142 		 */
1143 		return 0;
1144 	}
1145 
1146 	if (extent_has_stripe_ptr(k, s->key.k.p.offset))
1147 		goto out;
1148 
1149 	ptr_c = bkey_matches_stripe(v, k, &block);
1150 	/*
1151 	 * It doesn't generally make sense to erasure code cached ptrs:
1152 	 * XXX: should we be incrementing a counter?
1153 	 */
1154 	if (!ptr_c || ptr_c->cached)
1155 		goto out;
1156 
1157 	dev = v->ptrs[block].dev;
1158 
1159 	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr));
1160 	ret = PTR_ERR_OR_ZERO(n);
1161 	if (ret)
1162 		goto out;
1163 
1164 	bkey_reassemble(n, k);
1165 
1166 	bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev);
1167 	ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev);
1168 	BUG_ON(!ec_ptr);
1169 
1170 	stripe_ptr = (struct bch_extent_stripe_ptr) {
1171 		.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
1172 		.block		= block,
1173 		.redundancy	= v->nr_redundant,
1174 		.idx		= s->key.k.p.offset,
1175 	};
1176 
1177 	__extent_entry_insert(n,
1178 			(union bch_extent_entry *) ec_ptr,
1179 			(union bch_extent_entry *) &stripe_ptr);
1180 
1181 	ret = bch2_trans_update(trans, &iter, n, 0);
1182 out:
1183 	bch2_trans_iter_exit(trans, &iter);
1184 	return ret;
1185 }
1186 
1187 static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s,
1188 				   unsigned block)
1189 {
1190 	struct bch_fs *c = trans->c;
1191 	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1192 	struct bch_extent_ptr ptr = v->ptrs[block];
1193 	int ret = 0;
1194 
1195 	struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
1196 	if (!ca)
1197 		return -BCH_ERR_ENOENT_dev_not_found;
1198 
1199 	struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
1200 
1201 	struct bkey_buf last_flushed;
1202 	bch2_bkey_buf_init(&last_flushed);
1203 	bkey_init(&last_flushed.k->k);
1204 
1205 	ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
1206 			bucket_pos_to_bp_start(ca, bucket_pos),
1207 			bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
1208 			NULL, NULL,
1209 			BCH_TRANS_COMMIT_no_check_rw|
1210 			BCH_TRANS_COMMIT_no_enospc, ({
1211 		if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
1212 			break;
1213 
1214 		if (bp_k.k->type != KEY_TYPE_backpointer)
1215 			continue;
1216 
1217 		struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
1218 		if (bp.v->btree_id == BTREE_ID_stripes)
1219 			continue;
1220 
1221 		ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
1222 					bp, &last_flushed);
1223 	}));
1224 
1225 	bch2_bkey_buf_exit(&last_flushed, c);
1226 	bch2_dev_put(ca);
1227 	return ret;
1228 }
1229 
1230 static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
1231 {
1232 	struct btree_trans *trans = bch2_trans_get(c);
1233 	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
1234 	unsigned nr_data = v->nr_blocks - v->nr_redundant;
1235 
1236 	int ret = bch2_btree_write_buffer_flush_sync(trans);
1237 	if (ret)
1238 		goto err;
1239 
1240 	for (unsigned i = 0; i < nr_data; i++) {
1241 		ret = ec_stripe_update_bucket(trans, s, i);
1242 		if (ret)
1243 			break;
1244 	}
1245 err:
1246 	bch2_trans_put(trans);
1247 	return ret;
1248 }
1249 
1250 static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
1251 				       struct ec_stripe_new *s,
1252 				       unsigned block,
1253 				       struct open_bucket *ob)
1254 {
1255 	struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
1256 	if (!ca) {
1257 		s->err = -BCH_ERR_erofs_no_writes;
1258 		return;
1259 	}
1260 
1261 	unsigned offset = ca->mi.bucket_size - ob->sectors_free;
1262 	memset(s->new_stripe.data[block] + (offset << 9),
1263 	       0,
1264 	       ob->sectors_free << 9);
1265 
1266 	int ret = blkdev_issue_zeroout(ca->disk_sb.bdev,
1267 			ob->bucket * ca->mi.bucket_size + offset,
1268 			ob->sectors_free,
1269 			GFP_KERNEL, 0);
1270 
1271 	percpu_ref_put(&ca->io_ref[WRITE]);
1272 
1273 	if (ret)
1274 		s->err = ret;
1275 }
1276 
1277 void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s)
1278 {
1279 	if (s->idx)
1280 		bch2_stripe_close(c, s);
1281 	kfree(s);
1282 }
1283 
1284 /*
1285  * data buckets of new stripe all written: create the stripe
1286  */
1287 static void ec_stripe_create(struct ec_stripe_new *s)
1288 {
1289 	struct bch_fs *c = s->c;
1290 	struct open_bucket *ob;
1291 	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
1292 	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
1293 	int ret;
1294 
1295 	BUG_ON(s->h->s == s);
1296 
1297 	closure_sync(&s->iodone);
1298 
1299 	if (!s->err) {
1300 		for (i = 0; i < nr_data; i++)
1301 			if (s->blocks[i]) {
1302 				ob = c->open_buckets + s->blocks[i];
1303 
1304 				if (ob->sectors_free)
1305 					zero_out_rest_of_ec_bucket(c, s, i, ob);
1306 			}
1307 	}
1308 
1309 	if (s->err) {
1310 		if (!bch2_err_matches(s->err, EROFS))
1311 			bch_err(c, "error creating stripe: error writing data buckets");
1312 		ret = s->err;
1313 		goto err;
1314 	}
1315 
1316 	if (s->have_existing_stripe) {
1317 		ec_validate_checksums(c, &s->existing_stripe);
1318 
1319 		if (ec_do_recov(c, &s->existing_stripe)) {
1320 			bch_err(c, "error creating stripe: error reading existing stripe");
1321 			ret = -BCH_ERR_ec_block_read;
1322 			goto err;
1323 		}
1324 
1325 		for (i = 0; i < nr_data; i++)
1326 			if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i))
1327 				swap(s->new_stripe.data[i],
1328 				     s->existing_stripe.data[i]);
1329 
1330 		ec_stripe_buf_exit(&s->existing_stripe);
1331 	}
1332 
1333 	BUG_ON(!s->allocated);
1334 	BUG_ON(!s->idx);
1335 
1336 	ec_generate_ec(&s->new_stripe);
1337 
1338 	ec_generate_checksums(&s->new_stripe);
1339 
1340 	/* write p/q: */
1341 	for (i = nr_data; i < v->nr_blocks; i++)
1342 		ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
1343 	closure_sync(&s->iodone);
1344 
1345 	if (ec_nr_failed(&s->new_stripe)) {
1346 		bch_err(c, "error creating stripe: error writing redundancy buckets");
1347 		ret = -BCH_ERR_ec_block_write;
1348 		goto err;
1349 	}
1350 
1351 	ret = bch2_trans_commit_do(c, &s->res, NULL,
1352 		BCH_TRANS_COMMIT_no_check_rw|
1353 		BCH_TRANS_COMMIT_no_enospc,
1354 		ec_stripe_key_update(trans,
1355 				     s->have_existing_stripe
1356 				     ? bkey_i_to_stripe(&s->existing_stripe.key)
1357 				     : NULL,
1358 				     bkey_i_to_stripe(&s->new_stripe.key)));
1359 	bch_err_msg(c, ret, "creating stripe key");
1360 	if (ret) {
1361 		goto err;
1362 	}
1363 
1364 	ret = ec_stripe_update_extents(c, &s->new_stripe);
1365 	bch_err_msg(c, ret, "error updating extents");
1366 	if (ret)
1367 		goto err;
1368 err:
1369 	trace_stripe_create(c, s->idx, ret);
1370 
1371 	bch2_disk_reservation_put(c, &s->res);
1372 
1373 	for (i = 0; i < v->nr_blocks; i++)
1374 		if (s->blocks[i]) {
1375 			ob = c->open_buckets + s->blocks[i];
1376 
1377 			if (i < nr_data) {
1378 				ob->ec = NULL;
1379 				__bch2_open_bucket_put(c, ob);
1380 			} else {
1381 				bch2_open_bucket_put(c, ob);
1382 			}
1383 		}
1384 
1385 	mutex_lock(&c->ec_stripe_new_lock);
1386 	list_del(&s->list);
1387 	mutex_unlock(&c->ec_stripe_new_lock);
1388 	wake_up(&c->ec_stripe_new_wait);
1389 
1390 	ec_stripe_buf_exit(&s->existing_stripe);
1391 	ec_stripe_buf_exit(&s->new_stripe);
1392 	closure_debug_destroy(&s->iodone);
1393 
1394 	ec_stripe_new_put(c, s, STRIPE_REF_stripe);
1395 }
1396 
1397 static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c)
1398 {
1399 	struct ec_stripe_new *s;
1400 
1401 	mutex_lock(&c->ec_stripe_new_lock);
1402 	list_for_each_entry(s, &c->ec_stripe_new_list, list)
1403 		if (!atomic_read(&s->ref[STRIPE_REF_io]))
1404 			goto out;
1405 	s = NULL;
1406 out:
1407 	mutex_unlock(&c->ec_stripe_new_lock);
1408 
1409 	return s;
1410 }
1411 
1412 static void ec_stripe_create_work(struct work_struct *work)
1413 {
1414 	struct bch_fs *c = container_of(work,
1415 		struct bch_fs, ec_stripe_create_work);
1416 	struct ec_stripe_new *s;
1417 
1418 	while ((s = get_pending_stripe(c)))
1419 		ec_stripe_create(s);
1420 
1421 	bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1422 }
1423 
1424 void bch2_ec_do_stripe_creates(struct bch_fs *c)
1425 {
1426 	bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
1427 
1428 	if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
1429 		bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
1430 }
1431 
1432 static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
1433 {
1434 	struct ec_stripe_new *s = h->s;
1435 
1436 	lockdep_assert_held(&h->lock);
1437 
1438 	BUG_ON(!s->allocated && !s->err);
1439 
1440 	h->s		= NULL;
1441 	s->pending	= true;
1442 
1443 	mutex_lock(&c->ec_stripe_new_lock);
1444 	list_add(&s->list, &c->ec_stripe_new_list);
1445 	mutex_unlock(&c->ec_stripe_new_lock);
1446 
1447 	ec_stripe_new_put(c, s, STRIPE_REF_io);
1448 }
1449 
1450 static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err)
1451 {
1452 	h->s->err = err;
1453 	ec_stripe_new_set_pending(c, h);
1454 }
1455 
1456 void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
1457 {
1458 	struct ec_stripe_new *s = ob->ec;
1459 
1460 	s->err = err;
1461 }
1462 
1463 void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
1464 {
1465 	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
1466 	if (!ob)
1467 		return NULL;
1468 
1469 	BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]);
1470 
1471 	struct bch_dev *ca	= ob_dev(c, ob);
1472 	unsigned offset		= ca->mi.bucket_size - ob->sectors_free;
1473 
1474 	return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
1475 }
1476 
1477 static int unsigned_cmp(const void *_l, const void *_r)
1478 {
1479 	unsigned l = *((const unsigned *) _l);
1480 	unsigned r = *((const unsigned *) _r);
1481 
1482 	return cmp_int(l, r);
1483 }
1484 
1485 /* pick most common bucket size: */
1486 static unsigned pick_blocksize(struct bch_fs *c,
1487 			       struct bch_devs_mask *devs)
1488 {
1489 	unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
1490 	struct {
1491 		unsigned nr, size;
1492 	} cur = { 0, 0 }, best = { 0, 0 };
1493 
1494 	for_each_member_device_rcu(c, ca, devs)
1495 		sizes[nr++] = ca->mi.bucket_size;
1496 
1497 	sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
1498 
1499 	for (unsigned i = 0; i < nr; i++) {
1500 		if (sizes[i] != cur.size) {
1501 			if (cur.nr > best.nr)
1502 				best = cur;
1503 
1504 			cur.nr = 0;
1505 			cur.size = sizes[i];
1506 		}
1507 
1508 		cur.nr++;
1509 	}
1510 
1511 	if (cur.nr > best.nr)
1512 		best = cur;
1513 
1514 	return best.size;
1515 }
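
/*
 * Worked example for the scan above: member bucket sizes { 512, 1024, 1024,
 * 2048 } sort to the same order; the runs are 512 x1, 1024 x2, 2048 x1, so
 * pick_blocksize() returns 1024. On a tie the earlier (smaller) size wins,
 * since only a strictly longer run replaces the current best.
 */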
1516 
1517 static bool may_create_new_stripe(struct bch_fs *c)
1518 {
1519 	return false;
1520 }
1521 
1522 static void ec_stripe_key_init(struct bch_fs *c,
1523 			       struct bkey_i *k,
1524 			       unsigned nr_data,
1525 			       unsigned nr_parity,
1526 			       unsigned stripe_size,
1527 			       unsigned disk_label)
1528 {
1529 	struct bkey_i_stripe *s = bkey_stripe_init(k);
1530 	unsigned u64s;
1531 
1532 	s->v.sectors			= cpu_to_le16(stripe_size);
1533 	s->v.algorithm			= 0;
1534 	s->v.nr_blocks			= nr_data + nr_parity;
1535 	s->v.nr_redundant		= nr_parity;
1536 	s->v.csum_granularity_bits	= ilog2(c->opts.encoded_extent_max >> 9);
1537 	s->v.csum_type			= BCH_CSUM_crc32c;
1538 	s->v.disk_label			= disk_label;
1539 
1540 	while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
1541 		BUG_ON(1 << s->v.csum_granularity_bits >=
1542 		       le16_to_cpu(s->v.sectors) ||
1543 		       s->v.csum_granularity_bits == U8_MAX);
1544 		s->v.csum_granularity_bits++;
1545 	}
1546 
1547 	set_bkey_val_u64s(&s->k, u64s);
1548 }
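
/*
 * The while loop above keeps the key within BKEY_VAL_U64s_MAX: each extra
 * csum_granularity bit halves stripe_csums_per_device() and hence the number
 * of checksums stored per block, shrinking stripe_val_u64s(). E.g. a stripe
 * that does not fit at 128-sector granularity might be bumped to 256 sectors
 * per checksum (numbers illustrative).
 */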
1549 
1550 static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
1551 {
1552 	struct ec_stripe_new *s;
1553 
1554 	lockdep_assert_held(&h->lock);
1555 
1556 	s = kzalloc(sizeof(*s), GFP_KERNEL);
1557 	if (!s)
1558 		return NULL;
1559 
1560 	mutex_init(&s->lock);
1561 	closure_init(&s->iodone, NULL);
1562 	atomic_set(&s->ref[STRIPE_REF_stripe], 1);
1563 	atomic_set(&s->ref[STRIPE_REF_io], 1);
1564 	s->c		= c;
1565 	s->h		= h;
1566 	s->nr_data	= min_t(unsigned, h->nr_active_devs,
1567 				BCH_BKEY_PTRS_MAX) - h->redundancy;
1568 	s->nr_parity	= h->redundancy;
1569 
1570 	ec_stripe_key_init(c, &s->new_stripe.key,
1571 			   s->nr_data, s->nr_parity,
1572 			   h->blocksize, h->disk_label);
1573 	return s;
1574 }
1575 
1576 static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
1577 {
1578 	struct bch_devs_mask devs = h->devs;
1579 
1580 	rcu_read_lock();
1581 	h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
1582 				 ? group_to_target(h->disk_label - 1)
1583 				 : 0);
1584 	unsigned nr_devs = dev_mask_nr(&h->devs);
1585 
1586 	for_each_member_device_rcu(c, ca, &h->devs)
1587 		if (!ca->mi.durability)
1588 			__clear_bit(ca->dev_idx, h->devs.d);
1589 	unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
1590 
1591 	h->blocksize = pick_blocksize(c, &h->devs);
1592 
1593 	h->nr_active_devs = 0;
1594 	for_each_member_device_rcu(c, ca, &h->devs)
1595 		if (ca->mi.bucket_size == h->blocksize)
1596 			h->nr_active_devs++;
1597 
1598 	rcu_read_unlock();
1599 
1600 	/*
1601 	 * If we only have redundancy + 1 devices, we're better off with just
1602 	 * replication:
1603 	 */
1604 	h->insufficient_devs = h->nr_active_devs < h->redundancy + 2;
1605 
1606 	if (h->insufficient_devs) {
1607 		const char *err;
1608 
1609 		if (nr_devs < h->redundancy + 2)
1610 			err = NULL;
1611 		else if (nr_devs_with_durability < h->redundancy + 2)
1612 			err = "cannot use durability=0 devices";
1613 		else
1614 			err = "mismatched bucket sizes";
1615 
1616 		if (err)
1617 			bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s",
1618 				h->nr_active_devs, h->redundancy + 2, err);
1619 	}
1620 
1621 	struct bch_devs_mask devs_leaving;
1622 	bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX);
1623 
1624 	if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving))
1625 		ec_stripe_new_cancel(c, h, -EINTR);
1626 
1627 	h->rw_devs_change_count = c->rw_devs_change_count;
1628 }
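
/*
 * Minimum-device arithmetic used above: erasure coding only pays off with at
 * least two data blocks, so nr_active_devs must be >= redundancy + 2. E.g.
 * with redundancy = 2, a 1+2 stripe stores three copies' worth of data -
 * no better than three plain replicas - so anything under four matching
 * devices marks the head insufficient_devs.
 */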
1629 
1630 static struct ec_stripe_head *
1631 ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label,
1632 			 unsigned algo, unsigned redundancy,
1633 			 enum bch_watermark watermark)
1634 {
1635 	struct ec_stripe_head *h;
1636 
1637 	h = kzalloc(sizeof(*h), GFP_KERNEL);
1638 	if (!h)
1639 		return NULL;
1640 
1641 	mutex_init(&h->lock);
1642 	BUG_ON(!mutex_trylock(&h->lock));
1643 
1644 	h->disk_label	= disk_label;
1645 	h->algo		= algo;
1646 	h->redundancy	= redundancy;
1647 	h->watermark	= watermark;
1648 
1649 	list_add(&h->list, &c->ec_stripe_head_list);
1650 	return h;
1651 }
1652 
1653 void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
1654 {
1655 	if (h->s &&
1656 	    h->s->allocated &&
1657 	    bitmap_weight(h->s->blocks_allocated,
1658 			  h->s->nr_data) == h->s->nr_data)
1659 		ec_stripe_new_set_pending(c, h);
1660 
1661 	mutex_unlock(&h->lock);
1662 }
1663 
1664 static struct ec_stripe_head *
1665 __bch2_ec_stripe_head_get(struct btree_trans *trans,
1666 			  unsigned disk_label,
1667 			  unsigned algo,
1668 			  unsigned redundancy,
1669 			  enum bch_watermark watermark)
1670 {
1671 	struct bch_fs *c = trans->c;
1672 	struct ec_stripe_head *h;
1673 	int ret;
1674 
1675 	if (!redundancy)
1676 		return NULL;
1677 
1678 	ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock);
1679 	if (ret)
1680 		return ERR_PTR(ret);
1681 
1682 	if (test_bit(BCH_FS_going_ro, &c->flags)) {
1683 		h = ERR_PTR(-BCH_ERR_erofs_no_writes);
1684 		goto err;
1685 	}
1686 
1687 	list_for_each_entry(h, &c->ec_stripe_head_list, list)
1688 		if (h->disk_label	== disk_label &&
1689 		    h->algo		== algo &&
1690 		    h->redundancy	== redundancy &&
1691 		    h->watermark	== watermark) {
1692 			ret = bch2_trans_mutex_lock(trans, &h->lock);
1693 			if (ret) {
1694 				h = ERR_PTR(ret);
1695 				goto err;
1696 			}
1697 			goto found;
1698 		}
1699 
1700 	h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
1701 	if (!h) {
1702 		h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
1703 		goto err;
1704 	}
1705 found:
1706 	if (h->rw_devs_change_count != c->rw_devs_change_count)
1707 		ec_stripe_head_devs_update(c, h);
1708 
1709 	if (h->insufficient_devs) {
1710 		mutex_unlock(&h->lock);
1711 		h = NULL;
1712 	}
1713 err:
1714 	mutex_unlock(&c->ec_stripe_head_lock);
1715 	return h;
1716 }
1717 
1718 static int new_stripe_alloc_buckets(struct btree_trans *trans,
1719 				    struct ec_stripe_head *h, struct ec_stripe_new *s,
1720 				    enum bch_watermark watermark, struct closure *cl)
1721 {
1722 	struct bch_fs *c = trans->c;
1723 	struct bch_devs_mask devs = h->devs;
1724 	struct open_bucket *ob;
1725 	struct open_buckets buckets;
1726 	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
1727 	unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
1728 	bool have_cache = true;
1729 	int ret = 0;
1730 
1731 	BUG_ON(v->nr_blocks	!= s->nr_data + s->nr_parity);
1732 	BUG_ON(v->nr_redundant	!= s->nr_parity);
1733 
1734 	/* We bypass the sector allocator which normally does this: */
1735 	bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
1736 
1737 	for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
1738 		/*
1739 		 * Note: we don't yet repair invalid blocks (failed/removed
1740 		 * devices) when reusing stripes - we still need a codepath to
1741 		 * walk backpointers and update all extents that point to that
1742 		 * block when updating the stripe
1743 		 */
1744 		if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
1745 			__clear_bit(v->ptrs[i].dev, devs.d);
1746 
1747 		if (i < s->nr_data)
1748 			nr_have_data++;
1749 		else
1750 			nr_have_parity++;
1751 	}
1752 
1753 	BUG_ON(nr_have_data	> s->nr_data);
1754 	BUG_ON(nr_have_parity	> s->nr_parity);
1755 
1756 	buckets.nr = 0;
1757 	if (nr_have_parity < s->nr_parity) {
1758 		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1759 					    &h->parity_stripe,
1760 					    &devs,
1761 					    s->nr_parity,
1762 					    &nr_have_parity,
1763 					    &have_cache, 0,
1764 					    BCH_DATA_parity,
1765 					    watermark,
1766 					    cl);
1767 
1768 		open_bucket_for_each(c, &buckets, ob, i) {
1769 			j = find_next_zero_bit(s->blocks_gotten,
1770 					       s->nr_data + s->nr_parity,
1771 					       s->nr_data);
1772 			BUG_ON(j >= s->nr_data + s->nr_parity);
1773 
1774 			s->blocks[j] = buckets.v[i];
1775 			v->ptrs[j] = bch2_ob_ptr(c, ob);
1776 			__set_bit(j, s->blocks_gotten);
1777 		}
1778 
1779 		if (ret)
1780 			return ret;
1781 	}
1782 
1783 	buckets.nr = 0;
1784 	if (nr_have_data < s->nr_data) {
1785 		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
1786 					    &h->block_stripe,
1787 					    &devs,
1788 					    s->nr_data,
1789 					    &nr_have_data,
1790 					    &have_cache, 0,
1791 					    BCH_DATA_user,
1792 					    watermark,
1793 					    cl);
1794 
1795 		open_bucket_for_each(c, &buckets, ob, i) {
1796 			j = find_next_zero_bit(s->blocks_gotten,
1797 					       s->nr_data, 0);
1798 			BUG_ON(j >= s->nr_data);
1799 
1800 			s->blocks[j] = buckets.v[i];
1801 			v->ptrs[j] = bch2_ob_ptr(c, ob);
1802 			__set_bit(j, s->blocks_gotten);
1803 		}
1804 
1805 		if (ret)
1806 			return ret;
1807 	}
1808 
1809 	return 0;
1810 }
1811 
1812 static int __get_existing_stripe(struct btree_trans *trans,
1813 				 struct ec_stripe_head *head,
1814 				 struct ec_stripe_buf *stripe,
1815 				 u64 idx)
1816 {
1817 	struct bch_fs *c = trans->c;
1818 
1819 	struct btree_iter iter;
1820 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
1821 					  BTREE_ID_stripes, POS(0, idx), 0);
1822 	int ret = bkey_err(k);
1823 	if (ret)
1824 		goto err;
1825 
1826 	/* We expect write buffer races here */
1827 	if (k.k->type != KEY_TYPE_stripe)
1828 		goto out;
1829 
1830 	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
1831 	if (stripe_lru_pos(s.v) <= 1)
1832 		goto out;
1833 
1834 	if (s.v->disk_label		== head->disk_label &&
1835 	    s.v->algorithm		== head->algo &&
1836 	    s.v->nr_redundant		== head->redundancy &&
1837 	    le16_to_cpu(s.v->sectors)	== head->blocksize &&
1838 	    bch2_try_open_stripe(c, head->s, idx)) {
1839 		bkey_reassemble(&stripe->key, k);
1840 		ret = 1;
1841 	}
1842 out:
1843 	bch2_set_btree_iter_dontneed(trans, &iter);
1844 err:
1845 	bch2_trans_iter_exit(trans, &iter);
1846 	return ret;
1847 }
1848 
1849 static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
1850 {
1851 	struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
1852 	struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
1853 	unsigned i;
1854 
1855 	BUG_ON(existing_v->nr_redundant != s->nr_parity);
1856 	s->nr_data = existing_v->nr_blocks -
1857 		existing_v->nr_redundant;
1858 
1859 	int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
1860 	if (ret) {
1861 		bch2_stripe_close(c, s);
1862 		return ret;
1863 	}
1864 
1865 	BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
1866 
1867 	/*
1868 	 * Free buckets we initially allocated - they might conflict with
1869 	 * blocks from the stripe we're reusing:
1870 	 */
1871 	for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
1872 		bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
1873 		s->blocks[i] = 0;
1874 	}
1875 	memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
1876 	memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
1877 
1878 	for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
1879 		if (stripe_blockcount_get(existing_v, i)) {
1880 			__set_bit(i, s->blocks_gotten);
1881 			__set_bit(i, s->blocks_allocated);
1882 		}
1883 
1884 		ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
1885 	}
1886 
1887 	bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
1888 	s->have_existing_stripe = true;
1889 
1890 	return 0;
1891 }
1892 
1893 static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
1894 				       struct ec_stripe_new *s)
1895 {
1896 	struct bch_fs *c = trans->c;
1897 
1898 	/*
1899 	 * If we can't allocate a new stripe, and there are no stripes with
1900 	 * empty blocks for us to reuse, we have to wait on copygc:
1901 	 */
1902 	if (may_create_new_stripe(c))
1903 		return -1;
1904 
1905 	struct btree_iter lru_iter;
1906 	struct bkey_s_c lru_k;
1907 	int ret = 0;
1908 
1909 	for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
1910 			lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
1911 			lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
1912 			0, lru_k, ret) {
1913 		ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
1914 		if (ret)
1915 			break;
1916 	}
1917 	bch2_trans_iter_exit(trans, &lru_iter);
1918 	if (!ret)
1919 		ret = -BCH_ERR_stripe_alloc_blocked;
1920 	if (ret == 1)
1921 		ret = 0;
1922 	if (ret)
1923 		return ret;
1924 
1925 	return init_new_stripe_from_existing(c, s);
1926 }
1927 
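/*
 * Reserve space for a brand new stripe, and find a free slot for it in the
 * stripes btree:
 */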
1928 static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
1929 					 struct ec_stripe_new *s)
1930 {
1931 	struct bch_fs *c = trans->c;
1932 	struct btree_iter iter;
1933 	struct bkey_s_c k;
1934 	struct bpos min_pos = POS(0, 1);
1935 	struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
1936 	int ret;
1937 
1938 	if (!s->res.sectors) {
1939 		ret = bch2_disk_reservation_get(c, &s->res,
1940 					h->blocksize,
1941 					s->nr_parity,
1942 					BCH_DISK_RESERVATION_NOFAIL);
1943 		if (ret)
1944 			return ret;
1945 	}
1946 
1947 	/*
1948 	 * Allocate stripe slot
1949 	 * XXX: we're going to need a bitrange btree of free stripes
1950 	 */
1951 	for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
1952 			   BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
1953 		if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
1954 			if (start_pos.offset) {
1955 				start_pos = min_pos;
1956 				bch2_btree_iter_set_pos(trans, &iter, start_pos);
1957 				continue;
1958 			}
1959 
1960 			ret = -BCH_ERR_ENOSPC_stripe_create;
1961 			break;
1962 		}
1963 
1964 		if (bkey_deleted(k.k) &&
1965 		    bch2_try_open_stripe(c, s, k.k->p.offset))
1966 			break;
1967 	}
1968 
1969 	c->ec_stripe_hint = iter.pos.offset;
1970 
1971 	if (ret)
1972 		goto err;
1973 
1974 	ret = ec_stripe_mem_alloc(trans, &iter);
1975 	if (ret) {
1976 		bch2_stripe_close(c, s);
1977 		goto err;
1978 	}
1979 
1980 	s->new_stripe.key.k.p = iter.pos;
1981 out:
1982 	bch2_trans_iter_exit(trans, &iter);
1983 	return ret;
1984 err:
1985 	bch2_disk_reservation_put(c, &s->res);
1986 	goto out;
1987 }
1988 
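/*
 * Get a stripe head for the given target/algorithm/redundancy, creating or
 * continuing an in-flight stripe as needed: first try to allocate buckets for
 * a brand new stripe, then fall back to reusing an existing stripe, blocking
 * on @cl if we have to wait for buckets to become available.
 */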
1989 struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
1990 					       unsigned target,
1991 					       unsigned algo,
1992 					       unsigned redundancy,
1993 					       enum bch_watermark watermark,
1994 					       struct closure *cl)
1995 {
1996 	struct bch_fs *c = trans->c;
1997 	struct ec_stripe_head *h;
1998 	bool waiting = false;
1999 	unsigned disk_label = 0;
2000 	struct target t = target_decode(target);
2001 	int ret;
2002 
2003 	if (t.type == TARGET_GROUP) {
2004 		if (t.group > U8_MAX) {
2005 			bch_err(c, "cannot create a stripe when disk_label > U8_MAX");
2006 			return NULL;
2007 		}
2008 		disk_label = t.group + 1; /* 0 == no label */
2009 	}
2010 
2011 	h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
2012 	if (IS_ERR_OR_NULL(h))
2013 		return h;
2014 
2015 	if (!h->s) {
2016 		h->s = ec_new_stripe_alloc(c, h);
2017 		if (!h->s) {
2018 			ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
2019 			bch_err(c, "failed to allocate new stripe");
2020 			goto err;
2021 		}
2022 
2023 		h->nr_created++;
2024 	}
2025 
2026 	struct ec_stripe_new *s = h->s;
2027 
2028 	if (s->allocated)
2029 		goto allocated;
2030 
2031 	if (s->have_existing_stripe)
2032 		goto alloc_existing;
2033 
2034 	/* First, try to allocate a full stripe: */
2035 	ret =   new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
2036 		__bch2_ec_stripe_head_reserve(trans, h, s);
2037 	if (!ret)
2038 		goto allocate_buf;
2039 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
2040 	    bch2_err_matches(ret, ENOMEM))
2041 		goto err;
2042 
2043 	/*
2044 	 * Not enough buckets available for a full stripe: we must reuse an
2045 	 * existing stripe:
2046 	 */
2047 	while (1) {
2048 		ret = __bch2_ec_stripe_head_reuse(trans, h, s);
2049 		if (!ret)
2050 			break;
2051 		if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
2052 			goto err;
2053 
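		/*
		 * copygc doesn't wait for buckets to be freed up; retry the
		 * allocation at its own watermark and reserve a new stripe
		 * slot instead:
		 */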
2054 		if (watermark == BCH_WATERMARK_copygc) {
2055 			ret =   new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
2056 				__bch2_ec_stripe_head_reserve(trans, h, s);
2057 			if (ret)
2058 				goto err;
2059 			goto allocate_buf;
2060 		}
2061 
2062 		/* XXX freelist_wait? */
2063 		closure_wait(&c->freelist_wait, cl);
2064 		waiting = true;
2065 	}
2066 
2067 	if (waiting)
2068 		closure_wake_up(&c->freelist_wait);
2069 alloc_existing:
2070 	/*
2071 	 * Retry allocating buckets, with the watermark for this
2072 	 * particular write:
2073 	 */
2074 	ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
2075 	if (ret)
2076 		goto err;
2077 
2078 allocate_buf:
2079 	ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
2080 	if (ret)
2081 		goto err;
2082 
2083 	s->allocated = true;
2084 allocated:
2085 	BUG_ON(!s->idx);
2086 	BUG_ON(!s->new_stripe.data[0]);
2087 	BUG_ON(trans->restarted);
2088 	return h;
2089 err:
2090 	bch2_ec_stripe_head_put(c, h);
2091 	return ERR_PTR(ret);
2092 }
2093 
2094 /* device removal */
2095 
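/*
 * Remove the device referenced by this alloc key from the stripe its bucket
 * belongs to: subtract the old replicas accounting, mark the device's pointer
 * as invalid, then re-add accounting for the reduced replica set.
 */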
2096 static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
2097 {
2098 	struct bch_alloc_v4 a_convert;
2099 	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
2100 
2101 	if (!a->stripe)
2102 		return 0;
2103 
2104 	if (a->stripe_sectors) {
2105 		bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
2106 		return -BCH_ERR_invalidate_stripe_to_dev;
2107 	}
2108 
2109 	struct btree_iter iter;
2110 	struct bkey_i_stripe *s =
2111 		bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
2112 					BTREE_ITER_slots, stripe);
2113 	int ret = PTR_ERR_OR_ZERO(s);
2114 	if (ret)
2115 		return ret;
2116 
2117 	struct disk_accounting_pos acc;
2118 
2119 	s64 sectors = 0;
2120 	for (unsigned i = 0; i < s->v.nr_blocks; i++)
2121 		sectors -= stripe_blockcount_get(&s->v, i);
2122 
2123 	memset(&acc, 0, sizeof(acc));
2124 	acc.type = BCH_DISK_ACCOUNTING_replicas;
2125 	bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
2126 	acc.replicas.data_type = BCH_DATA_user;
2127 	ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
2128 	if (ret)
2129 		goto err;
2130 
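	/* Drop the device's pointer, then re-account under the new replica set: */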
2131 	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
2132 	bkey_for_each_ptr(ptrs, ptr)
2133 		if (ptr->dev == k_a.k->p.inode)
2134 			ptr->dev = BCH_SB_MEMBER_INVALID;
2135 
2136 	sectors = -sectors;
2137 
2138 	memset(&acc, 0, sizeof(acc));
2139 	acc.type = BCH_DISK_ACCOUNTING_replicas;
2140 	bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
2141 	acc.replicas.data_type = BCH_DATA_user;
2142 	ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
2143 	if (ret)
2144 		goto err;
2145 err:
2146 	bch2_trans_iter_exit(trans, &iter);
2147 	return ret;
2148 }
2149 
2150 int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
2151 {
2152 	return bch2_trans_run(c,
2153 		for_each_btree_key_max_commit(trans, iter,
2154 				  BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
2155 				  BTREE_ITER_intent, k,
2156 				  NULL, NULL, 0, ({
2157 			bch2_invalidate_stripe_to_dev(trans, k);
2158 	})));
2159 }
2160 
2161 /* startup/shutdown */
2162 
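/*
 * Cancel any in-flight stripe that has an open bucket on @ca, or all
 * in-flight stripes if @ca is NULL:
 */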
2163 static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
2164 {
2165 	struct ec_stripe_head *h;
2166 	struct open_bucket *ob;
2167 	unsigned i;
2168 
2169 	mutex_lock(&c->ec_stripe_head_lock);
2170 	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2171 		mutex_lock(&h->lock);
2172 		if (!h->s)
2173 			goto unlock;
2174 
2175 		if (!ca)
2176 			goto found;
2177 
2178 		for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) {
2179 			if (!h->s->blocks[i])
2180 				continue;
2181 
2182 			ob = c->open_buckets + h->s->blocks[i];
2183 			if (ob->dev == ca->dev_idx)
2184 				goto found;
2185 		}
2186 		goto unlock;
2187 found:
2188 		ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes);
2189 unlock:
2190 		mutex_unlock(&h->lock);
2191 	}
2192 	mutex_unlock(&c->ec_stripe_head_lock);
2193 }
2194 
2195 void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
2196 {
2197 	__bch2_ec_stop(c, ca);
2198 }
2199 
2200 void bch2_fs_ec_stop(struct bch_fs *c)
2201 {
2202 	__bch2_ec_stop(c, NULL);
2203 }
2204 
2205 static bool bch2_fs_ec_flush_done(struct bch_fs *c)
2206 {
2207 	bool ret;
2208 
2209 	mutex_lock(&c->ec_stripe_new_lock);
2210 	ret = list_empty(&c->ec_stripe_new_list);
2211 	mutex_unlock(&c->ec_stripe_new_lock);
2212 
2213 	return ret;
2214 }
2215 
2216 void bch2_fs_ec_flush(struct bch_fs *c)
2217 {
2218 	wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
2219 }
2220 
2221 int bch2_stripes_read(struct bch_fs *c)
2222 {
2223 	return 0;
2224 }
2225 
2226 static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
2227 				    struct ec_stripe_new *s)
2228 {
2229 	prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs",
2230 		   s->idx, s->nr_data, s->nr_parity,
2231 		   bitmap_weight(s->blocks_allocated, s->nr_data),
2232 		   atomic_read(&s->ref[STRIPE_REF_io]),
2233 		   atomic_read(&s->ref[STRIPE_REF_stripe]),
2234 		   bch2_watermarks[s->h->watermark]);
2235 
2236 	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
2237 	unsigned i;
2238 	for_each_set_bit(i, s->blocks_gotten, v->nr_blocks)
2239 		prt_printf(out, " %u", s->blocks[i]);
2240 	prt_newline(out);
2241 	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key));
2242 	prt_newline(out);
2243 }
2244 
2245 void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
2246 {
2247 	struct ec_stripe_head *h;
2248 	struct ec_stripe_new *s;
2249 
2250 	mutex_lock(&c->ec_stripe_head_lock);
2251 	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
2252 		prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n",
2253 		       h->disk_label, h->algo, h->redundancy,
2254 		       bch2_watermarks[h->watermark],
2255 		       h->nr_created);
2256 
2257 		if (h->s)
2258 			bch2_new_stripe_to_text(out, c, h->s);
2259 	}
2260 	mutex_unlock(&c->ec_stripe_head_lock);
2261 
2262 	prt_printf(out, "in flight:\n");
2263 
2264 	mutex_lock(&c->ec_stripe_new_lock);
2265 	list_for_each_entry(s, &c->ec_stripe_new_list, list)
2266 		bch2_new_stripe_to_text(out, c, s);
2267 	mutex_unlock(&c->ec_stripe_new_lock);
2268 }
2269 
2270 void bch2_fs_ec_exit(struct bch_fs *c)
2271 {
2272 	struct ec_stripe_head *h;
2273 	unsigned i;
2274 
2275 	while (1) {
2276 		mutex_lock(&c->ec_stripe_head_lock);
2277 		h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
2278 		mutex_unlock(&c->ec_stripe_head_lock);
2279 
2280 		if (!h)
2281 			break;
2282 
2283 		if (h->s) {
2284 			for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++)
2285 				BUG_ON(h->s->blocks[i]);
2286 
2287 			kfree(h->s);
2288 		}
2289 		kfree(h);
2290 	}
2291 
2292 	BUG_ON(!list_empty(&c->ec_stripe_new_list));
2293 
2294 	bioset_exit(&c->ec_bioset);
2295 }
2296 
2297 void bch2_fs_ec_init_early(struct bch_fs *c)
2298 {
2299 	spin_lock_init(&c->ec_stripes_new_lock);
2300 
2301 	INIT_LIST_HEAD(&c->ec_stripe_head_list);
2302 	mutex_init(&c->ec_stripe_head_lock);
2303 
2304 	INIT_LIST_HEAD(&c->ec_stripe_new_list);
2305 	mutex_init(&c->ec_stripe_new_lock);
2306 	init_waitqueue_head(&c->ec_stripe_new_wait);
2307 
2308 	INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
2309 	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
2310 }
2311 
2312 int bch2_fs_ec_init(struct bch_fs *c)
2313 {
2314 	return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
2315 			   BIOSET_NEED_BVECS);
2316 }
2317 
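/*
 * fsck: verify that every stripe that should be on the fragmentation LRU has
 * a matching LRU entry:
 */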
2318 static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
2319 					struct bkey_s_c k,
2320 					struct bkey_buf *last_flushed)
2321 {
2322 	if (k.k->type != KEY_TYPE_stripe)
2323 		return 0;
2324 
2325 	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
2326 
2327 	u64 lru_idx = stripe_lru_pos(s.v);
2328 	if (lru_idx) {
2329 		int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
2330 					     k.k->p.offset, lru_idx, k, last_flushed);
2331 		if (ret)
2332 			return ret;
2333 	}
2334 	return 0;
2335 }
2336 
2337 int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
2338 {
2339 	struct bkey_buf last_flushed;
2340 
2341 	bch2_bkey_buf_init(&last_flushed);
2342 	bkey_init(&last_flushed.k->k);
2343 
2344 	int ret = bch2_trans_run(c,
2345 		for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
2346 				POS_MIN, BTREE_ITER_prefetch, k,
2347 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2348 			bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
2349 
2350 	bch2_bkey_buf_exit(&last_flushed, c);
2351 	bch_err_fn(c, ret);
2352 	return ret;
2353 }
2354