xref: /linux/fs/bcachefs/alloc_background.c (revision c94cd9508b1335b949fd13ebd269313c65492df0)
1 // SPDX-License-Identifier: GPL-2.0
2 #include "bcachefs.h"
3 #include "alloc_background.h"
4 #include "alloc_foreground.h"
5 #include "backpointers.h"
6 #include "bkey_buf.h"
7 #include "btree_cache.h"
8 #include "btree_io.h"
9 #include "btree_key_cache.h"
10 #include "btree_update.h"
11 #include "btree_update_interior.h"
12 #include "btree_gc.h"
13 #include "btree_write_buffer.h"
14 #include "buckets.h"
15 #include "buckets_waiting_for_journal.h"
16 #include "clock.h"
17 #include "debug.h"
18 #include "disk_accounting.h"
19 #include "ec.h"
20 #include "error.h"
21 #include "lru.h"
22 #include "recovery.h"
23 #include "trace.h"
24 #include "varint.h"
25 
26 #include <linux/kthread.h>
27 #include <linux/math64.h>
28 #include <linux/random.h>
29 #include <linux/rculist.h>
30 #include <linux/rcupdate.h>
31 #include <linux/sched/task.h>
32 #include <linux/sort.h>
33 #include <linux/jiffies.h>
34 
35 static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
36 
37 /* Persistent alloc info: */
38 
39 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
40 #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
41 	BCH_ALLOC_FIELDS_V1()
42 #undef x
43 };
44 
45 struct bkey_alloc_unpacked {
46 	u64		journal_seq;
47 	u8		gen;
48 	u8		oldest_gen;
49 	u8		data_type;
50 	bool		need_discard:1;
51 	bool		need_inc_gen:1;
52 #define x(_name, _bits)	u##_bits _name;
53 	BCH_ALLOC_FIELDS_V2()
54 #undef  x
55 };
56 
57 static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
58 				     const void **p, unsigned field)
59 {
60 	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
61 	u64 v;
62 
63 	if (!(a->fields & (1 << field)))
64 		return 0;
65 
66 	switch (bytes) {
67 	case 1:
68 		v = *((const u8 *) *p);
69 		break;
70 	case 2:
71 		v = le16_to_cpup(*p);
72 		break;
73 	case 4:
74 		v = le32_to_cpup(*p);
75 		break;
76 	case 8:
77 		v = le64_to_cpup(*p);
78 		break;
79 	default:
80 		BUG();
81 	}
82 
83 	*p += bytes;
84 	return v;
85 }
86 
87 static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
88 				 struct bkey_s_c k)
89 {
90 	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
91 	const void *d = in->data;
92 	unsigned idx = 0;
93 
94 	out->gen = in->gen;
95 
96 #define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
97 	BCH_ALLOC_FIELDS_V1()
98 #undef  x
99 }
100 
101 static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
102 				struct bkey_s_c k)
103 {
104 	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
105 	const u8 *in = a.v->data;
106 	const u8 *end = bkey_val_end(a);
107 	unsigned fieldnr = 0;
108 	int ret;
109 	u64 v;
110 
111 	out->gen	= a.v->gen;
112 	out->oldest_gen	= a.v->oldest_gen;
113 	out->data_type	= a.v->data_type;
114 
115 #define x(_name, _bits)							\
116 	if (fieldnr < a.v->nr_fields) {					\
117 		ret = bch2_varint_decode_fast(in, end, &v);		\
118 		if (ret < 0)						\
119 			return ret;					\
120 		in += ret;						\
121 	} else {							\
122 		v = 0;							\
123 	}								\
124 	out->_name = v;							\
125 	if (v != out->_name)						\
126 		return -1;						\
127 	fieldnr++;
128 
129 	BCH_ALLOC_FIELDS_V2()
130 #undef  x
131 	return 0;
132 }
133 
134 static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
135 				struct bkey_s_c k)
136 {
137 	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
138 	const u8 *in = a.v->data;
139 	const u8 *end = bkey_val_end(a);
140 	unsigned fieldnr = 0;
141 	int ret;
142 	u64 v;
143 
144 	out->gen	= a.v->gen;
145 	out->oldest_gen	= a.v->oldest_gen;
146 	out->data_type	= a.v->data_type;
147 	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
148 	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
149 	out->journal_seq = le64_to_cpu(a.v->journal_seq);
150 
151 #define x(_name, _bits)							\
152 	if (fieldnr < a.v->nr_fields) {					\
153 		ret = bch2_varint_decode_fast(in, end, &v);		\
154 		if (ret < 0)						\
155 			return ret;					\
156 		in += ret;						\
157 	} else {							\
158 		v = 0;							\
159 	}								\
160 	out->_name = v;							\
161 	if (v != out->_name)						\
162 		return -1;						\
163 	fieldnr++;
164 
165 	BCH_ALLOC_FIELDS_V2()
166 #undef  x
167 	return 0;
168 }
169 
170 static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
171 {
172 	struct bkey_alloc_unpacked ret = { .gen	= 0 };
173 
174 	switch (k.k->type) {
175 	case KEY_TYPE_alloc:
176 		bch2_alloc_unpack_v1(&ret, k);
177 		break;
178 	case KEY_TYPE_alloc_v2:
179 		bch2_alloc_unpack_v2(&ret, k);
180 		break;
181 	case KEY_TYPE_alloc_v3:
182 		bch2_alloc_unpack_v3(&ret, k);
183 		break;
184 	}
185 
186 	return ret;
187 }
188 
189 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
190 {
191 	unsigned i, bytes = offsetof(struct bch_alloc, data);
192 
193 	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
194 		if (a->fields & (1 << i))
195 			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
196 
197 	return DIV_ROUND_UP(bytes, sizeof(u64));
198 }
199 
200 int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
201 			   enum bch_validate_flags flags)
202 {
203 	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
204 	int ret = 0;
205 
206 	/* allow for unknown fields */
207 	bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v),
208 			 c, alloc_v1_val_size_bad,
209 			 "incorrect value size (%zu < %u)",
210 			 bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
211 fsck_err:
212 	return ret;
213 }
214 
215 int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
216 			   enum bch_validate_flags flags)
217 {
218 	struct bkey_alloc_unpacked u;
219 	int ret = 0;
220 
221 	bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k),
222 			 c, alloc_v2_unpack_error,
223 			 "unpack error");
224 fsck_err:
225 	return ret;
226 }
227 
228 int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
229 			   enum bch_validate_flags flags)
230 {
231 	struct bkey_alloc_unpacked u;
232 	int ret = 0;
233 
234 	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
235 			 c, alloc_v2_unpack_error,
236 			 "unpack error");
237 fsck_err:
238 	return ret;
239 }
240 
241 int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
242 			   enum bch_validate_flags flags)
243 {
244 	struct bch_alloc_v4 a;
245 	int ret = 0;
246 
247 	bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k));
248 
249 	bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k),
250 			 c, alloc_v4_val_size_bad,
251 			 "bad val size (%u > %zu)",
252 			 alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k));
253 
254 	bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) &&
255 			 BCH_ALLOC_V4_NR_BACKPOINTERS(&a),
256 			 c, alloc_v4_backpointers_start_bad,
257 			 "invalid backpointers_start");
258 
259 	bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type,
260 			 c, alloc_key_data_type_bad,
261 			 "invalid data type (got %u should be %u)",
262 			 a.data_type, alloc_data_type(a, a.data_type));
263 
264 	for (unsigned i = 0; i < 2; i++)
265 		bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX,
266 				 c, alloc_key_io_time_bad,
267 				 "invalid io_time[%s]: %llu, max %llu",
268 				 i == READ ? "read" : "write",
269 				 a.io_time[i], LRU_TIME_MAX);
270 
271 	unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) >
272 		offsetof(struct bch_alloc_v4, stripe_sectors)
273 		? a.stripe_sectors
274 		: 0;
275 
276 	switch (a.data_type) {
277 	case BCH_DATA_free:
278 	case BCH_DATA_need_gc_gens:
279 	case BCH_DATA_need_discard:
280 		bkey_fsck_err_on(stripe_sectors ||
281 				 a.dirty_sectors ||
282 				 a.cached_sectors ||
283 				 a.stripe,
284 				 c, alloc_key_empty_but_have_data,
285 				 "empty data type free but have data %u.%u.%u %u",
286 				 stripe_sectors,
287 				 a.dirty_sectors,
288 				 a.cached_sectors,
289 				 a.stripe);
290 		break;
291 	case BCH_DATA_sb:
292 	case BCH_DATA_journal:
293 	case BCH_DATA_btree:
294 	case BCH_DATA_user:
295 	case BCH_DATA_parity:
296 		bkey_fsck_err_on(!a.dirty_sectors &&
297 				 !stripe_sectors,
298 				 c, alloc_key_dirty_sectors_0,
299 				 "data_type %s but dirty_sectors==0",
300 				 bch2_data_type_str(a.data_type));
301 		break;
302 	case BCH_DATA_cached:
303 		bkey_fsck_err_on(!a.cached_sectors ||
304 				 a.dirty_sectors ||
305 				 stripe_sectors ||
306 				 a.stripe,
307 				 c, alloc_key_cached_inconsistency,
308 				 "data type inconsistency");
309 
310 		bkey_fsck_err_on(!a.io_time[READ] &&
311 				 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
312 				 c, alloc_key_cached_but_read_time_zero,
313 				 "cached bucket with read_time == 0");
314 		break;
315 	case BCH_DATA_stripe:
316 		break;
317 	}
318 fsck_err:
319 	return ret;
320 }
321 
322 void bch2_alloc_v4_swab(struct bkey_s k)
323 {
324 	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
325 	struct bch_backpointer *bp, *bps;
326 
327 	a->journal_seq		= swab64(a->journal_seq);
328 	a->flags		= swab32(a->flags);
329 	a->dirty_sectors	= swab32(a->dirty_sectors);
330 	a->cached_sectors	= swab32(a->cached_sectors);
331 	a->io_time[0]		= swab64(a->io_time[0]);
332 	a->io_time[1]		= swab64(a->io_time[1]);
333 	a->stripe		= swab32(a->stripe);
334 	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
335 	a->fragmentation_lru	= swab64(a->fragmentation_lru);
336 	a->stripe_sectors	= swab32(a->stripe_sectors);
337 
338 	bps = alloc_v4_backpointers(a);
339 	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
340 		bp->bucket_offset	= swab40(bp->bucket_offset);
341 		bp->bucket_len		= swab32(bp->bucket_len);
342 		bch2_bpos_swab(&bp->pos);
343 	}
344 }
345 
346 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
347 {
348 	struct bch_alloc_v4 _a;
349 	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
350 
351 	prt_newline(out);
352 	printbuf_indent_add(out, 2);
353 
354 	prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
355 	bch2_prt_data_type(out, a->data_type);
356 	prt_newline(out);
357 	prt_printf(out, "journal_seq       %llu\n",	a->journal_seq);
358 	prt_printf(out, "need_discard      %llu\n",	BCH_ALLOC_V4_NEED_DISCARD(a));
359 	prt_printf(out, "need_inc_gen      %llu\n",	BCH_ALLOC_V4_NEED_INC_GEN(a));
360 	prt_printf(out, "dirty_sectors     %u\n",	a->dirty_sectors);
361 	prt_printf(out, "stripe_sectors    %u\n",	a->stripe_sectors);
362 	prt_printf(out, "cached_sectors    %u\n",	a->cached_sectors);
363 	prt_printf(out, "stripe            %u\n",	a->stripe);
364 	prt_printf(out, "stripe_redundancy %u\n",	a->stripe_redundancy);
365 	prt_printf(out, "io_time[READ]     %llu\n",	a->io_time[READ]);
366 	prt_printf(out, "io_time[WRITE]    %llu\n",	a->io_time[WRITE]);
367 	prt_printf(out, "fragmentation     %llu\n",	a->fragmentation_lru);
368 	prt_printf(out, "bp_start          %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
369 	printbuf_indent_sub(out, 2);
370 }
371 
372 void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
373 {
374 	if (k.k->type == KEY_TYPE_alloc_v4) {
375 		void *src, *dst;
376 
377 		*out = *bkey_s_c_to_alloc_v4(k).v;
378 
379 		src = alloc_v4_backpointers(out);
380 		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
381 		dst = alloc_v4_backpointers(out);
382 
383 		if (src < dst)
384 			memset(src, 0, dst - src);
385 
386 		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
387 	} else {
388 		struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
389 
390 		*out = (struct bch_alloc_v4) {
391 			.journal_seq		= u.journal_seq,
392 			.flags			= u.need_discard,
393 			.gen			= u.gen,
394 			.oldest_gen		= u.oldest_gen,
395 			.data_type		= u.data_type,
396 			.stripe_redundancy	= u.stripe_redundancy,
397 			.dirty_sectors		= u.dirty_sectors,
398 			.cached_sectors		= u.cached_sectors,
399 			.io_time[READ]		= u.read_time,
400 			.io_time[WRITE]		= u.write_time,
401 			.stripe			= u.stripe,
402 		};
403 
404 		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
405 	}
406 }
407 
408 static noinline struct bkey_i_alloc_v4 *
409 __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
410 {
411 	struct bkey_i_alloc_v4 *ret;
412 
413 	ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
414 	if (IS_ERR(ret))
415 		return ret;
416 
417 	if (k.k->type == KEY_TYPE_alloc_v4) {
418 		void *src, *dst;
419 
420 		bkey_reassemble(&ret->k_i, k);
421 
422 		src = alloc_v4_backpointers(&ret->v);
423 		SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
424 		dst = alloc_v4_backpointers(&ret->v);
425 
426 		if (src < dst)
427 			memset(src, 0, dst - src);
428 
429 		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
430 		set_alloc_v4_u64s(ret);
431 	} else {
432 		bkey_alloc_v4_init(&ret->k_i);
433 		ret->k.p = k.k->p;
434 		bch2_alloc_to_v4(k, &ret->v);
435 	}
436 	return ret;
437 }
438 
439 static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
440 {
441 	struct bkey_s_c_alloc_v4 a;
442 
443 	if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
444 	    ((a = bkey_s_c_to_alloc_v4(k), true) &&
445 	     BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
446 		return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);
447 
448 	return __bch2_alloc_to_v4_mut(trans, k);
449 }
450 
451 struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
452 {
453 	return bch2_alloc_to_v4_mut_inlined(trans, k);
454 }
455 
456 struct bkey_i_alloc_v4 *
457 bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
458 				       struct bpos pos)
459 {
460 	struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
461 					       BTREE_ITER_with_updates|
462 					       BTREE_ITER_cached|
463 					       BTREE_ITER_intent);
464 	int ret = bkey_err(k);
465 	if (unlikely(ret))
466 		return ERR_PTR(ret);
467 
468 	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
469 	ret = PTR_ERR_OR_ZERO(a);
470 	if (unlikely(ret))
471 		goto err;
472 	return a;
473 err:
474 	bch2_trans_iter_exit(trans, iter);
475 	return ERR_PTR(ret);
476 }
477 
478 __flatten
479 struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos,
480 						      enum btree_iter_update_trigger_flags flags)
481 {
482 	struct btree_iter iter;
483 	struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
484 	int ret = PTR_ERR_OR_ZERO(a);
485 	if (ret)
486 		return ERR_PTR(ret);
487 
488 	ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
489 	bch2_trans_iter_exit(trans, &iter);
490 	return unlikely(ret) ? ERR_PTR(ret) : a;
491 }
492 
493 static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
494 {
495 	*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;
496 
497 	pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
498 	return pos;
499 }
500 
501 static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
502 {
503 	pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
504 	pos.offset += offset;
505 	return pos;
506 }
507 
508 static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
509 {
510 	return k.k->type == KEY_TYPE_bucket_gens
511 		? bkey_s_c_to_bucket_gens(k).v->gens[offset]
512 		: 0;
513 }
514 
515 int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
516 			     enum bch_validate_flags flags)
517 {
518 	int ret = 0;
519 
520 	bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens),
521 			 c, bucket_gens_val_size_bad,
522 			 "bad val size (%zu != %zu)",
523 			 bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
524 fsck_err:
525 	return ret;
526 }
527 
528 void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
529 {
530 	struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
531 	unsigned i;
532 
533 	for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
534 		if (i)
535 			prt_char(out, ' ');
536 		prt_printf(out, "%u", g.v->gens[i]);
537 	}
538 }
539 
540 int bch2_bucket_gens_init(struct bch_fs *c)
541 {
542 	struct btree_trans *trans = bch2_trans_get(c);
543 	struct bkey_i_bucket_gens g;
544 	bool have_bucket_gens_key = false;
545 	int ret;
546 
547 	ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
548 				 BTREE_ITER_prefetch, k, ({
549 		/*
550 		 * Not a fsck error because this is checked/repaired by
551 		 * bch2_check_alloc_key() which runs later:
552 		 */
553 		if (!bch2_dev_bucket_exists(c, k.k->p))
554 			continue;
555 
556 		struct bch_alloc_v4 a;
557 		u8 gen = bch2_alloc_to_v4(k, &a)->gen;
558 		unsigned offset;
559 		struct bpos pos = alloc_gens_pos(iter.pos, &offset);
560 		int ret2 = 0;
561 
562 		if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) {
563 			ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
564 				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
565 			if (ret2)
566 				goto iter_err;
567 			have_bucket_gens_key = false;
568 		}
569 
570 		if (!have_bucket_gens_key) {
571 			bkey_bucket_gens_init(&g.k_i);
572 			g.k.p = pos;
573 			have_bucket_gens_key = true;
574 		}
575 
576 		g.v.gens[offset] = gen;
577 iter_err:
578 		ret2;
579 	}));
580 
581 	if (have_bucket_gens_key && !ret)
582 		ret = commit_do(trans, NULL, NULL,
583 				BCH_TRANS_COMMIT_no_enospc,
584 			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
585 
586 	bch2_trans_put(trans);
587 
588 	bch_err_fn(c, ret);
589 	return ret;
590 }
591 
592 int bch2_alloc_read(struct bch_fs *c)
593 {
594 	struct btree_trans *trans = bch2_trans_get(c);
595 	struct bch_dev *ca = NULL;
596 	int ret;
597 
598 	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
599 		ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
600 					 BTREE_ITER_prefetch, k, ({
601 			u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
602 			u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
603 
604 			if (k.k->type != KEY_TYPE_bucket_gens)
605 				continue;
606 
607 			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
608 			/*
609 			 * Not a fsck error because this is checked/repaired by
610 			 * bch2_check_alloc_key() which runs later:
611 			 */
612 			if (!ca) {
613 				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
614 				continue;
615 			}
616 
617 			const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
618 
619 			for (u64 b = max_t(u64, ca->mi.first_bucket, start);
620 			     b < min_t(u64, ca->mi.nbuckets, end);
621 			     b++)
622 				*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
623 			0;
624 		}));
625 	} else {
626 		ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
627 					 BTREE_ITER_prefetch, k, ({
628 			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
629 			/*
630 			 * Not a fsck error because this is checked/repaired by
631 			 * bch2_check_alloc_key() which runs later:
632 			 */
633 			if (!ca) {
634 				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
635 				continue;
636 			}
637 
638 			struct bch_alloc_v4 a;
639 			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
640 			0;
641 		}));
642 	}
643 
644 	bch2_dev_put(ca);
645 	bch2_trans_put(trans);
646 
647 	bch_err_fn(c, ret);
648 	return ret;
649 }
650 
651 /* Free space/discard btree: */
652 
653 static int bch2_bucket_do_index(struct btree_trans *trans,
654 				struct bch_dev *ca,
655 				struct bkey_s_c alloc_k,
656 				const struct bch_alloc_v4 *a,
657 				bool set)
658 {
659 	struct bch_fs *c = trans->c;
660 	struct btree_iter iter;
661 	struct bkey_s_c old;
662 	struct bkey_i *k;
663 	enum btree_id btree;
664 	enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
665 	enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
666 	struct printbuf buf = PRINTBUF;
667 	int ret;
668 
669 	if (a->data_type != BCH_DATA_free &&
670 	    a->data_type != BCH_DATA_need_discard)
671 		return 0;
672 
673 	k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
674 	if (IS_ERR(k))
675 		return PTR_ERR(k);
676 
677 	bkey_init(&k->k);
678 	k->k.type = new_type;
679 
680 	switch (a->data_type) {
681 	case BCH_DATA_free:
682 		btree = BTREE_ID_freespace;
683 		k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
684 		bch2_key_resize(&k->k, 1);
685 		break;
686 	case BCH_DATA_need_discard:
687 		btree = BTREE_ID_need_discard;
688 		k->k.p = alloc_k.k->p;
689 		break;
690 	default:
691 		return 0;
692 	}
693 
694 	old = bch2_bkey_get_iter(trans, &iter, btree,
695 			     bkey_start_pos(&k->k),
696 			     BTREE_ITER_intent);
697 	ret = bkey_err(old);
698 	if (ret)
699 		return ret;
700 
701 	if (ca->mi.freespace_initialized &&
702 	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
703 	    bch2_trans_inconsistent_on(old.k->type != old_type, trans,
704 			"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
705 			"  for %s",
706 			set ? "setting" : "clearing",
707 			bch2_btree_id_str(btree),
708 			iter.pos.inode,
709 			iter.pos.offset,
710 			bch2_bkey_types[old.k->type],
711 			bch2_bkey_types[old_type],
712 			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
713 		ret = -EIO;
714 		goto err;
715 	}
716 
717 	ret = bch2_trans_update(trans, &iter, k, 0);
718 err:
719 	bch2_trans_iter_exit(trans, &iter);
720 	printbuf_exit(&buf);
721 	return ret;
722 }
723 
724 static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
725 					   struct bpos bucket, u8 gen)
726 {
727 	struct btree_iter iter;
728 	unsigned offset;
729 	struct bpos pos = alloc_gens_pos(bucket, &offset);
730 	struct bkey_i_bucket_gens *g;
731 	struct bkey_s_c k;
732 	int ret;
733 
734 	g = bch2_trans_kmalloc(trans, sizeof(*g));
735 	ret = PTR_ERR_OR_ZERO(g);
736 	if (ret)
737 		return ret;
738 
739 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
740 			       BTREE_ITER_intent|
741 			       BTREE_ITER_with_updates);
742 	ret = bkey_err(k);
743 	if (ret)
744 		return ret;
745 
746 	if (k.k->type != KEY_TYPE_bucket_gens) {
747 		bkey_bucket_gens_init(&g->k_i);
748 		g->k.p = iter.pos;
749 	} else {
750 		bkey_reassemble(&g->k_i, k);
751 	}
752 
753 	g->v.gens[offset] = gen;
754 
755 	ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
756 	bch2_trans_iter_exit(trans, &iter);
757 	return ret;
758 }
759 
760 static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca,
761 						    enum bch_data_type data_type,
762 						    s64 delta_buckets,
763 						    s64 delta_sectors,
764 						    s64 delta_fragmented, unsigned flags)
765 {
766 	struct disk_accounting_pos acc = {
767 		.type = BCH_DISK_ACCOUNTING_dev_data_type,
768 		.dev_data_type.dev		= ca->dev_idx,
769 		.dev_data_type.data_type	= data_type,
770 	};
771 	s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
772 
773 	return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
774 }
775 
776 int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
777 				   const struct bch_alloc_v4 *old,
778 				   const struct bch_alloc_v4 *new,
779 				   unsigned flags)
780 {
781 	s64 old_sectors = bch2_bucket_sectors(*old);
782 	s64 new_sectors = bch2_bucket_sectors(*new);
783 	if (old->data_type != new->data_type) {
784 		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
785 				 1,  new_sectors,  bch2_bucket_sectors_fragmented(ca, *new), flags) ?:
786 			  bch2_dev_data_type_accounting_mod(trans, ca, old->data_type,
787 				-1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags);
788 		if (ret)
789 			return ret;
790 	} else if (old_sectors != new_sectors) {
791 		int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type,
792 					 0,
793 					 new_sectors - old_sectors,
794 					 bch2_bucket_sectors_fragmented(ca, *new) -
795 					 bch2_bucket_sectors_fragmented(ca, *old), flags);
796 		if (ret)
797 			return ret;
798 	}
799 
800 	s64 old_unstriped = bch2_bucket_sectors_unstriped(*old);
801 	s64 new_unstriped = bch2_bucket_sectors_unstriped(*new);
802 	if (old_unstriped != new_unstriped) {
803 		int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped,
804 					 !!new_unstriped - !!old_unstriped,
805 					 new_unstriped - old_unstriped,
806 					 0,
807 					 flags);
808 		if (ret)
809 			return ret;
810 	}
811 
812 	return 0;
813 }
814 
815 int bch2_trigger_alloc(struct btree_trans *trans,
816 		       enum btree_id btree, unsigned level,
817 		       struct bkey_s_c old, struct bkey_s new,
818 		       enum btree_iter_update_trigger_flags flags)
819 {
820 	struct bch_fs *c = trans->c;
821 	struct printbuf buf = PRINTBUF;
822 	int ret = 0;
823 
824 	struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
825 	if (!ca)
826 		return -EIO;
827 
828 	struct bch_alloc_v4 old_a_convert;
829 	const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
830 
831 	struct bch_alloc_v4 *new_a;
832 	if (likely(new.k->type == KEY_TYPE_alloc_v4)) {
833 		new_a = bkey_s_to_alloc_v4(new).v;
834 	} else {
835 		BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair)));
836 
837 		struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c);
838 		ret = PTR_ERR_OR_ZERO(new_ka);
839 		if (unlikely(ret))
840 			goto err;
841 		new_a = &new_ka->v;
842 	}
843 
844 	if (flags & BTREE_TRIGGER_transactional) {
845 		alloc_data_type_set(new_a, new_a->data_type);
846 
847 		if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
848 			new_a->io_time[READ] = bch2_current_io_time(c, READ);
849 			new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
850 			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
851 			SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
852 		}
853 
854 		if (data_type_is_empty(new_a->data_type) &&
855 		    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
856 		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
857 			new_a->gen++;
858 			SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
859 			alloc_data_type_set(new_a, new_a->data_type);
860 		}
861 
862 		if (old_a->data_type != new_a->data_type ||
863 		    (new_a->data_type == BCH_DATA_free &&
864 		     alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
865 			ret =   bch2_bucket_do_index(trans, ca, old, old_a, false) ?:
866 				bch2_bucket_do_index(trans, ca, new.s_c, new_a, true);
867 			if (ret)
868 				goto err;
869 		}
870 
871 		if (new_a->data_type == BCH_DATA_cached &&
872 		    !new_a->io_time[READ])
873 			new_a->io_time[READ] = bch2_current_io_time(c, READ);
874 
875 		u64 old_lru = alloc_lru_idx_read(*old_a);
876 		u64 new_lru = alloc_lru_idx_read(*new_a);
877 		if (old_lru != new_lru) {
878 			ret = bch2_lru_change(trans, new.k->p.inode,
879 					      bucket_to_u64(new.k->p),
880 					      old_lru, new_lru);
881 			if (ret)
882 				goto err;
883 		}
884 
885 		new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
886 		if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
887 			ret = bch2_lru_change(trans,
888 					BCH_LRU_FRAGMENTATION_START,
889 					bucket_to_u64(new.k->p),
890 					old_a->fragmentation_lru, new_a->fragmentation_lru);
891 			if (ret)
892 				goto err;
893 		}
894 
895 		if (old_a->gen != new_a->gen) {
896 			ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
897 			if (ret)
898 				goto err;
899 		}
900 
901 		if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
902 		    old_a->cached_sectors) {
903 			ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
904 					 -((s64) old_a->cached_sectors),
905 					 flags & BTREE_TRIGGER_gc);
906 			if (ret)
907 				goto err;
908 		}
909 
910 		ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
911 		if (ret)
912 			goto err;
913 	}
914 
915 	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
916 		u64 journal_seq = trans->journal_res.seq;
917 		u64 bucket_journal_seq = new_a->journal_seq;
918 
919 		if ((flags & BTREE_TRIGGER_insert) &&
920 		    data_type_is_empty(old_a->data_type) !=
921 		    data_type_is_empty(new_a->data_type) &&
922 		    new.k->type == KEY_TYPE_alloc_v4) {
923 			struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
924 
925 			/*
926 			 * If the btree updates referring to a bucket weren't flushed
927 			 * before the bucket became empty again, then the we don't have
928 			 * to wait on a journal flush before we can reuse the bucket:
929 			 */
930 			v->journal_seq = bucket_journal_seq =
931 				data_type_is_empty(new_a->data_type) &&
932 				(journal_seq == v->journal_seq ||
933 				 bch2_journal_noflush_seq(&c->journal, v->journal_seq))
934 				? 0 : journal_seq;
935 		}
936 
937 		if (!data_type_is_empty(old_a->data_type) &&
938 		    data_type_is_empty(new_a->data_type) &&
939 		    bucket_journal_seq) {
940 			ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
941 					c->journal.flushed_seq_ondisk,
942 					new.k->p.inode, new.k->p.offset,
943 					bucket_journal_seq);
944 			if (bch2_fs_fatal_err_on(ret, c,
945 					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
946 				goto err;
947 		}
948 
949 		if (new_a->gen != old_a->gen) {
950 			rcu_read_lock();
951 			u8 *gen = bucket_gen(ca, new.k->p.offset);
952 			if (unlikely(!gen)) {
953 				rcu_read_unlock();
954 				goto invalid_bucket;
955 			}
956 			*gen = new_a->gen;
957 			rcu_read_unlock();
958 		}
959 
960 #define eval_state(_a, expr)		({ const struct bch_alloc_v4 *a = _a; expr; })
961 #define statechange(expr)		!eval_state(old_a, expr) && eval_state(new_a, expr)
962 #define bucket_flushed(a)		(!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
963 
964 		if (statechange(a->data_type == BCH_DATA_free) &&
965 		    bucket_flushed(new_a))
966 			closure_wake_up(&c->freelist_wait);
967 
968 		if (statechange(a->data_type == BCH_DATA_need_discard) &&
969 		    !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) &&
970 		    bucket_flushed(new_a))
971 			bch2_discard_one_bucket_fast(ca, new.k->p.offset);
972 
973 		if (statechange(a->data_type == BCH_DATA_cached) &&
974 		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
975 		    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
976 			bch2_dev_do_invalidates(ca);
977 
978 		if (statechange(a->data_type == BCH_DATA_need_gc_gens))
979 			bch2_gc_gens_async(c);
980 	}
981 
982 	if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
983 		rcu_read_lock();
984 		struct bucket *g = gc_bucket(ca, new.k->p.offset);
985 		if (unlikely(!g)) {
986 			rcu_read_unlock();
987 			goto invalid_bucket;
988 		}
989 		g->gen_valid	= 1;
990 		g->gen		= new_a->gen;
991 		rcu_read_unlock();
992 	}
993 err:
994 	printbuf_exit(&buf);
995 	bch2_dev_put(ca);
996 	return ret;
997 invalid_bucket:
998 	bch2_fs_inconsistent(c, "reference to invalid bucket\n  %s",
999 			     (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
1000 	ret = -EIO;
1001 	goto err;
1002 }
1003 
1004 /*
1005  * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
1006  * extents style btrees, but works on non-extents btrees:
1007  */
1008 static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
1009 {
1010 	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
1011 
1012 	if (bkey_err(k))
1013 		return k;
1014 
1015 	if (k.k->type) {
1016 		return k;
1017 	} else {
1018 		struct btree_iter iter2;
1019 		struct bpos next;
1020 
1021 		bch2_trans_copy_iter(&iter2, iter);
1022 
1023 		struct btree_path *path = btree_iter_path(iter->trans, iter);
1024 		if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
1025 			end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
1026 
1027 		end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
1028 
1029 		/*
1030 		 * btree node min/max is a closed interval, upto takes a half
1031 		 * open interval:
1032 		 */
1033 		k = bch2_btree_iter_peek_upto(&iter2, end);
1034 		next = iter2.pos;
1035 		bch2_trans_iter_exit(iter->trans, &iter2);
1036 
1037 		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
1038 
1039 		if (bkey_err(k))
1040 			return k;
1041 
1042 		bkey_init(hole);
1043 		hole->p = iter->pos;
1044 
1045 		bch2_key_resize(hole, next.offset - iter->pos.offset);
1046 		return (struct bkey_s_c) { hole, NULL };
1047 	}
1048 }
1049 
1050 static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
1051 {
1052 	if (*ca) {
1053 		if (bucket->offset < (*ca)->mi.first_bucket)
1054 			bucket->offset = (*ca)->mi.first_bucket;
1055 
1056 		if (bucket->offset < (*ca)->mi.nbuckets)
1057 			return true;
1058 
1059 		bch2_dev_put(*ca);
1060 		*ca = NULL;
1061 		bucket->inode++;
1062 		bucket->offset = 0;
1063 	}
1064 
1065 	rcu_read_lock();
1066 	*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
1067 	if (*ca) {
1068 		*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
1069 		bch2_dev_get(*ca);
1070 	}
1071 	rcu_read_unlock();
1072 
1073 	return *ca != NULL;
1074 }
1075 
1076 static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
1077 					struct bch_dev **ca, struct bkey *hole)
1078 {
1079 	struct bch_fs *c = iter->trans->c;
1080 	struct bkey_s_c k;
1081 again:
1082 	k = bch2_get_key_or_hole(iter, POS_MAX, hole);
1083 	if (bkey_err(k))
1084 		return k;
1085 
1086 	*ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);
1087 
1088 	if (!k.k->type) {
1089 		struct bpos hole_start = bkey_start_pos(k.k);
1090 
1091 		if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
1092 			if (!next_bucket(c, ca, &hole_start))
1093 				return bkey_s_c_null;
1094 
1095 			bch2_btree_iter_set_pos(iter, hole_start);
1096 			goto again;
1097 		}
1098 
1099 		if (k.k->p.offset > (*ca)->mi.nbuckets)
1100 			bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
1101 	}
1102 
1103 	return k;
1104 }
1105 
1106 static noinline_for_stack
1107 int bch2_check_alloc_key(struct btree_trans *trans,
1108 			 struct bkey_s_c alloc_k,
1109 			 struct btree_iter *alloc_iter,
1110 			 struct btree_iter *discard_iter,
1111 			 struct btree_iter *freespace_iter,
1112 			 struct btree_iter *bucket_gens_iter)
1113 {
1114 	struct bch_fs *c = trans->c;
1115 	struct bch_alloc_v4 a_convert;
1116 	const struct bch_alloc_v4 *a;
1117 	unsigned discard_key_type, freespace_key_type;
1118 	unsigned gens_offset;
1119 	struct bkey_s_c k;
1120 	struct printbuf buf = PRINTBUF;
1121 	int ret = 0;
1122 
1123 	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
1124 	if (fsck_err_on(!ca,
1125 			trans, alloc_key_to_missing_dev_bucket,
1126 			"alloc key for invalid device:bucket %llu:%llu",
1127 			alloc_k.k->p.inode, alloc_k.k->p.offset))
1128 		ret = bch2_btree_delete_at(trans, alloc_iter, 0);
1129 	if (!ca)
1130 		return ret;
1131 
1132 	if (!ca->mi.freespace_initialized)
1133 		goto out;
1134 
1135 	a = bch2_alloc_to_v4(alloc_k, &a_convert);
1136 
1137 	discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
1138 	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
1139 	k = bch2_btree_iter_peek_slot(discard_iter);
1140 	ret = bkey_err(k);
1141 	if (ret)
1142 		goto err;
1143 
1144 	if (fsck_err_on(k.k->type != discard_key_type,
1145 			trans, need_discard_key_wrong,
1146 			"incorrect key in need_discard btree (got %s should be %s)\n"
1147 			"  %s",
1148 			bch2_bkey_types[k.k->type],
1149 			bch2_bkey_types[discard_key_type],
1150 			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1151 		struct bkey_i *update =
1152 			bch2_trans_kmalloc(trans, sizeof(*update));
1153 
1154 		ret = PTR_ERR_OR_ZERO(update);
1155 		if (ret)
1156 			goto err;
1157 
1158 		bkey_init(&update->k);
1159 		update->k.type	= discard_key_type;
1160 		update->k.p	= discard_iter->pos;
1161 
1162 		ret = bch2_trans_update(trans, discard_iter, update, 0);
1163 		if (ret)
1164 			goto err;
1165 	}
1166 
1167 	freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
1168 	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
1169 	k = bch2_btree_iter_peek_slot(freespace_iter);
1170 	ret = bkey_err(k);
1171 	if (ret)
1172 		goto err;
1173 
1174 	if (fsck_err_on(k.k->type != freespace_key_type,
1175 			trans, freespace_key_wrong,
1176 			"incorrect key in freespace btree (got %s should be %s)\n"
1177 			"  %s",
1178 			bch2_bkey_types[k.k->type],
1179 			bch2_bkey_types[freespace_key_type],
1180 			(printbuf_reset(&buf),
1181 			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1182 		struct bkey_i *update =
1183 			bch2_trans_kmalloc(trans, sizeof(*update));
1184 
1185 		ret = PTR_ERR_OR_ZERO(update);
1186 		if (ret)
1187 			goto err;
1188 
1189 		bkey_init(&update->k);
1190 		update->k.type	= freespace_key_type;
1191 		update->k.p	= freespace_iter->pos;
1192 		bch2_key_resize(&update->k, 1);
1193 
1194 		ret = bch2_trans_update(trans, freespace_iter, update, 0);
1195 		if (ret)
1196 			goto err;
1197 	}
1198 
1199 	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
1200 	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
1201 	ret = bkey_err(k);
1202 	if (ret)
1203 		goto err;
1204 
1205 	if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
1206 			trans, bucket_gens_key_wrong,
1207 			"incorrect gen in bucket_gens btree (got %u should be %u)\n"
1208 			"  %s",
1209 			alloc_gen(k, gens_offset), a->gen,
1210 			(printbuf_reset(&buf),
1211 			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1212 		struct bkey_i_bucket_gens *g =
1213 			bch2_trans_kmalloc(trans, sizeof(*g));
1214 
1215 		ret = PTR_ERR_OR_ZERO(g);
1216 		if (ret)
1217 			goto err;
1218 
1219 		if (k.k->type == KEY_TYPE_bucket_gens) {
1220 			bkey_reassemble(&g->k_i, k);
1221 		} else {
1222 			bkey_bucket_gens_init(&g->k_i);
1223 			g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
1224 		}
1225 
1226 		g->v.gens[gens_offset] = a->gen;
1227 
1228 		ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
1229 		if (ret)
1230 			goto err;
1231 	}
1232 out:
1233 err:
1234 fsck_err:
1235 	bch2_dev_put(ca);
1236 	printbuf_exit(&buf);
1237 	return ret;
1238 }
1239 
1240 static noinline_for_stack
1241 int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
1242 				    struct bch_dev *ca,
1243 				    struct bpos start,
1244 				    struct bpos *end,
1245 				    struct btree_iter *freespace_iter)
1246 {
1247 	struct bkey_s_c k;
1248 	struct printbuf buf = PRINTBUF;
1249 	int ret;
1250 
1251 	if (!ca->mi.freespace_initialized)
1252 		return 0;
1253 
1254 	bch2_btree_iter_set_pos(freespace_iter, start);
1255 
1256 	k = bch2_btree_iter_peek_slot(freespace_iter);
1257 	ret = bkey_err(k);
1258 	if (ret)
1259 		goto err;
1260 
1261 	*end = bkey_min(k.k->p, *end);
1262 
1263 	if (fsck_err_on(k.k->type != KEY_TYPE_set,
1264 			trans, freespace_hole_missing,
1265 			"hole in alloc btree missing in freespace btree\n"
1266 			"  device %llu buckets %llu-%llu",
1267 			freespace_iter->pos.inode,
1268 			freespace_iter->pos.offset,
1269 			end->offset)) {
1270 		struct bkey_i *update =
1271 			bch2_trans_kmalloc(trans, sizeof(*update));
1272 
1273 		ret = PTR_ERR_OR_ZERO(update);
1274 		if (ret)
1275 			goto err;
1276 
1277 		bkey_init(&update->k);
1278 		update->k.type	= KEY_TYPE_set;
1279 		update->k.p	= freespace_iter->pos;
1280 		bch2_key_resize(&update->k,
1281 				min_t(u64, U32_MAX, end->offset -
1282 				      freespace_iter->pos.offset));
1283 
1284 		ret = bch2_trans_update(trans, freespace_iter, update, 0);
1285 		if (ret)
1286 			goto err;
1287 	}
1288 err:
1289 fsck_err:
1290 	printbuf_exit(&buf);
1291 	return ret;
1292 }
1293 
1294 static noinline_for_stack
1295 int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
1296 				      struct bpos start,
1297 				      struct bpos *end,
1298 				      struct btree_iter *bucket_gens_iter)
1299 {
1300 	struct bkey_s_c k;
1301 	struct printbuf buf = PRINTBUF;
1302 	unsigned i, gens_offset, gens_end_offset;
1303 	int ret;
1304 
1305 	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
1306 
1307 	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
1308 	ret = bkey_err(k);
1309 	if (ret)
1310 		goto err;
1311 
1312 	if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
1313 		     alloc_gens_pos(*end,  &gens_end_offset)))
1314 		gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;
1315 
1316 	if (k.k->type == KEY_TYPE_bucket_gens) {
1317 		struct bkey_i_bucket_gens g;
1318 		bool need_update = false;
1319 
1320 		bkey_reassemble(&g.k_i, k);
1321 
1322 		for (i = gens_offset; i < gens_end_offset; i++) {
1323 			if (fsck_err_on(g.v.gens[i], trans,
1324 					bucket_gens_hole_wrong,
1325 					"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
1326 					bucket_gens_pos_to_alloc(k.k->p, i).inode,
1327 					bucket_gens_pos_to_alloc(k.k->p, i).offset,
1328 					g.v.gens[i])) {
1329 				g.v.gens[i] = 0;
1330 				need_update = true;
1331 			}
1332 		}
1333 
1334 		if (need_update) {
1335 			struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1336 
1337 			ret = PTR_ERR_OR_ZERO(u);
1338 			if (ret)
1339 				goto err;
1340 
1341 			memcpy(u, &g, sizeof(g));
1342 
1343 			ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
1344 			if (ret)
1345 				goto err;
1346 		}
1347 	}
1348 
1349 	*end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
1350 err:
1351 fsck_err:
1352 	printbuf_exit(&buf);
1353 	return ret;
1354 }
1355 
1356 static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
1357 					      struct btree_iter *iter)
1358 {
1359 	struct bch_fs *c = trans->c;
1360 	struct btree_iter alloc_iter;
1361 	struct bkey_s_c alloc_k;
1362 	struct bch_alloc_v4 a_convert;
1363 	const struct bch_alloc_v4 *a;
1364 	u64 genbits;
1365 	struct bpos pos;
1366 	enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
1367 		? BCH_DATA_need_discard
1368 		: BCH_DATA_free;
1369 	struct printbuf buf = PRINTBUF;
1370 	int ret;
1371 
1372 	pos = iter->pos;
1373 	pos.offset &= ~(~0ULL << 56);
1374 	genbits = iter->pos.offset & (~0ULL << 56);
1375 
1376 	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
1377 	ret = bkey_err(alloc_k);
1378 	if (ret)
1379 		return ret;
1380 
1381 	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos),
1382 			trans, need_discard_freespace_key_to_invalid_dev_bucket,
1383 			"entry in %s btree for nonexistant dev:bucket %llu:%llu",
1384 			bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
1385 		goto delete;
1386 
1387 	a = bch2_alloc_to_v4(alloc_k, &a_convert);
1388 
1389 	if (fsck_err_on(a->data_type != state ||
1390 			(state == BCH_DATA_free &&
1391 			 genbits != alloc_freespace_genbits(*a)),
1392 			trans, need_discard_freespace_key_bad,
1393 			"%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
1394 			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
1395 			bch2_btree_id_str(iter->btree_id),
1396 			iter->pos.inode,
1397 			iter->pos.offset,
1398 			a->data_type == state,
1399 			genbits >> 56, alloc_freespace_genbits(*a) >> 56))
1400 		goto delete;
1401 out:
1402 fsck_err:
1403 	bch2_set_btree_iter_dontneed(&alloc_iter);
1404 	bch2_trans_iter_exit(trans, &alloc_iter);
1405 	printbuf_exit(&buf);
1406 	return ret;
1407 delete:
1408 	ret =   bch2_btree_delete_extent_at(trans, iter,
1409 			iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
1410 		bch2_trans_commit(trans, NULL, NULL,
1411 			BCH_TRANS_COMMIT_no_enospc);
1412 	goto out;
1413 }
1414 
1415 /*
1416  * We've already checked that generation numbers in the bucket_gens btree are
1417  * valid for buckets that exist; this just checks for keys for nonexistent
1418  * buckets.
1419  */
1420 static noinline_for_stack
1421 int bch2_check_bucket_gens_key(struct btree_trans *trans,
1422 			       struct btree_iter *iter,
1423 			       struct bkey_s_c k)
1424 {
1425 	struct bch_fs *c = trans->c;
1426 	struct bkey_i_bucket_gens g;
1427 	u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
1428 	u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
1429 	u64 b;
1430 	bool need_update = false;
1431 	struct printbuf buf = PRINTBUF;
1432 	int ret = 0;
1433 
1434 	BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
1435 	bkey_reassemble(&g.k_i, k);
1436 
1437 	struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
1438 	if (!ca) {
1439 		if (fsck_err(trans, bucket_gens_to_invalid_dev,
1440 			     "bucket_gens key for invalid device:\n  %s",
1441 			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1442 			ret = bch2_btree_delete_at(trans, iter, 0);
1443 		goto out;
1444 	}
1445 
1446 	if (fsck_err_on(end <= ca->mi.first_bucket ||
1447 			start >= ca->mi.nbuckets,
1448 			trans, bucket_gens_to_invalid_buckets,
1449 			"bucket_gens key for invalid buckets:\n  %s",
1450 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1451 		ret = bch2_btree_delete_at(trans, iter, 0);
1452 		goto out;
1453 	}
1454 
1455 	for (b = start; b < ca->mi.first_bucket; b++)
1456 		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
1457 				trans, bucket_gens_nonzero_for_invalid_buckets,
1458 				"bucket_gens key has nonzero gen for invalid bucket")) {
1459 			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1460 			need_update = true;
1461 		}
1462 
1463 	for (b = ca->mi.nbuckets; b < end; b++)
1464 		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK],
1465 				trans, bucket_gens_nonzero_for_invalid_buckets,
1466 				"bucket_gens key has nonzero gen for invalid bucket")) {
1467 			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
1468 			need_update = true;
1469 		}
1470 
1471 	if (need_update) {
1472 		struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
1473 
1474 		ret = PTR_ERR_OR_ZERO(u);
1475 		if (ret)
1476 			goto out;
1477 
1478 		memcpy(u, &g, sizeof(g));
1479 		ret = bch2_trans_update(trans, iter, u, 0);
1480 	}
1481 out:
1482 fsck_err:
1483 	bch2_dev_put(ca);
1484 	printbuf_exit(&buf);
1485 	return ret;
1486 }
1487 
1488 int bch2_check_alloc_info(struct bch_fs *c)
1489 {
1490 	struct btree_trans *trans = bch2_trans_get(c);
1491 	struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
1492 	struct bch_dev *ca = NULL;
1493 	struct bkey hole;
1494 	struct bkey_s_c k;
1495 	int ret = 0;
1496 
1497 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
1498 			     BTREE_ITER_prefetch);
1499 	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
1500 			     BTREE_ITER_prefetch);
1501 	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
1502 			     BTREE_ITER_prefetch);
1503 	bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
1504 			     BTREE_ITER_prefetch);
1505 
1506 	while (1) {
1507 		struct bpos next;
1508 
1509 		bch2_trans_begin(trans);
1510 
1511 		k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
1512 		ret = bkey_err(k);
1513 		if (ret)
1514 			goto bkey_err;
1515 
1516 		if (!k.k)
1517 			break;
1518 
1519 		if (k.k->type) {
1520 			next = bpos_nosnap_successor(k.k->p);
1521 
1522 			ret = bch2_check_alloc_key(trans,
1523 						   k, &iter,
1524 						   &discard_iter,
1525 						   &freespace_iter,
1526 						   &bucket_gens_iter);
1527 			if (ret)
1528 				goto bkey_err;
1529 		} else {
1530 			next = k.k->p;
1531 
1532 			ret = bch2_check_alloc_hole_freespace(trans, ca,
1533 						    bkey_start_pos(k.k),
1534 						    &next,
1535 						    &freespace_iter) ?:
1536 				bch2_check_alloc_hole_bucket_gens(trans,
1537 						    bkey_start_pos(k.k),
1538 						    &next,
1539 						    &bucket_gens_iter);
1540 			if (ret)
1541 				goto bkey_err;
1542 		}
1543 
1544 		ret = bch2_trans_commit(trans, NULL, NULL,
1545 					BCH_TRANS_COMMIT_no_enospc);
1546 		if (ret)
1547 			goto bkey_err;
1548 
1549 		bch2_btree_iter_set_pos(&iter, next);
1550 bkey_err:
1551 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1552 			continue;
1553 		if (ret)
1554 			break;
1555 	}
1556 	bch2_trans_iter_exit(trans, &bucket_gens_iter);
1557 	bch2_trans_iter_exit(trans, &freespace_iter);
1558 	bch2_trans_iter_exit(trans, &discard_iter);
1559 	bch2_trans_iter_exit(trans, &iter);
1560 	bch2_dev_put(ca);
1561 	ca = NULL;
1562 
1563 	if (ret < 0)
1564 		goto err;
1565 
1566 	ret = for_each_btree_key(trans, iter,
1567 			BTREE_ID_need_discard, POS_MIN,
1568 			BTREE_ITER_prefetch, k,
1569 		bch2_check_discard_freespace_key(trans, &iter));
1570 	if (ret)
1571 		goto err;
1572 
1573 	bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
1574 			     BTREE_ITER_prefetch);
1575 	while (1) {
1576 		bch2_trans_begin(trans);
1577 		k = bch2_btree_iter_peek(&iter);
1578 		if (!k.k)
1579 			break;
1580 
1581 		ret = bkey_err(k) ?:
1582 			bch2_check_discard_freespace_key(trans, &iter);
1583 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1584 			ret = 0;
1585 			continue;
1586 		}
1587 		if (ret) {
1588 			struct printbuf buf = PRINTBUF;
1589 			bch2_bkey_val_to_text(&buf, c, k);
1590 
1591 			bch_err(c, "while checking %s", buf.buf);
1592 			printbuf_exit(&buf);
1593 			break;
1594 		}
1595 
1596 		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
1597 	}
1598 	bch2_trans_iter_exit(trans, &iter);
1599 	if (ret)
1600 		goto err;
1601 
1602 	ret = for_each_btree_key_commit(trans, iter,
1603 			BTREE_ID_bucket_gens, POS_MIN,
1604 			BTREE_ITER_prefetch, k,
1605 			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1606 		bch2_check_bucket_gens_key(trans, &iter, k));
1607 err:
1608 	bch2_trans_put(trans);
1609 	bch_err_fn(c, ret);
1610 	return ret;
1611 }
1612 
1613 static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
1614 				       struct btree_iter *alloc_iter,
1615 				       struct bkey_buf *last_flushed)
1616 {
1617 	struct bch_fs *c = trans->c;
1618 	struct bch_alloc_v4 a_convert;
1619 	const struct bch_alloc_v4 *a;
1620 	struct bkey_s_c alloc_k;
1621 	struct printbuf buf = PRINTBUF;
1622 	int ret;
1623 
1624 	alloc_k = bch2_btree_iter_peek(alloc_iter);
1625 	if (!alloc_k.k)
1626 		return 0;
1627 
1628 	ret = bkey_err(alloc_k);
1629 	if (ret)
1630 		return ret;
1631 
1632 	a = bch2_alloc_to_v4(alloc_k, &a_convert);
1633 
1634 	if (a->fragmentation_lru) {
1635 		ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
1636 					 a->fragmentation_lru,
1637 					 alloc_k, last_flushed);
1638 		if (ret)
1639 			return ret;
1640 	}
1641 
1642 	if (a->data_type != BCH_DATA_cached)
1643 		return 0;
1644 
1645 	if (fsck_err_on(!a->io_time[READ],
1646 			trans, alloc_key_cached_but_read_time_zero,
1647 			"cached bucket with read_time 0\n"
1648 			"  %s",
1649 		(printbuf_reset(&buf),
1650 		 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
1651 		struct bkey_i_alloc_v4 *a_mut =
1652 			bch2_alloc_to_v4_mut(trans, alloc_k);
1653 		ret = PTR_ERR_OR_ZERO(a_mut);
1654 		if (ret)
1655 			goto err;
1656 
1657 		a_mut->v.io_time[READ] = bch2_current_io_time(c, READ);
1658 		ret = bch2_trans_update(trans, alloc_iter,
1659 					&a_mut->k_i, BTREE_TRIGGER_norun);
1660 		if (ret)
1661 			goto err;
1662 
1663 		a = &a_mut->v;
1664 	}
1665 
1666 	ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
1667 				 alloc_k, last_flushed);
1668 	if (ret)
1669 		goto err;
1670 err:
1671 fsck_err:
1672 	printbuf_exit(&buf);
1673 	return ret;
1674 }
1675 
1676 int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
1677 {
1678 	struct bkey_buf last_flushed;
1679 
1680 	bch2_bkey_buf_init(&last_flushed);
1681 	bkey_init(&last_flushed.k->k);
1682 
1683 	int ret = bch2_trans_run(c,
1684 		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
1685 				POS_MIN, BTREE_ITER_prefetch, k,
1686 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1687 			bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
1688 
1689 	bch2_bkey_buf_exit(&last_flushed, c);
1690 	bch_err_fn(c, ret);
1691 	return ret;
1692 }
1693 
1694 static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
1695 {
1696 	int ret;
1697 
1698 	mutex_lock(&ca->discard_buckets_in_flight_lock);
1699 	darray_for_each(ca->discard_buckets_in_flight, i)
1700 		if (i->bucket == bucket) {
1701 			ret = -BCH_ERR_EEXIST_discard_in_flight_add;
1702 			goto out;
1703 		}
1704 
1705 	ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
1706 			   .in_progress = in_progress,
1707 			   .bucket	= bucket,
1708 	}));
1709 out:
1710 	mutex_unlock(&ca->discard_buckets_in_flight_lock);
1711 	return ret;
1712 }
1713 
1714 static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
1715 {
1716 	mutex_lock(&ca->discard_buckets_in_flight_lock);
1717 	darray_for_each(ca->discard_buckets_in_flight, i)
1718 		if (i->bucket == bucket) {
1719 			BUG_ON(!i->in_progress);
1720 			darray_remove_item(&ca->discard_buckets_in_flight, i);
1721 			goto found;
1722 		}
1723 	BUG();
1724 found:
1725 	mutex_unlock(&ca->discard_buckets_in_flight_lock);
1726 }
1727 
1728 struct discard_buckets_state {
1729 	u64		seen;
1730 	u64		open;
1731 	u64		need_journal_commit;
1732 	u64		discarded;
1733 	u64		need_journal_commit_this_dev;
1734 };
1735 
1736 static int bch2_discard_one_bucket(struct btree_trans *trans,
1737 				   struct bch_dev *ca,
1738 				   struct btree_iter *need_discard_iter,
1739 				   struct bpos *discard_pos_done,
1740 				   struct discard_buckets_state *s)
1741 {
1742 	struct bch_fs *c = trans->c;
1743 	struct bpos pos = need_discard_iter->pos;
1744 	struct btree_iter iter = { NULL };
1745 	struct bkey_s_c k;
1746 	struct bkey_i_alloc_v4 *a;
1747 	struct printbuf buf = PRINTBUF;
1748 	bool discard_locked = false;
1749 	int ret = 0;
1750 
1751 	if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
1752 		s->open++;
1753 		goto out;
1754 	}
1755 
1756 	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
1757 			c->journal.flushed_seq_ondisk,
1758 			pos.inode, pos.offset)) {
1759 		s->need_journal_commit++;
1760 		s->need_journal_commit_this_dev++;
1761 		goto out;
1762 	}
1763 
1764 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
1765 			       need_discard_iter->pos,
1766 			       BTREE_ITER_cached);
1767 	ret = bkey_err(k);
1768 	if (ret)
1769 		goto out;
1770 
1771 	a = bch2_alloc_to_v4_mut(trans, k);
1772 	ret = PTR_ERR_OR_ZERO(a);
1773 	if (ret)
1774 		goto out;
1775 
1776 	if (bch2_bucket_sectors_total(a->v)) {
1777 		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1778 					       trans, "attempting to discard bucket with dirty data\n%s",
1779 					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1780 			ret = -EIO;
1781 		goto out;
1782 	}
1783 
1784 	if (a->v.data_type != BCH_DATA_need_discard) {
1785 		if (data_type_is_empty(a->v.data_type) &&
1786 		    BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
1787 			a->v.gen++;
1788 			SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
1789 			goto write;
1790 		}
1791 
1792 		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1793 					       trans, "bucket incorrectly set in need_discard btree\n"
1794 					       "%s",
1795 					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1796 			ret = -EIO;
1797 		goto out;
1798 	}
1799 
1800 	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
1801 		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
1802 					       trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
1803 					       a->v.journal_seq,
1804 					       c->journal.flushed_seq_ondisk,
1805 					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1806 			ret = -EIO;
1807 		goto out;
1808 	}
1809 
1810 	if (discard_in_flight_add(ca, iter.pos.offset, true))
1811 		goto out;
1812 
1813 	discard_locked = true;
1814 
1815 	if (!bkey_eq(*discard_pos_done, iter.pos) &&
1816 	    ca->mi.discard && !c->opts.nochanges) {
1817 		/*
1818 		 * This works without any other locks because this is the only
1819 		 * thread that removes items from the need_discard tree
1820 		 */
1821 		bch2_trans_unlock_long(trans);
1822 		blkdev_issue_discard(ca->disk_sb.bdev,
1823 				     k.k->p.offset * ca->mi.bucket_size,
1824 				     ca->mi.bucket_size,
1825 				     GFP_KERNEL);
1826 		*discard_pos_done = iter.pos;
1827 
1828 		ret = bch2_trans_relock_notrace(trans);
1829 		if (ret)
1830 			goto out;
1831 	}
1832 
1833 	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1834 write:
1835 	alloc_data_type_set(&a->v, a->v.data_type);
1836 
1837 	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
1838 		bch2_trans_commit(trans, NULL, NULL,
1839 				  BCH_WATERMARK_btree|
1840 				  BCH_TRANS_COMMIT_no_enospc);
1841 	if (ret)
1842 		goto out;
1843 
1844 	count_event(c, bucket_discard);
1845 	s->discarded++;
1846 out:
1847 	if (discard_locked)
1848 		discard_in_flight_remove(ca, iter.pos.offset);
1849 	s->seen++;
1850 	bch2_trans_iter_exit(trans, &iter);
1851 	printbuf_exit(&buf);
1852 	return ret;
1853 }
1854 
1855 static void bch2_do_discards_work(struct work_struct *work)
1856 {
1857 	struct bch_dev *ca = container_of(work, struct bch_dev, discard_work);
1858 	struct bch_fs *c = ca->fs;
1859 	struct discard_buckets_state s = {};
1860 	struct bpos discard_pos_done = POS_MAX;
1861 	int ret;
1862 
1863 	/*
1864 	 * We're doing the commit in bch2_discard_one_bucket instead of using
1865 	 * for_each_btree_key_commit() so that we can increment counters after
1866 	 * successful commit:
1867 	 */
1868 	ret = bch2_trans_run(c,
1869 		for_each_btree_key_upto(trans, iter,
1870 				   BTREE_ID_need_discard,
1871 				   POS(ca->dev_idx, 0),
1872 				   POS(ca->dev_idx, U64_MAX), 0, k,
1873 			bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));
1874 
1875 	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
1876 			      bch2_err_str(ret));
1877 
1878 	percpu_ref_put(&ca->io_ref);
1879 	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1880 }
1881 
1882 void bch2_dev_do_discards(struct bch_dev *ca)
1883 {
1884 	struct bch_fs *c = ca->fs;
1885 
1886 	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
1887 		return;
1888 
1889 	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
1890 		goto put_write_ref;
1891 
1892 	if (queue_work(c->write_ref_wq, &ca->discard_work))
1893 		return;
1894 
1895 	percpu_ref_put(&ca->io_ref);
1896 put_write_ref:
1897 	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
1898 }
1899 
1900 void bch2_do_discards(struct bch_fs *c)
1901 {
1902 	for_each_member_device(c, ca)
1903 		bch2_dev_do_discards(ca);
1904 }
1905 
1906 static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
1907 {
1908 	struct btree_iter iter;
1909 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
1910 	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
1911 	int ret = bkey_err(k);
1912 	if (ret)
1913 		goto err;
1914 
1915 	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
1916 	ret = PTR_ERR_OR_ZERO(a);
1917 	if (ret)
1918 		goto err;
1919 
1920 	BUG_ON(a->v.dirty_sectors);
1921 	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
1922 	alloc_data_type_set(&a->v, a->v.data_type);
1923 
1924 	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
1925 err:
1926 	bch2_trans_iter_exit(trans, &iter);
1927 	return ret;
1928 }
1929 
1930 static void bch2_do_discards_fast_work(struct work_struct *work)
1931 {
1932 	struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
1933 	struct bch_fs *c = ca->fs;
1934 
1935 	while (1) {
1936 		bool got_bucket = false;
1937 		u64 bucket;
1938 
1939 		mutex_lock(&ca->discard_buckets_in_flight_lock);
1940 		darray_for_each(ca->discard_buckets_in_flight, i) {
1941 			if (i->in_progress)
1942 				continue;
1943 
1944 			got_bucket = true;
1945 			bucket = i->bucket;
1946 			i->in_progress = true;
1947 			break;
1948 		}
1949 		mutex_unlock(&ca->discard_buckets_in_flight_lock);
1950 
1951 		if (!got_bucket)
1952 			break;
1953 
1954 		if (ca->mi.discard && !c->opts.nochanges)
1955 			blkdev_issue_discard(ca->disk_sb.bdev,
1956 					     bucket_to_sector(ca, bucket),
1957 					     ca->mi.bucket_size,
1958 					     GFP_KERNEL);
1959 
1960 		int ret = bch2_trans_do(c, NULL, NULL,
1961 			BCH_WATERMARK_btree|
1962 			BCH_TRANS_COMMIT_no_enospc,
1963 			bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
1964 		bch_err_fn(c, ret);
1965 
1966 		discard_in_flight_remove(ca, bucket);
1967 
1968 		if (ret)
1969 			break;
1970 	}
1971 
1972 	percpu_ref_put(&ca->io_ref);
1973 	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
1974 }
1975 
1976 static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
1977 {
1978 	struct bch_fs *c = ca->fs;
1979 
1980 	if (discard_in_flight_add(ca, bucket, false))
1981 		return;
1982 
1983 	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
1984 		return;
1985 
1986 	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
1987 		goto put_ref;
1988 
1989 	if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
1990 		return;
1991 
1992 	percpu_ref_put(&ca->io_ref);
1993 put_ref:
1994 	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
1995 }
1996 
1997 static int invalidate_one_bucket(struct btree_trans *trans,
1998 				 struct btree_iter *lru_iter,
1999 				 struct bkey_s_c lru_k,
2000 				 s64 *nr_to_invalidate)
2001 {
2002 	struct bch_fs *c = trans->c;
2003 	struct bkey_i_alloc_v4 *a = NULL;
2004 	struct printbuf buf = PRINTBUF;
2005 	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
2006 	unsigned cached_sectors;
2007 	int ret = 0;
2008 
2009 	if (*nr_to_invalidate <= 0)
2010 		return 1;
2011 
2012 	if (!bch2_dev_bucket_exists(c, bucket)) {
2013 		prt_str(&buf, "lru entry points to invalid bucket");
2014 		goto err;
2015 	}
2016 
2017 	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
2018 		return 0;
2019 
2020 	a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
2021 	ret = PTR_ERR_OR_ZERO(a);
2022 	if (ret)
2023 		goto out;
2024 
2025 	/* We expect harmless races here due to the btree write buffer: */
2026 	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
2027 		goto out;
2028 
2029 	BUG_ON(a->v.data_type != BCH_DATA_cached);
2030 	BUG_ON(a->v.dirty_sectors);
2031 
2032 	if (!a->v.cached_sectors)
2033 		bch_err(c, "invalidating empty bucket, confused");
2034 
2035 	cached_sectors = a->v.cached_sectors;
2036 
2037 	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
2038 	a->v.gen++;
2039 	a->v.data_type		= 0;
2040 	a->v.dirty_sectors	= 0;
2041 	a->v.stripe_sectors	= 0;
2042 	a->v.cached_sectors	= 0;
2043 	a->v.io_time[READ]	= bch2_current_io_time(c, READ);
2044 	a->v.io_time[WRITE]	= bch2_current_io_time(c, WRITE);
2045 
2046 	ret = bch2_trans_commit(trans, NULL, NULL,
2047 				BCH_WATERMARK_btree|
2048 				BCH_TRANS_COMMIT_no_enospc);
2049 	if (ret)
2050 		goto out;
2051 
2052 	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
2053 	--*nr_to_invalidate;
2054 out:
2055 	printbuf_exit(&buf);
2056 	return ret;
2057 err:
2058 	prt_str(&buf, "\n  lru key: ");
2059 	bch2_bkey_val_to_text(&buf, c, lru_k);
2060 
2061 	prt_str(&buf, "\n  lru entry: ");
2062 	bch2_lru_pos_to_text(&buf, lru_iter->pos);
2063 
2064 	prt_str(&buf, "\n  alloc key: ");
2065 	if (!a)
2066 		bch2_bpos_to_text(&buf, bucket);
2067 	else
2068 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
2069 
2070 	bch_err(c, "%s", buf.buf);
2071 	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
2072 		bch2_inconsistent_error(c);
2073 		ret = -EINVAL;
2074 	}
2075 
2076 	goto out;
2077 }
2078 
2079 static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
2080 				    struct bch_dev *ca, bool *wrapped)
2081 {
2082 	struct bkey_s_c k;
2083 again:
2084 	k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
2085 	if (!k.k && !*wrapped) {
2086 		bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
2087 		*wrapped = true;
2088 		goto again;
2089 	}
2090 
2091 	return k;
2092 }
2093 
2094 static void bch2_do_invalidates_work(struct work_struct *work)
2095 {
2096 	struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
2097 	struct bch_fs *c = ca->fs;
2098 	struct btree_trans *trans = bch2_trans_get(c);
2099 	int ret = 0;
2100 
2101 	ret = bch2_btree_write_buffer_tryflush(trans);
2102 	if (ret)
2103 		goto err;
2104 
2105 	s64 nr_to_invalidate =
2106 		should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
2107 	struct btree_iter iter;
2108 	bool wrapped = false;
2109 
2110 	bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
2111 			     lru_pos(ca->dev_idx, 0,
2112 				     ((bch2_current_io_time(c, READ) + U32_MAX) &
2113 				      LRU_TIME_MAX)), 0);
2114 
2115 	while (true) {
2116 		bch2_trans_begin(trans);
2117 
2118 		struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
2119 		ret = bkey_err(k);
2120 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2121 			continue;
2122 		if (ret)
2123 			break;
2124 		if (!k.k)
2125 			break;
2126 
2127 		ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
2128 		if (ret)
2129 			break;
2130 
2131 		bch2_btree_iter_advance(&iter);
2132 	}
2133 	bch2_trans_iter_exit(trans, &iter);
2134 err:
2135 	bch2_trans_put(trans);
2136 	percpu_ref_put(&ca->io_ref);
2137 	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
2138 }
2139 
2140 void bch2_dev_do_invalidates(struct bch_dev *ca)
2141 {
2142 	struct bch_fs *c = ca->fs;
2143 
2144 	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
2145 		return;
2146 
2147 	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
2148 		goto put_ref;
2149 
2150 	if (queue_work(c->write_ref_wq, &ca->invalidate_work))
2151 		return;
2152 
2153 	percpu_ref_put(&ca->io_ref);
2154 put_ref:
2155 	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
2156 }
2157 
2158 void bch2_do_invalidates(struct bch_fs *c)
2159 {
2160 	for_each_member_device(c, ca)
2161 		bch2_dev_do_invalidates(ca);
2162 }
2163 
2164 int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
2165 			    u64 bucket_start, u64 bucket_end)
2166 {
2167 	struct btree_trans *trans = bch2_trans_get(c);
2168 	struct btree_iter iter;
2169 	struct bkey_s_c k;
2170 	struct bkey hole;
2171 	struct bpos end = POS(ca->dev_idx, bucket_end);
2172 	struct bch_member *m;
2173 	unsigned long last_updated = jiffies;
2174 	int ret;
2175 
2176 	BUG_ON(bucket_start > bucket_end);
2177 	BUG_ON(bucket_end > ca->mi.nbuckets);
2178 
2179 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
2180 		POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
2181 		BTREE_ITER_prefetch);
2182 	/*
2183 	 * Scan the alloc btree for every bucket on @ca, and add buckets to the
2184 	 * freespace/need_discard/need_gc_gens btrees as needed:
2185 	 */
2186 	while (1) {
2187 		if (time_after(jiffies, last_updated + HZ * 10)) {
2188 			bch_info(ca, "%s: currently at %llu/%llu",
2189 				 __func__, iter.pos.offset, ca->mi.nbuckets);
2190 			last_updated = jiffies;
2191 		}
2192 
2193 		bch2_trans_begin(trans);
2194 
2195 		if (bkey_ge(iter.pos, end)) {
2196 			ret = 0;
2197 			break;
2198 		}
2199 
2200 		k = bch2_get_key_or_hole(&iter, end, &hole);
2201 		ret = bkey_err(k);
2202 		if (ret)
2203 			goto bkey_err;
2204 
2205 		if (k.k->type) {
2206 			/*
2207 			 * We process live keys in the alloc btree one at a
2208 			 * time:
2209 			 */
2210 			struct bch_alloc_v4 a_convert;
2211 			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
2212 
2213 			ret =   bch2_bucket_do_index(trans, ca, k, a, true) ?:
2214 				bch2_trans_commit(trans, NULL, NULL,
2215 						  BCH_TRANS_COMMIT_no_enospc);
2216 			if (ret)
2217 				goto bkey_err;
2218 
2219 			bch2_btree_iter_advance(&iter);
2220 		} else {
2221 			struct bkey_i *freespace;
2222 
2223 			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
2224 			ret = PTR_ERR_OR_ZERO(freespace);
2225 			if (ret)
2226 				goto bkey_err;
2227 
2228 			bkey_init(&freespace->k);
2229 			freespace->k.type	= KEY_TYPE_set;
2230 			freespace->k.p		= k.k->p;
2231 			freespace->k.size	= k.k->size;
2232 
2233 			ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
2234 				bch2_trans_commit(trans, NULL, NULL,
2235 						  BCH_TRANS_COMMIT_no_enospc);
2236 			if (ret)
2237 				goto bkey_err;
2238 
2239 			bch2_btree_iter_set_pos(&iter, k.k->p);
2240 		}
2241 bkey_err:
2242 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
2243 			continue;
2244 		if (ret)
2245 			break;
2246 	}
2247 
2248 	bch2_trans_iter_exit(trans, &iter);
2249 	bch2_trans_put(trans);
2250 
2251 	if (ret < 0) {
2252 		bch_err_msg(ca, ret, "initializing free space");
2253 		return ret;
2254 	}
2255 
2256 	mutex_lock(&c->sb_lock);
2257 	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
2258 	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
2259 	mutex_unlock(&c->sb_lock);
2260 
2261 	return 0;
2262 }
2263 
2264 int bch2_fs_freespace_init(struct bch_fs *c)
2265 {
2266 	int ret = 0;
2267 	bool doing_init = false;
2268 
2269 	/*
2270 	 * We can crash during the device add path, so we need to check this on
2271 	 * every mount:
2272 	 */
2273 
2274 	for_each_member_device(c, ca) {
2275 		if (ca->mi.freespace_initialized)
2276 			continue;
2277 
2278 		if (!doing_init) {
2279 			bch_info(c, "initializing freespace");
2280 			doing_init = true;
2281 		}
2282 
2283 		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
2284 		if (ret) {
2285 			bch2_dev_put(ca);
2286 			bch_err_fn(c, ret);
2287 			return ret;
2288 		}
2289 	}
2290 
2291 	if (doing_init) {
2292 		mutex_lock(&c->sb_lock);
2293 		bch2_write_super(c);
2294 		mutex_unlock(&c->sb_lock);
2295 		bch_verbose(c, "done initializing freespace");
2296 	}
2297 
2298 	return 0;
2299 }
2300 
2301 /* device removal */
2302 
2303 int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
2304 {
2305 	struct bpos start	= POS(ca->dev_idx, 0);
2306 	struct bpos end		= POS(ca->dev_idx, U64_MAX);
2307 	int ret;
2308 
2309 	/*
2310 	 * We clear the LRU and need_discard btrees first so that we don't race
2311 	 * with bch2_do_invalidates() and bch2_do_discards()
2312 	 */
2313 	ret =   bch2_dev_remove_stripes(c, ca->dev_idx) ?:
2314 		bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
2315 					BTREE_TRIGGER_norun, NULL) ?:
2316 		bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
2317 					BTREE_TRIGGER_norun, NULL) ?:
2318 		bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
2319 					BTREE_TRIGGER_norun, NULL) ?:
2320 		bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
2321 					BTREE_TRIGGER_norun, NULL) ?:
2322 		bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
2323 					BTREE_TRIGGER_norun, NULL) ?:
2324 		bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
2325 					BTREE_TRIGGER_norun, NULL) ?:
2326 		bch2_dev_usage_remove(c, ca->dev_idx);
2327 	bch_err_msg(ca, ret, "removing dev alloc info");
2328 	return ret;
2329 }
2330 
2331 /* Bucket IO clocks: */
2332 
2333 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
2334 			      size_t bucket_nr, int rw)
2335 {
2336 	struct bch_fs *c = trans->c;
2337 	struct btree_iter iter;
2338 	struct bkey_i_alloc_v4 *a;
2339 	u64 now;
2340 	int ret = 0;
2341 
2342 	if (bch2_trans_relock(trans))
2343 		bch2_trans_begin(trans);
2344 
2345 	a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
2346 	ret = PTR_ERR_OR_ZERO(a);
2347 	if (ret)
2348 		return ret;
2349 
2350 	now = bch2_current_io_time(c, rw);
2351 	if (a->v.io_time[rw] == now)
2352 		goto out;
2353 
2354 	a->v.io_time[rw] = now;
2355 
2356 	ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
2357 		bch2_trans_commit(trans, NULL, NULL, 0);
2358 out:
2359 	bch2_trans_iter_exit(trans, &iter);
2360 	return ret;
2361 }
2362 
2363 /* Startup/shutdown (ro/rw): */
2364 
2365 void bch2_recalc_capacity(struct bch_fs *c)
2366 {
2367 	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
2368 	unsigned bucket_size_max = 0;
2369 	unsigned long ra_pages = 0;
2370 
2371 	lockdep_assert_held(&c->state_lock);
2372 
2373 	for_each_online_member(c, ca) {
2374 		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
2375 
2376 		ra_pages += bdi->ra_pages;
2377 	}
2378 
2379 	bch2_set_ra_pages(c, ra_pages);
2380 
2381 	for_each_rw_member(c, ca) {
2382 		u64 dev_reserve = 0;
2383 
2384 		/*
2385 		 * We need to reserve buckets (from the number
2386 		 * of currently available buckets) against
2387 		 * foreground writes so that mainly copygc can
2388 		 * make forward progress.
2389 		 *
2390 		 * We need enough to refill the various reserves
2391 		 * from scratch - copygc will use its entire
2392 		 * reserve all at once, then run against when
2393 		 * its reserve is refilled (from the formerly
2394 		 * available buckets).
2395 		 *
2396 		 * This reserve is just used when considering if
2397 		 * allocations for foreground writes must wait -
2398 		 * not -ENOSPC calculations.
2399 		 */
2400 
2401 		dev_reserve += ca->nr_btree_reserve * 2;
2402 		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
2403 
2404 		dev_reserve += 1;	/* btree write point */
2405 		dev_reserve += 1;	/* copygc write point */
2406 		dev_reserve += 1;	/* rebalance write point */
2407 
2408 		dev_reserve *= ca->mi.bucket_size;
2409 
2410 		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
2411 					     ca->mi.first_bucket);
2412 
2413 		reserved_sectors += dev_reserve * 2;
2414 
2415 		bucket_size_max = max_t(unsigned, bucket_size_max,
2416 					ca->mi.bucket_size);
2417 	}
2418 
2419 	gc_reserve = c->opts.gc_reserve_bytes
2420 		? c->opts.gc_reserve_bytes >> 9
2421 		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);
2422 
2423 	reserved_sectors = max(gc_reserve, reserved_sectors);
2424 
2425 	reserved_sectors = min(reserved_sectors, capacity);
2426 
2427 	c->reserved = reserved_sectors;
2428 	c->capacity = capacity - reserved_sectors;
2429 
2430 	c->bucket_size_max = bucket_size_max;
2431 
2432 	/* Wake up case someone was waiting for buckets */
2433 	closure_wake_up(&c->freelist_wait);
2434 }
2435 
2436 u64 bch2_min_rw_member_capacity(struct bch_fs *c)
2437 {
2438 	u64 ret = U64_MAX;
2439 
2440 	for_each_rw_member(c, ca)
2441 		ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
2442 	return ret;
2443 }
2444 
2445 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
2446 {
2447 	struct open_bucket *ob;
2448 	bool ret = false;
2449 
2450 	for (ob = c->open_buckets;
2451 	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
2452 	     ob++) {
2453 		spin_lock(&ob->lock);
2454 		if (ob->valid && !ob->on_partial_list &&
2455 		    ob->dev == ca->dev_idx)
2456 			ret = true;
2457 		spin_unlock(&ob->lock);
2458 	}
2459 
2460 	return ret;
2461 }
2462 
2463 /* device goes ro: */
2464 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
2465 {
2466 	lockdep_assert_held(&c->state_lock);
2467 
2468 	/* First, remove device from allocation groups: */
2469 
2470 	for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2471 		clear_bit(ca->dev_idx, c->rw_devs[i].d);
2472 
2473 	c->rw_devs_change_count++;
2474 
2475 	/*
2476 	 * Capacity is calculated based off of devices in allocation groups:
2477 	 */
2478 	bch2_recalc_capacity(c);
2479 
2480 	bch2_open_buckets_stop(c, ca, false);
2481 
2482 	/*
2483 	 * Wake up threads that were blocked on allocation, so they can notice
2484 	 * the device can no longer be removed and the capacity has changed:
2485 	 */
2486 	closure_wake_up(&c->freelist_wait);
2487 
2488 	/*
2489 	 * journal_res_get() can block waiting for free space in the journal -
2490 	 * it needs to notice there may not be devices to allocate from anymore:
2491 	 */
2492 	wake_up(&c->journal.wait);
2493 
2494 	/* Now wait for any in flight writes: */
2495 
2496 	closure_wait_event(&c->open_buckets_wait,
2497 			   !bch2_dev_has_open_write_point(c, ca));
2498 }
2499 
2500 /* device goes rw: */
2501 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
2502 {
2503 	lockdep_assert_held(&c->state_lock);
2504 
2505 	for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
2506 		if (ca->mi.data_allowed & (1 << i))
2507 			set_bit(ca->dev_idx, c->rw_devs[i].d);
2508 
2509 	c->rw_devs_change_count++;
2510 }
2511 
2512 void bch2_dev_allocator_background_exit(struct bch_dev *ca)
2513 {
2514 	darray_exit(&ca->discard_buckets_in_flight);
2515 }
2516 
2517 void bch2_dev_allocator_background_init(struct bch_dev *ca)
2518 {
2519 	mutex_init(&ca->discard_buckets_in_flight_lock);
2520 	INIT_WORK(&ca->discard_work, bch2_do_discards_work);
2521 	INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
2522 	INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
2523 }
2524 
2525 void bch2_fs_allocator_background_init(struct bch_fs *c)
2526 {
2527 	spin_lock_init(&c->freelist_lock);
2528 }
2529