// SPDX-License-Identifier: GPL-2.0
/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "backpointers.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "inode.h"
#include "movinggc.h"
#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/preempt.h>

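/* Sum this device's percpu usage counters into *usage; no locks taken. */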
void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
	memset(usage, 0, sizeof(*usage));
	acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
}

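/* Inflate a reservation by roughly 1/2^RESERVE_FACTOR, rounding up: */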
static u64 reserve_factor(u64 r)
{
	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}

static struct bch_fs_usage_short
__bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;
	u64 data, reserved;

	ret.capacity = c->capacity -
		percpu_u64_get(&c->usage->hidden);

	data		= percpu_u64_get(&c->usage->data) +
		percpu_u64_get(&c->usage->btree);
	reserved	= percpu_u64_get(&c->usage->reserved) +
		percpu_u64_get(c->online_reserved);

	ret.used	= min(ret.capacity, data + reserve_factor(reserved));
	ret.free	= ret.capacity - ret.used;

	ret.nr_inodes	= percpu_u64_get(&c->usage->nr_inodes);

	return ret;
}

struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;

	percpu_down_read(&c->mark_lock);
	ret = __bch2_fs_usage_read_short(c);
	percpu_up_read(&c->mark_lock);

	return ret;
}

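/* Print a per-data-type table of bucket, sector and fragmented counts: */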
void bch2_dev_usage_to_text(struct printbuf *out,
			    struct bch_dev *ca,
			    struct bch_dev_usage *usage)
{
	prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n");

	for (unsigned i = 0; i < BCH_DATA_NR; i++) {
		bch2_prt_data_type(out, i);
		prt_printf(out, "\t%llu\r%llu\r%llu\r\n",
			   usage->d[i].buckets,
			   usage->d[i].sectors,
			   usage->d[i].fragmented);
	}

	prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets);
}

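/*
 * Check one decoded extent pointer against the in-memory (GC) bucket it
 * points into: repair the bucket's gen/data type in place where that's
 * safe, and set *do_update when the key itself needs to be rewritten.
 */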
static int bch2_check_fix_ptr(struct btree_trans *trans,
			      struct bkey_s_c k,
			      struct extent_ptr_decoded p,
			      const union bch_extent_entry *entry,
			      bool *do_update)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
	if (!ca) {
		if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID,
				trans, ptr_to_invalid_device,
				"pointer to missing device %u\n"
				"while marking %s",
				p.ptr.dev,
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			*do_update = true;
		return 0;
	}

	struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
	if (!g) {
		if (fsck_err(trans, ptr_to_invalid_device,
			     "pointer to invalid bucket on device %u\n"
			     "while marking %s",
			     p.ptr.dev,
			     (printbuf_reset(&buf),
			      bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			*do_update = true;
		goto out;
	}

	enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);

	if (fsck_err_on(!g->gen_valid,
			trans, ptr_to_missing_alloc_key,
			"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
			"while marking %s",
			p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
			p.ptr.gen,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		if (!p.ptr.cached) {
			g->gen_valid		= true;
			g->gen			= p.ptr.gen;
		} else {
			*do_update = true;
		}
	}

	if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
			trans, ptr_gen_newer_than_bucket_gen,
			"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
			"while marking %s",
			p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
			p.ptr.gen, g->gen,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		if (!p.ptr.cached &&
		    (g->data_type != BCH_DATA_btree ||
		     data_type == BCH_DATA_btree)) {
			g->gen_valid		= true;
			g->gen			= p.ptr.gen;
			g->data_type		= 0;
			g->stripe_sectors	= 0;
			g->dirty_sectors	= 0;
			g->cached_sectors	= 0;
		} else {
			*do_update = true;
		}
	}

	if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
			trans, ptr_gen_newer_than_bucket_gen,
			"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
			"while marking %s",
			p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
			bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
			p.ptr.gen,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		*do_update = true;

	if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
			trans, stale_dirty_ptr,
			"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
			"while marking %s",
			p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
			bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
			p.ptr.gen, g->gen,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
		*do_update = true;

	if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
		goto out;

	if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
			trans, ptr_bucket_data_type_mismatch,
			"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
			"while marking %s",
			p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
			bch2_data_type_str(g->data_type),
			bch2_data_type_str(data_type),
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		if (data_type == BCH_DATA_btree) {
			g->gen_valid		= true;
			g->gen			= p.ptr.gen;
			g->data_type		= data_type;
			g->stripe_sectors	= 0;
			g->dirty_sectors	= 0;
			g->cached_sectors	= 0;
		} else {
			*do_update = true;
		}
	}

	if (p.has_ec) {
		struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);

		if (fsck_err_on(!m || !m->alive,
				trans, ptr_to_missing_stripe,
				"pointer to nonexistent stripe %llu\n"
				"while marking %s",
				(u64) p.ec.idx,
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			*do_update = true;

		if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
				trans, ptr_to_incorrect_stripe,
				"pointer does not match stripe %llu\n"
				"while marking %s",
				(u64) p.ec.idx,
				(printbuf_reset(&buf),
				 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			*do_update = true;
	}
out:
fsck_err:
	bch2_dev_put(ca);
	printbuf_exit(&buf);
	return ret;
}

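/*
 * Run bch2_check_fix_ptr() on every pointer in @k; if the key needs fixing,
 * drop dead pointers and stale stripe entries (except on interior nodes,
 * where pointer gens are updated instead) and write back the repaired key.
 */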
int bch2_check_fix_ptrs(struct btree_trans *trans,
			enum btree_id btree, unsigned level, struct bkey_s_c k,
			enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry_c;
	struct extent_ptr_decoded p = { 0 };
	bool do_update = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	percpu_down_read(&c->mark_lock);

	bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
		ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
		if (ret)
			goto err;
	}

	if (do_update) {
		if (flags & BTREE_TRIGGER_is_root) {
			bch_err(c, "cannot update btree roots yet");
			ret = -EINVAL;
			goto err;
		}

		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto err;

		rcu_read_lock();
		bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev));
		rcu_read_unlock();

		if (level) {
			/*
			 * We don't want to drop btree node pointers - if the
			 * btree node isn't there anymore, the read path will
			 * sort it out:
			 */
			struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
			rcu_read_lock();
			bkey_for_each_ptr(ptrs, ptr) {
				struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
				struct bucket *g = PTR_GC_BUCKET(ca, ptr);

				ptr->gen = g->gen;
			}
			rcu_read_unlock();
		} else {
			struct bkey_ptrs ptrs;
			union bch_extent_entry *entry;

			rcu_read_lock();
restart_drop_ptrs:
			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
			bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
				struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
				struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
				enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);

				if ((p.ptr.cached &&
				     (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
				    (!p.ptr.cached &&
				     gen_cmp(p.ptr.gen, g->gen) < 0) ||
				    gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
				    (g->data_type &&
				     g->data_type != data_type)) {
					bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
					goto restart_drop_ptrs;
				}
			}
			rcu_read_unlock();
again:
			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
			bkey_extent_entry_for_each(ptrs, entry) {
				if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
					struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
									   entry->stripe_ptr.idx);
					union bch_extent_entry *next_ptr;

					bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
						if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
							goto found;
					next_ptr = NULL;
found:
					if (!next_ptr) {
						bch_err(c, "aieee, found stripe ptr with no data ptr");
						continue;
					}

					if (!m || !m->alive ||
					    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
								       &next_ptr->ptr,
								       m->sectors)) {
						bch2_bkey_extent_entry_drop(new, entry);
						goto again;
					}
				}
			}
		}

		if (0) {
			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, k);
			bch_info(c, "updated %s", buf.buf);

			printbuf_reset(&buf);
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
			bch_info(c, "new key %s", buf.buf);
		}

		percpu_up_read(&c->mark_lock);
		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
					  BTREE_ITER_intent|BTREE_ITER_all_snapshots);
		ret = bch2_btree_iter_traverse(&iter) ?:
			bch2_trans_update(trans, &iter, new,
					  BTREE_UPDATE_internal_snapshot_node|
					  BTREE_TRIGGER_norun);
		bch2_trans_iter_exit(trans, &iter);
		percpu_down_read(&c->mark_lock);

		if (ret)
			goto err;

		if (level)
			bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
	}
err:
	percpu_up_read(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

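/*
 * Apply a sector count change for one pointer to its bucket, after checking
 * generation numbers and data types; returns 1 for a stale cached pointer
 * (nothing to do), or -EIO on an inconsistency fatal to the insert.
 */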
int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
			   struct bkey_s_c k,
			   const struct bch_extent_ptr *ptr,
			   s64 sectors, enum bch_data_type ptr_data_type,
			   u8 b_gen, u8 bucket_data_type,
			   u32 *bucket_sectors)
{
	struct bch_fs *c = trans->c;
	size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
	struct printbuf buf = PRINTBUF;
	bool inserting = sectors > 0;
	int ret = 0;

	BUG_ON(!sectors);

	if (gen_after(ptr->gen, b_gen)) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      ptr_gen_newer_than_bucket_gen,
			      "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      bch2_data_type_str(bucket_data_type ?: ptr_data_type),
			      ptr->gen,
			      (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (inserting)
			goto err;
		goto out;
	}

	if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      ptr_too_stale,
			      "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      bch2_data_type_str(bucket_data_type ?: ptr_data_type),
			      ptr->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (inserting)
			goto err;
		goto out;
	}

	if (b_gen != ptr->gen && ptr->cached) {
		ret = 1;
		goto out;
	}

	if (b_gen != ptr->gen) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      stale_dirty_ptr,
			      "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      bucket_gen_get(ca, bucket_nr),
			      bch2_data_type_str(bucket_data_type ?: ptr_data_type),
			      ptr->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (inserting)
			goto err;
		goto out;
	}

	if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      ptr_bucket_data_type_mismatch,
			      "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      bch2_data_type_str(bucket_data_type),
			      bch2_data_type_str(ptr_data_type),
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (inserting)
			goto err;
		goto out;
	}

	if ((u64) *bucket_sectors + sectors > U32_MAX) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      bucket_sector_count_overflow,
			      "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      bch2_data_type_str(bucket_data_type ?: ptr_data_type),
			      *bucket_sectors, sectors,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		if (inserting)
			goto err;
		sectors = -*bucket_sectors;
	}

	*bucket_sectors += sectors;
out:
	printbuf_exit(&buf);
	return ret;
err:
	bch2_dump_trans_updates(trans);
	ret = -EIO;
	goto out;
}

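/*
 * Fold the usage delta accumulated by a transaction's triggers into the
 * filesystem usage counters, releasing the matching disk reservation;
 * warns (once) if usage grew by more than was reserved.
 */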
void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
	static int warned_disk_usage = 0;
	bool warn = false;

	percpu_down_read(&c->mark_lock);
	struct bch_fs_usage_base *src = &trans->fs_usage_delta;

	s64 added = src->btree + src->data + src->reserved;

	/*
	 * Not allowed to reduce sectors_available except by getting a
	 * reservation:
	 */
	s64 should_not_have_added = added - (s64) disk_res_sectors;
	if (unlikely(should_not_have_added > 0)) {
		u64 old, new;

		old = atomic64_read(&c->sectors_available);
		do {
			new = max_t(s64, 0, old - should_not_have_added);
		} while (!atomic64_try_cmpxchg(&c->sectors_available,
					       &old, new));

		added -= should_not_have_added;
		warn = true;
	}

	if (added > 0) {
		trans->disk_res->sectors -= added;
		this_cpu_sub(*c->online_reserved, added);
	}

	preempt_disable();
	struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
	acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
	preempt_enable();
	percpu_up_read(&c->mark_lock);

	if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
		bch2_trans_inconsistent(trans,
					"disk usage increased %lli more than %llu sectors reserved",
					should_not_have_added, disk_res_sectors);
}

/* KEY_TYPE_extent: */

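/*
 * Route the sector count change to the bucket field this pointer accounts
 * to (stripe, dirty or cached sectors), then recompute the data type:
 */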
static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
			  struct bkey_s_c k,
			  const struct extent_ptr_decoded *p,
			  s64 sectors, enum bch_data_type ptr_data_type,
			  struct bch_alloc_v4 *a)
{
	u32 *dst_sectors = p->has_ec	? &a->stripe_sectors :
		!p->ptr.cached		? &a->dirty_sectors :
					  &a->cached_sectors;
	int ret = bch2_bucket_ref_update(trans, ca, k, &p->ptr, sectors, ptr_data_type,
					 a->gen, a->data_type, dst_sectors);

	if (ret)
		return ret;

	alloc_data_type_set(a, ptr_data_type);
	return 0;
}

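/*
 * Trigger for a single extent pointer: updates the owning bucket's alloc
 * info, either transactionally or in the in-memory GC state, maintains the
 * backpointer for dirty pointers, and returns disk sectors via *sectors.
 */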
static int bch2_trigger_pointer(struct btree_trans *trans,
				enum btree_id btree_id, unsigned level,
				struct bkey_s_c k, struct extent_ptr_decoded p,
				const union bch_extent_entry *entry,
				s64 *sectors,
				enum btree_iter_update_trigger_flags flags)
{
	bool insert = !(flags & BTREE_TRIGGER_overwrite);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
	if (unlikely(!ca)) {
		if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
			ret = -EIO;
		goto err;
	}

	struct bpos bucket;
	struct bch_backpointer bp;
	bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp);
	*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);

	if (flags & BTREE_TRIGGER_transactional) {
		struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
		ret = PTR_ERR_OR_ZERO(a) ?:
			__mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v);
		if (ret)
			goto err;

		if (!p.ptr.cached) {
			ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert);
			if (ret)
				goto err;
		}
	}

	if (flags & BTREE_TRIGGER_gc) {
		percpu_down_read(&c->mark_lock);
		struct bucket *g = gc_bucket(ca, bucket.offset);
		if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
					    p.ptr.dev,
					    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			ret = -EIO;
			goto err_unlock;
		}

		bucket_lock(g);
		struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
		ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new);
		alloc_to_bucket(g, new);
		bucket_unlock(g);
err_unlock:
		percpu_up_read(&c->mark_lock);

		if (!ret)
			ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
	}
err:
	bch2_dev_put(ca);
	printbuf_exit(&buf);
	return ret;
}

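/*
 * Trigger for an erasure coded pointer: credit this extent's sectors to the
 * stripe's per-block counts and to the stripe's replicas accounting entry.
 */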
static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
				   struct bkey_s_c k,
				   struct extent_ptr_decoded p,
				   enum bch_data_type data_type,
				   s64 sectors,
				   enum btree_iter_update_trigger_flags flags)
{
	if (flags & BTREE_TRIGGER_transactional) {
		struct btree_iter iter;
		struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_stripes, POS(0, p.ec.idx),
				BTREE_ITER_with_updates, stripe);
		int ret = PTR_ERR_OR_ZERO(s);
		if (unlikely(ret)) {
			bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
						   "pointer to nonexistent stripe %llu",
						   (u64) p.ec.idx);
			goto err;
		}

		if (!bch2_ptr_matches_stripe(&s->v, p)) {
			bch2_trans_inconsistent(trans,
						"stripe pointer doesn't match stripe %llu",
						(u64) p.ec.idx);
			ret = -EIO;
			goto err;
		}

		stripe_blockcount_set(&s->v, p.ec.block,
				      stripe_blockcount_get(&s->v, p.ec.block) +
				      sectors);

		struct disk_accounting_pos acc = {
			.type = BCH_DISK_ACCOUNTING_replicas,
		};
		bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
		acc.replicas.data_type = data_type;
		ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
err:
		bch2_trans_iter_exit(trans, &iter);
		return ret;
	}

	if (flags & BTREE_TRIGGER_gc) {
		struct bch_fs *c = trans->c;

		struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
		if (!m) {
			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
				(u64) p.ec.idx);
			return -BCH_ERR_ENOMEM_mark_stripe_ptr;
		}

		mutex_lock(&c->ec_stripes_heap_lock);

		if (!m || !m->alive) {
			mutex_unlock(&c->ec_stripes_heap_lock);
			struct printbuf buf = PRINTBUF;
			bch2_bkey_val_to_text(&buf, c, k);
			bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
					    (u64) p.ec.idx, buf.buf);
			printbuf_exit(&buf);
			bch2_inconsistent_error(c);
			return -EIO;
		}

		m->block_sectors[p.ec.block] += sectors;

		struct disk_accounting_pos acc = {
			.type = BCH_DISK_ACCOUNTING_replicas,
		};
		memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
		mutex_unlock(&c->ec_stripes_heap_lock);

		acc.replicas.data_type = data_type;
		int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
		if (ret)
			return ret;
	}

	return 0;
}

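/*
 * Shared extent trigger: walks every pointer, updating buckets and stripes,
 * and maintains the replicas, compression, snapshot and per-inode
 * accounting counters.
 */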
static int __trigger_extent(struct btree_trans *trans,
			    enum btree_id btree_id, unsigned level,
			    struct bkey_s_c k,
			    enum btree_iter_update_trigger_flags flags,
			    s64 *replicas_sectors)
{
	bool gc = flags & BTREE_TRIGGER_gc;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
		? BCH_DATA_btree
		: BCH_DATA_user;
	int ret = 0;

	struct disk_accounting_pos acc_replicas_key = {
		.type			= BCH_DISK_ACCOUNTING_replicas,
		.replicas.data_type	= data_type,
		.replicas.nr_devs	= 0,
		.replicas.nr_required	= 1,
	};

	struct disk_accounting_pos acct_compression_key = {
		.type			= BCH_DISK_ACCOUNTING_compression,
	};
	u64 compression_acct[3] = { 1, 0, 0 };

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		s64 disk_sectors = 0;
		ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
		if (ret < 0)
			return ret;

		bool stale = ret > 0;

		if (p.ptr.cached && stale)
			continue;

		if (p.ptr.cached) {
			ret = bch2_mod_dev_cached_sectors(trans, p.ptr.dev, disk_sectors, gc);
			if (ret)
				return ret;
		} else if (!p.has_ec) {
			*replicas_sectors += disk_sectors;
			acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
		} else {
			ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
			if (ret)
				return ret;

			/*
			 * There may be other dirty pointers in this extent, but
			 * if so they're not required for mounting if we have an
			 * erasure coded pointer in this extent:
			 */
			acc_replicas_key.replicas.nr_required = 0;
		}

		if (acct_compression_key.compression.type &&
		    acct_compression_key.compression.type != p.crc.compression_type) {
			if (flags & BTREE_TRIGGER_overwrite)
				bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));

			ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
						       ARRAY_SIZE(compression_acct), gc);
			if (ret)
				return ret;

			compression_acct[0] = 1;
			compression_acct[1] = 0;
			compression_acct[2] = 0;
		}

		acct_compression_key.compression.type = p.crc.compression_type;
		if (p.crc.compression_type) {
			compression_acct[1] += p.crc.uncompressed_size;
			compression_acct[2] += p.crc.compressed_size;
		}
	}

	if (acc_replicas_key.replicas.nr_devs) {
		ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
		if (ret)
			return ret;
	}

	if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
		struct disk_accounting_pos acc_snapshot_key = {
			.type		= BCH_DISK_ACCOUNTING_snapshot,
			.snapshot.id	= k.k->p.snapshot,
		};
		ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
		if (ret)
			return ret;
	}

	if (acct_compression_key.compression.type) {
		if (flags & BTREE_TRIGGER_overwrite)
			bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));

		ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
					       ARRAY_SIZE(compression_acct), gc);
		if (ret)
			return ret;
	}

	if (level) {
		struct disk_accounting_pos acc_btree_key = {
			.type		= BCH_DISK_ACCOUNTING_btree,
			.btree.id	= btree_id,
		};
		ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
		if (ret)
			return ret;
	} else {
		bool insert = !(flags & BTREE_TRIGGER_overwrite);
		struct disk_accounting_pos acc_inum_key = {
			.type		= BCH_DISK_ACCOUNTING_inum,
			.inum.inum	= k.k->p.inode,
		};
		s64 v[3] = {
			insert ? 1 : -1,
			insert ? k.k->size : -((s64) k.k->size),
			*replicas_sectors,
		};
		ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
		if (ret)
			return ret;
	}

	return 0;
}

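/*
 * Extent trigger entry point: runs the old key as an overwrite and the new
 * key as an insert, then updates the rebalance_work btree and accounting
 * if the amount of work needing rebalance changed.
 */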
int bch2_trigger_extent(struct btree_trans *trans,
			enum btree_id btree, unsigned level,
			struct bkey_s_c old, struct bkey_s new,
			enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
	struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
	unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
	unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;

	if (unlikely(flags & BTREE_TRIGGER_check_repair))
		return bch2_check_fix_ptrs(trans, btree, level, new.s_c, flags);

	/* if pointers aren't changing - nothing to do: */
	if (new_ptrs_bytes == old_ptrs_bytes &&
	    !memcmp(new_ptrs.start,
		    old_ptrs.start,
		    new_ptrs_bytes))
		return 0;

	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
		s64 old_replicas_sectors = 0, new_replicas_sectors = 0;

		if (old.k->type) {
			int ret = __trigger_extent(trans, btree, level, old,
						   flags & ~BTREE_TRIGGER_insert,
						   &old_replicas_sectors);
			if (ret)
				return ret;
		}

		if (new.k->type) {
			int ret = __trigger_extent(trans, btree, level, new.s_c,
						   flags & ~BTREE_TRIGGER_overwrite,
						   &new_replicas_sectors);
			if (ret)
				return ret;
		}

		int need_rebalance_delta = 0;
		s64 need_rebalance_sectors_delta = 0;

		s64 s = bch2_bkey_sectors_need_rebalance(c, old);
		need_rebalance_delta -= s != 0;
		need_rebalance_sectors_delta -= s;

		s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
		need_rebalance_delta += s != 0;
		need_rebalance_sectors_delta += s;

		if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
							      new.k->p, need_rebalance_delta > 0);
			if (ret)
				return ret;
		}

		if (need_rebalance_sectors_delta) {
			struct disk_accounting_pos acc = {
				.type = BCH_DISK_ACCOUNTING_rebalance_work,
			};
			int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
							   flags & BTREE_TRIGGER_gc);
			if (ret)
				return ret;
		}
	}

	return 0;
}

/* KEY_TYPE_reservation */

static int __trigger_reservation(struct btree_trans *trans,
				 enum btree_id btree_id, unsigned level, struct bkey_s_c k,
				 enum btree_iter_update_trigger_flags flags)
{
	if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
		s64 sectors = k.k->size;

		if (flags & BTREE_TRIGGER_overwrite)
			sectors = -sectors;

		struct disk_accounting_pos acc = {
			.type = BCH_DISK_ACCOUNTING_persistent_reserved,
			.persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas,
		};

		return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc);
	}

	return 0;
}

int bch2_trigger_reservation(struct btree_trans *trans,
			     enum btree_id btree_id, unsigned level,
			     struct bkey_s_c old, struct bkey_s new,
			     enum btree_iter_update_trigger_flags flags)
{
	return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
}

/* Mark superblocks: */

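/*
 * Transactionally set a bucket's data type and dirty sector count in the
 * alloc btree, for buckets holding superblock or journal metadata:
 */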
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
					     struct bch_dev *ca, u64 b,
					     enum bch_data_type type,
					     unsigned sectors)
{
	struct btree_iter iter;
	int ret = 0;

	struct bkey_i_alloc_v4 *a =
		bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(ca->dev_idx, b));
	if (IS_ERR(a))
		return PTR_ERR(a);

	if (a->v.data_type && type && a->v.data_type != type) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      bucket_metadata_type_mismatch,
			      "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
			      "while marking %s",
			      iter.pos.inode, iter.pos.offset, a->v.gen,
			      bch2_data_type_str(a->v.data_type),
			      bch2_data_type_str(type),
			      bch2_data_type_str(type));
		ret = -EIO;
		goto err;
	}

	if (a->v.data_type	!= type ||
	    a->v.dirty_sectors	!= sectors) {
		a->v.data_type		= type;
		a->v.dirty_sectors	= sectors;
		ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca,
				     u64 b, enum bch_data_type data_type, unsigned sectors,
				     enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	percpu_down_read(&c->mark_lock);
	struct bucket *g = gc_bucket(ca, b);
	if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
				    ca->dev_idx, bch2_data_type_str(data_type)))
		goto err_unlock;

	bucket_lock(g);
	struct bch_alloc_v4 old = bucket_m_to_alloc(*g);

	if (bch2_fs_inconsistent_on(g->data_type &&
				    g->data_type != data_type, c,
				    "different types of data in same bucket: %s, %s",
				    bch2_data_type_str(g->data_type),
				    bch2_data_type_str(data_type)))
		goto err;

	if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
				    "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
				    ca->dev_idx, b, g->gen,
				    bch2_data_type_str(g->data_type ?: data_type),
				    g->dirty_sectors, sectors))
		goto err;

	g->data_type = data_type;
	g->dirty_sectors += sectors;
	struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
	bucket_unlock(g);
	percpu_up_read(&c->mark_lock);
	ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
	return ret;
err:
	bucket_unlock(g);
err_unlock:
	percpu_up_read(&c->mark_lock);
	return -EIO;
}

int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
				    struct bch_dev *ca, u64 b,
				    enum bch_data_type type, unsigned sectors,
				    enum btree_iter_update_trigger_flags flags)
{
	BUG_ON(type != BCH_DATA_free &&
	       type != BCH_DATA_sb &&
	       type != BCH_DATA_journal);

	/*
	 * Backup superblock might be past the end of our normal usable space:
	 */
	if (b >= ca->mi.nbuckets)
		return 0;

	if (flags & BTREE_TRIGGER_gc)
		return bch2_mark_metadata_bucket(trans, ca, b, type, sectors, flags);
	else if (flags & BTREE_TRIGGER_transactional)
		return commit_do(trans, NULL, NULL, 0,
				 __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
	else
		BUG();
}

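/*
 * Mark the metadata occupying sectors [start, end): whole buckets are
 * accumulated across calls via *bucket and *bucket_sectors.
 */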
static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
					    struct bch_dev *ca, u64 start, u64 end,
					    enum bch_data_type type, u64 *bucket, unsigned *bucket_sectors,
					    enum btree_iter_update_trigger_flags flags)
{
	do {
		u64 b = sector_to_bucket(ca, start);
		unsigned sectors =
			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;

		if (b != *bucket && *bucket_sectors) {
			int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
								  type, *bucket_sectors, flags);
			if (ret)
				return ret;

			*bucket_sectors = 0;
		}

		*bucket = b;
		*bucket_sectors += sectors;
		start += sectors;
	} while (start < end);

	return 0;
}

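/*
 * Mark every superblock copy on this device, the layout area before the
 * default superblock offset, and all the journal buckets:
 */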
static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca,
				    enum btree_iter_update_trigger_flags flags)
{
	struct bch_fs *c = trans->c;

	mutex_lock(&c->sb_lock);
	struct bch_sb_layout layout = ca->disk_sb.sb->layout;
	mutex_unlock(&c->sb_lock);

	u64 bucket = 0;
	unsigned i, bucket_sectors = 0;
	int ret;

	for (i = 0; i < layout.nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout.sb_offset[i]);

		if (offset == BCH_SB_SECTOR) {
			ret = bch2_trans_mark_metadata_sectors(trans, ca,
						0, BCH_SB_SECTOR,
						BCH_DATA_sb, &bucket, &bucket_sectors, flags);
			if (ret)
				return ret;
		}

		ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
					offset + (1 << layout.sb_max_size_bits),
					BCH_DATA_sb, &bucket, &bucket_sectors, flags);
		if (ret)
			return ret;
	}

	if (bucket_sectors) {
		ret = bch2_trans_mark_metadata_bucket(trans, ca,
					bucket, BCH_DATA_sb, bucket_sectors, flags);
		if (ret)
			return ret;
	}

	for (i = 0; i < ca->journal.nr; i++) {
		ret = bch2_trans_mark_metadata_bucket(trans, ca,
					ca->journal.buckets[i],
					BCH_DATA_journal, ca->mi.bucket_size, flags);
		if (ret)
			return ret;
	}

	return 0;
}

int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
			   enum btree_iter_update_trigger_flags flags)
{
	int ret = bch2_trans_run(c,
			__bch2_trans_mark_dev_sb(trans, ca, flags));
	bch_err_fn(c, ret);
	return ret;
}

int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
				  enum btree_iter_update_trigger_flags flags)
{
	for_each_online_member(c, ca) {
		int ret = bch2_trans_mark_dev_sb(c, ca, flags);
		if (ret) {
			percpu_ref_put(&ca->io_ref);
			return ret;
		}
	}

	return 0;
}

int bch2_trans_mark_dev_sbs(struct bch_fs *c)
{
	return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
}

/* Disk reservations: */

#define SECTORS_CACHE	1024

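/*
 * Take a disk reservation: the fast path allocates from this CPU's cache,
 * refilled from c->sectors_available in batches of SECTORS_CACHE; the slow
 * path recomputes free space under sectors_available_lock.
 */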
int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
				u64 sectors, int flags)
{
	struct bch_fs_pcpu *pcpu;
	u64 old, get;
	s64 sectors_available;
	int ret;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	pcpu = this_cpu_ptr(c->pcpu);

	if (sectors <= pcpu->sectors_available)
		goto out;

	old = atomic64_read(&c->sectors_available);
	do {
		get = min((u64) sectors + SECTORS_CACHE, old);

		if (get < sectors) {
			preempt_enable();
			goto recalculate;
		}
	} while (!atomic64_try_cmpxchg(&c->sectors_available,
				       &old, old - get));

	pcpu->sectors_available		+= get;

out:
	pcpu->sectors_available		-= sectors;
	this_cpu_add(*c->online_reserved, sectors);
	res->sectors			+= sectors;

	preempt_enable();
	percpu_up_read(&c->mark_lock);
	return 0;

recalculate:
	mutex_lock(&c->sectors_available_lock);

	percpu_u64_set(&c->pcpu->sectors_available, 0);
	sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);

	if (sectors <= sectors_available ||
	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
		atomic64_set(&c->sectors_available,
			     max_t(s64, 0, sectors_available - sectors));
		this_cpu_add(*c->online_reserved, sectors);
		res->sectors			+= sectors;
		ret = 0;
	} else {
		atomic64_set(&c->sectors_available, sectors_available);
		ret = -BCH_ERR_ENOSPC_disk_reservation;
	}

	mutex_unlock(&c->sectors_available_lock);
	percpu_up_read(&c->mark_lock);

	return ret;
}

/* Startup/shutdown: */

void bch2_buckets_nouse_free(struct bch_fs *c)
{
	for_each_member_device(c, ca) {
		kvfree_rcu_mightsleep(ca->buckets_nouse);
		ca->buckets_nouse = NULL;
	}
}

int bch2_buckets_nouse_alloc(struct bch_fs *c)
{
	for_each_member_device(c, ca) {
		BUG_ON(ca->buckets_nouse);

		ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
					     sizeof(unsigned long),
					     GFP_KERNEL|__GFP_ZERO);
		if (!ca->buckets_nouse) {
			bch2_dev_put(ca);
			return -BCH_ERR_ENOMEM_buckets_nouse;
		}
	}

	return 0;
}

static void bucket_gens_free_rcu(struct rcu_head *rcu)
{
	struct bucket_gens *buckets =
		container_of(rcu, struct bucket_gens, rcu);

	kvfree(buckets);
}

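/*
 * Install a new bucket_gens array sized for @nbuckets, copying the old
 * generation numbers across (under bucket_lock and mark_lock) on resize;
 * the old array is freed after an RCU grace period.
 */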
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
	bool resize = ca->bucket_gens != NULL;
	int ret;

	BUG_ON(resize && ca->buckets_nouse);

	if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
				     GFP_KERNEL|__GFP_ZERO))) {
		ret = -BCH_ERR_ENOMEM_bucket_gens;
		goto err;
	}

	bucket_gens->first_bucket = ca->mi.first_bucket;
	bucket_gens->nbuckets	= nbuckets;
	bucket_gens->nbuckets_minus_first =
		bucket_gens->nbuckets - bucket_gens->first_bucket;

	if (resize) {
		down_write(&ca->bucket_lock);
		percpu_down_write(&c->mark_lock);
	}

	old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);

	if (resize) {
		size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);

		memcpy(bucket_gens->b,
		       old_bucket_gens->b,
		       n);
	}

	rcu_assign_pointer(ca->bucket_gens, bucket_gens);
	bucket_gens = old_bucket_gens;

	nbuckets = ca->mi.nbuckets;

	if (resize) {
		percpu_up_write(&c->mark_lock);
		up_write(&ca->bucket_lock);
	}

	ret = 0;
err:
	if (bucket_gens)
		call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);

	return ret;
}

void bch2_dev_buckets_free(struct bch_dev *ca)
{
	kvfree(ca->buckets_nouse);
	kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
	free_percpu(ca->usage);
}

int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	ca->usage = alloc_percpu(struct bch_dev_usage);
	if (!ca->usage)
		return -BCH_ERR_ENOMEM_usage_init;

	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}