1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_cache.h"
5 #include "btree_io.h"
6 #include "btree_journal_iter.h"
7 #include "btree_node_scan.h"
8 #include "btree_update_interior.h"
9 #include "buckets.h"
10 #include "error.h"
11 #include "journal_io.h"
12 #include "recovery_passes.h"
13
14 #include <linux/kthread.h>
15 #include <linux/sort.h>
16
/*
 * Per-device argument block handed to a read_btree_nodes_worker() kthread.
 * Allocated by read_btree_nodes(); the worker frees it when it is done.
 */
struct find_btree_nodes_worker {
	/* closure the spawner waits on; the worker drops its ref on exit */
	struct closure		*cl;
	/* shared scan state that results and errors are reported into */
	struct find_btree_nodes	*f;
	/* the device this worker scans */
	struct bch_dev		*ca;
};
22
found_btree_node_to_text(struct printbuf * out,struct bch_fs * c,const struct found_btree_node * n)23 static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
24 {
25 prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ",
26 bch2_btree_id_str(n->btree_id), n->level, n->seq,
27 n->journal_seq, n->cookie);
28 bch2_bpos_to_text(out, n->min_key);
29 prt_str(out, "-");
30 bch2_bpos_to_text(out, n->max_key);
31
32 if (n->range_updated)
33 prt_str(out, " range updated");
34 if (n->overwritten)
35 prt_str(out, " overwritten");
36
37 for (unsigned i = 0; i < n->nr_ptrs; i++) {
38 prt_char(out, ' ');
39 bch2_extent_ptr_to_text(out, c, n->ptrs + i);
40 }
41 }
42
/*
 * Print every node in @nodes to @out, one per line, indented by two
 * columns relative to the caller's current indentation.
 */
static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
{
	printbuf_indent_add(out, 2);

	darray_for_each(nodes, node) {
		found_btree_node_to_text(out, c, node);
		prt_newline(out);
	}

	printbuf_indent_sub(out, 2);
}
52
found_btree_node_to_key(struct bkey_i * k,const struct found_btree_node * f)53 static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
54 {
55 struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
56
57 set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
58 bp->k.p = f->max_key;
59 bp->v.seq = cpu_to_le64(f->cookie);
60 bp->v.sectors_written = 0;
61 bp->v.flags = 0;
62 bp->v.sectors_written = cpu_to_le16(f->sectors_written);
63 bp->v.min_key = f->min_key;
64 SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
65 memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
66 }
67
bkey_journal_seq(struct bkey_s_c k)68 static inline u64 bkey_journal_seq(struct bkey_s_c k)
69 {
70 switch (k.k->type) {
71 case KEY_TYPE_inode_v3:
72 return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
73 default:
74 return 0;
75 }
76 }
77
found_btree_node_is_readable(struct btree_trans * trans,struct found_btree_node * f)78 static bool found_btree_node_is_readable(struct btree_trans *trans,
79 struct found_btree_node *f)
80 {
81 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
82
83 found_btree_node_to_key(&tmp.k, f);
84
85 struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
86 bool ret = !IS_ERR_OR_NULL(b);
87 if (!ret)
88 return ret;
89
90 f->sectors_written = b->written;
91 f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
92
93 struct bkey_s_c k;
94 struct bkey unpacked;
95 struct btree_node_iter iter;
96 for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
97 f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
98
99 six_unlock_read(&b->c.lock);
100
101 /*
102 * We might update this node's range; if that happens, we need the node
103 * to be re-read so the read path can trim keys that are no longer in
104 * this node
105 */
106 if (b != btree_node_root(trans->c, b))
107 bch2_btree_node_evict(trans, &tmp.k);
108 return ret;
109 }
110
found_btree_node_cmp_cookie(const void * _l,const void * _r)111 static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
112 {
113 const struct found_btree_node *l = _l;
114 const struct found_btree_node *r = _r;
115
116 return cmp_int(l->btree_id, r->btree_id) ?:
117 cmp_int(l->level, r->level) ?:
118 cmp_int(l->cookie, r->cookie);
119 }
120
121 /*
122 * Given two found btree nodes, if their sequence numbers are equal, take the
123 * one that's readable:
124 */
found_btree_node_cmp_time(const struct found_btree_node * l,const struct found_btree_node * r)125 static int found_btree_node_cmp_time(const struct found_btree_node *l,
126 const struct found_btree_node *r)
127 {
128 return cmp_int(l->seq, r->seq) ?:
129 cmp_int(l->journal_seq, r->journal_seq);
130 }
131
found_btree_node_cmp_pos(const void * _l,const void * _r)132 static int found_btree_node_cmp_pos(const void *_l, const void *_r)
133 {
134 const struct found_btree_node *l = _l;
135 const struct found_btree_node *r = _r;
136
137 return cmp_int(l->btree_id, r->btree_id) ?:
138 -cmp_int(l->level, r->level) ?:
139 bpos_cmp(l->min_key, r->min_key) ?:
140 -found_btree_node_cmp_time(l, r);
141 }
142
/*
 * Read one page at sector @offset of device @ca into @bn and, if it looks
 * like the header of a btree node belonging to this filesystem, record it
 * in f->nodes.
 *
 * @bio and @bn are scratch objects owned by the caller and are reused
 * across calls. Errors are reported through f->ret; candidates that do not
 * pass the checks below are silently skipped.
 */
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
				struct bio *bio, struct btree_node *bn, u64 offset)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);

	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
	bio->bi_iter.bi_sector = offset;
	bch2_bio_map(bio, bn, PAGE_SIZE);

	submit_bio_wait(bio);
	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
			       "IO error in try_read_btree_node() at %llu: %s",
			       offset, bch2_blk_status_to_str(bio->bi_status)))
		return;

	/* not a btree node of this filesystem */
	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return;

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
		/* run the cipher over the header bytes from ->flags up to ->keys */
		unsigned hdr_bytes = (void *) &bn->keys - (void *) &bn->flags;
		struct nonce nonce = btree_nonce(&bn->keys, 0);

		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, hdr_bytes);
	}

	/* alloc btrees and nodes with an out-of-range level are not recorded */
	if (btree_id_is_alloc(BTREE_NODE_ID(bn)) ||
	    BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
		return;

	/* bucket_gen() is looked up under the RCU read lock */
	rcu_read_lock();
	struct found_btree_node n = {
		.btree_id	= BTREE_NODE_ID(bn),
		.level		= BTREE_NODE_LEVEL(bn),
		.seq		= BTREE_NODE_SEQ(bn),
		.cookie		= le64_to_cpu(bn->keys.seq),
		.min_key	= bn->min_key,
		.max_key	= bn->max_key,
		.nr_ptrs	= 1,
		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
		.ptrs[0].offset	= offset,
		.ptrs[0].dev	= ca->dev_idx,
		.ptrs[0].gen	= *bucket_gen(ca, sector_to_bucket(ca, offset)),
	};
	rcu_read_unlock();

	if (!bch2_trans_run(c, found_btree_node_is_readable(trans, &n)))
		return;

	/* f->nodes and f->ret are shared with the other workers */
	mutex_lock(&f->lock);
	if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
		bch_err(c, "try_read_btree_node() can't handle endian conversion");
		f->ret = -EINVAL;
	} else if (darray_push(&f->nodes, n)) {
		f->ret = -ENOMEM;
	}
	mutex_unlock(&f->lock);
}
204
read_btree_nodes_worker(void * p)205 static int read_btree_nodes_worker(void *p)
206 {
207 struct find_btree_nodes_worker *w = p;
208 struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
209 struct bch_dev *ca = w->ca;
210 void *buf = (void *) __get_free_page(GFP_KERNEL);
211 struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
212 unsigned long last_print = jiffies;
213
214 if (!buf || !bio) {
215 bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
216 w->f->ret = -ENOMEM;
217 goto err;
218 }
219
220 for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
221 for (unsigned bucket_offset = 0;
222 bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
223 bucket_offset += btree_sectors(c)) {
224 if (time_after(jiffies, last_print + HZ * 30)) {
225 u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
226 u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
227
228 bch_info(ca, "%s: %2u%% done", __func__,
229 (unsigned) div64_u64(cur_sector * 100, end_sector));
230 last_print = jiffies;
231 }
232
233 u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
234
235 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
236 !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
237 continue;
238
239 try_read_btree_node(w->f, ca, bio, buf, sector);
240 }
241 err:
242 bio_put(bio);
243 free_page((unsigned long) buf);
244 percpu_ref_get(&ca->io_ref);
245 closure_put(w->cl);
246 kfree(w);
247 return 0;
248 }
249
read_btree_nodes(struct find_btree_nodes * f)250 static int read_btree_nodes(struct find_btree_nodes *f)
251 {
252 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
253 struct closure cl;
254 int ret = 0;
255
256 closure_init_stack(&cl);
257
258 for_each_online_member(c, ca) {
259 if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
260 continue;
261
262 struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
263 struct task_struct *t;
264
265 if (!w) {
266 percpu_ref_put(&ca->io_ref);
267 ret = -ENOMEM;
268 goto err;
269 }
270
271 percpu_ref_get(&ca->io_ref);
272 closure_get(&cl);
273 w->cl = &cl;
274 w->f = f;
275 w->ca = ca;
276
277 t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
278 ret = PTR_ERR_OR_ZERO(t);
279 if (ret) {
280 percpu_ref_put(&ca->io_ref);
281 closure_put(&cl);
282 f->ret = ret;
283 bch_err(c, "error starting kthread: %i", ret);
284 break;
285 }
286 }
287 err:
288 closure_sync(&cl);
289 return f->ret ?: ret;
290 }
291
bubble_up(struct found_btree_node * n,struct found_btree_node * end)292 static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
293 {
294 while (n + 1 < end &&
295 found_btree_node_cmp_pos(n, n + 1) > 0) {
296 swap(n[0], n[1]);
297 n++;
298 }
299 }
300
handle_overwrites(struct bch_fs * c,struct found_btree_node * start,struct found_btree_node * end)301 static int handle_overwrites(struct bch_fs *c,
302 struct found_btree_node *start,
303 struct found_btree_node *end)
304 {
305 struct found_btree_node *n;
306 again:
307 for (n = start + 1;
308 n < end &&
309 n->btree_id == start->btree_id &&
310 n->level == start->level &&
311 bpos_lt(n->min_key, start->max_key);
312 n++) {
313 int cmp = found_btree_node_cmp_time(start, n);
314
315 if (cmp > 0) {
316 if (bpos_cmp(start->max_key, n->max_key) >= 0)
317 n->overwritten = true;
318 else {
319 n->range_updated = true;
320 n->min_key = bpos_successor(start->max_key);
321 n->range_updated = true;
322 bubble_up(n, end);
323 goto again;
324 }
325 } else if (cmp < 0) {
326 BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
327
328 start->max_key = bpos_predecessor(n->min_key);
329 start->range_updated = true;
330 } else if (n->level) {
331 n->overwritten = true;
332 } else {
333 if (bpos_cmp(start->max_key, n->max_key) >= 0)
334 n->overwritten = true;
335 else {
336 n->range_updated = true;
337 n->min_key = bpos_successor(start->max_key);
338 n->range_updated = true;
339 bubble_up(n, end);
340 goto again;
341 }
342 }
343 }
344
345 return 0;
346 }
347
bch2_scan_for_btree_nodes(struct bch_fs * c)348 int bch2_scan_for_btree_nodes(struct bch_fs *c)
349 {
350 struct find_btree_nodes *f = &c->found_btree_nodes;
351 struct printbuf buf = PRINTBUF;
352 size_t dst;
353 int ret = 0;
354
355 if (f->nodes.nr)
356 return 0;
357
358 mutex_init(&f->lock);
359
360 ret = read_btree_nodes(f);
361 if (ret)
362 return ret;
363
364 if (!f->nodes.nr) {
365 bch_err(c, "%s: no btree nodes found", __func__);
366 ret = -EINVAL;
367 goto err;
368 }
369
370 if (0 && c->opts.verbose) {
371 printbuf_reset(&buf);
372 prt_printf(&buf, "%s: nodes found:\n", __func__);
373 found_btree_nodes_to_text(&buf, c, f->nodes);
374 bch2_print_string_as_lines(KERN_INFO, buf.buf);
375 }
376
377 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
378
379 dst = 0;
380 darray_for_each(f->nodes, i) {
381 struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
382
383 if (prev &&
384 prev->cookie == i->cookie) {
385 if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
386 bch_err(c, "%s: found too many replicas for btree node", __func__);
387 ret = -EINVAL;
388 goto err;
389 }
390 prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
391 } else {
392 f->nodes.data[dst++] = *i;
393 }
394 }
395 f->nodes.nr = dst;
396
397 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
398
399 if (0 && c->opts.verbose) {
400 printbuf_reset(&buf);
401 prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
402 found_btree_nodes_to_text(&buf, c, f->nodes);
403 bch2_print_string_as_lines(KERN_INFO, buf.buf);
404 }
405
406 dst = 0;
407 darray_for_each(f->nodes, i) {
408 if (i->overwritten)
409 continue;
410
411 ret = handle_overwrites(c, i, &darray_top(f->nodes));
412 if (ret)
413 goto err;
414
415 BUG_ON(i->overwritten);
416 f->nodes.data[dst++] = *i;
417 }
418 f->nodes.nr = dst;
419
420 if (c->opts.verbose) {
421 printbuf_reset(&buf);
422 prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
423 found_btree_nodes_to_text(&buf, c, f->nodes);
424 bch2_print_string_as_lines(KERN_INFO, buf.buf);
425 }
426
427 eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
428 err:
429 printbuf_exit(&buf);
430 return ret;
431 }
432
found_btree_node_range_start_cmp(const void * _l,const void * _r)433 static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
434 {
435 const struct found_btree_node *l = _l;
436 const struct found_btree_node *r = _r;
437
438 return cmp_int(l->btree_id, r->btree_id) ?:
439 -cmp_int(l->level, r->level) ?:
440 bpos_cmp(l->max_key, r->min_key);
441 }
442
/*
 * Iterate, in eytzinger order, over the entries of (_f)->nodes starting at
 * the one eytzinger0_find_gt() returns for _search, for as long as they are
 * in the same btree and level as _search and start before _search.max_key.
 *
 * @_f:		pointer to the struct find_btree_nodes to search
 * @_search:	struct found_btree_node lvalue holding btree_id, level,
 *		min_key and max_key of the range
 * @_idx:	name of the size_t index variable declared by the loop
 */
#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,	\
					sizeof((_f)->nodes.data[0]),			\
					found_btree_node_range_start_cmp, &(_search));	\
	     _idx < (_f)->nodes.nr &&							\
	     (_f)->nodes.data[_idx].btree_id == (_search).btree_id &&			\
	     (_f)->nodes.data[_idx].level == (_search).level &&				\
	     bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key);		\
	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
452
bch2_btree_node_is_stale(struct bch_fs * c,struct btree * b)453 bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
454 {
455 struct find_btree_nodes *f = &c->found_btree_nodes;
456
457 struct found_btree_node search = {
458 .btree_id = b->c.btree_id,
459 .level = b->c.level,
460 .min_key = b->data->min_key,
461 .max_key = b->key.k.p,
462 };
463
464 for_each_found_btree_node_in_range(f, search, idx)
465 if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
466 return true;
467 return false;
468 }
469
bch2_btree_has_scanned_nodes(struct bch_fs * c,enum btree_id btree)470 bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
471 {
472 struct found_btree_node search = {
473 .btree_id = btree,
474 .level = 0,
475 .min_key = POS_MIN,
476 .max_key = SPOS_MAX,
477 };
478
479 for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
480 return true;
481 return false;
482 }
483
bch2_get_scanned_nodes(struct bch_fs * c,enum btree_id btree,unsigned level,struct bpos node_min,struct bpos node_max)484 int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
485 unsigned level, struct bpos node_min, struct bpos node_max)
486 {
487 if (btree_id_is_alloc(btree))
488 return 0;
489
490 struct find_btree_nodes *f = &c->found_btree_nodes;
491
492 int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
493 if (ret)
494 return ret;
495
496 if (c->opts.verbose) {
497 struct printbuf buf = PRINTBUF;
498
499 prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
500 bch2_bpos_to_text(&buf, node_min);
501 prt_str(&buf, " - ");
502 bch2_bpos_to_text(&buf, node_max);
503
504 bch_info(c, "%s(): %s", __func__, buf.buf);
505 printbuf_exit(&buf);
506 }
507
508 struct found_btree_node search = {
509 .btree_id = btree,
510 .level = level,
511 .min_key = node_min,
512 .max_key = node_max,
513 };
514
515 for_each_found_btree_node_in_range(f, search, idx) {
516 struct found_btree_node n = f->nodes.data[idx];
517
518 n.range_updated |= bpos_lt(n.min_key, node_min);
519 n.min_key = bpos_max(n.min_key, node_min);
520
521 n.range_updated |= bpos_gt(n.max_key, node_max);
522 n.max_key = bpos_min(n.max_key, node_max);
523
524 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
525
526 found_btree_node_to_key(&tmp.k, &n);
527
528 struct printbuf buf = PRINTBUF;
529 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
530 bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
531 printbuf_exit(&buf);
532
533 BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0));
534
535 ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
536 if (ret)
537 return ret;
538 }
539
540 return 0;
541 }
542
bch2_find_btree_nodes_exit(struct find_btree_nodes * f)543 void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
544 {
545 darray_exit(&f->nodes);
546 }
547