xref: /linux/fs/bcachefs/btree_node_scan.c (revision 7954c92ede882b0dfd52a5db90291a4151b44c1a)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "btree_cache.h"
5 #include "btree_io.h"
6 #include "btree_journal_iter.h"
7 #include "btree_node_scan.h"
8 #include "btree_update_interior.h"
9 #include "buckets.h"
10 #include "error.h"
11 #include "journal_io.h"
12 #include "recovery_passes.h"
13 
14 #include <linux/kthread.h>
15 #include <linux/sort.h>
16 
/*
 * Per-device arguments for one read_btree_nodes_worker() kthread.
 *
 * Allocated by read_btree_nodes() and freed by the worker itself once it has
 * finished scanning.
 */
struct find_btree_nodes_worker {
	/* closure the spawner syncs on; the worker drops its ref when done */
	struct closure		*cl;
	/* shared scan state: found nodes and the first error are stored here */
	struct find_btree_nodes	*f;
	/* device this worker scans */
	struct bch_dev		*ca;
};
22 
23 static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
24 {
25 	prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
26 	bch2_bpos_to_text(out, n->min_key);
27 	prt_str(out, "-");
28 	bch2_bpos_to_text(out, n->max_key);
29 
30 	if (n->range_updated)
31 		prt_str(out, " range updated");
32 	if (n->overwritten)
33 		prt_str(out, " overwritten");
34 
35 	for (unsigned i = 0; i < n->nr_ptrs; i++) {
36 		prt_char(out, ' ');
37 		bch2_extent_ptr_to_text(out, c, n->ptrs + i);
38 	}
39 }
40 
41 static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
42 {
43 	printbuf_indent_add(out, 2);
44 	darray_for_each(nodes, i) {
45 		found_btree_node_to_text(out, c, i);
46 		prt_newline(out);
47 	}
48 	printbuf_indent_sub(out, 2);
49 }
50 
51 static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
52 {
53 	struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
54 
55 	set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
56 	bp->k.p			= f->max_key;
57 	bp->v.seq		= cpu_to_le64(f->cookie);
58 	bp->v.sectors_written	= 0;
59 	bp->v.flags		= 0;
60 	bp->v.min_key		= f->min_key;
61 	SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
62 	memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
63 }
64 
65 static bool found_btree_node_is_readable(struct btree_trans *trans,
66 					 const struct found_btree_node *f)
67 {
68 	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
69 
70 	found_btree_node_to_key(&k.k, f);
71 
72 	struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
73 	bool ret = !IS_ERR_OR_NULL(b);
74 	if (ret)
75 		six_unlock_read(&b->c.lock);
76 
77 	/*
78 	 * We might update this node's range; if that happens, we need the node
79 	 * to be re-read so the read path can trim keys that are no longer in
80 	 * this node
81 	 */
82 	if (b != btree_node_root(trans->c, b))
83 		bch2_btree_node_evict(trans, &k.k);
84 	return ret;
85 }
86 
87 static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
88 {
89 	const struct found_btree_node *l = _l;
90 	const struct found_btree_node *r = _r;
91 
92 	return  cmp_int(l->btree_id,	r->btree_id) ?:
93 		cmp_int(l->level,	r->level) ?:
94 		cmp_int(l->cookie,	r->cookie);
95 }
96 
97 /*
98  * Given two found btree nodes, if their sequence numbers are equal, take the
99  * one that's readable:
100  */
101 static int found_btree_node_cmp_time(const struct found_btree_node *l,
102 				     const struct found_btree_node *r)
103 {
104 	return cmp_int(l->seq, r->seq);
105 }
106 
107 static int found_btree_node_cmp_pos(const void *_l, const void *_r)
108 {
109 	const struct found_btree_node *l = _l;
110 	const struct found_btree_node *r = _r;
111 
112 	return  cmp_int(l->btree_id,	r->btree_id) ?:
113 	       -cmp_int(l->level,	r->level) ?:
114 		bpos_cmp(l->min_key,	r->min_key) ?:
115 	       -found_btree_node_cmp_time(l, r);
116 }
117 
118 static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
119 				struct bio *bio, struct btree_node *bn, u64 offset)
120 {
121 	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
122 
123 	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
124 	bio->bi_iter.bi_sector	= offset;
125 	bch2_bio_map(bio, bn, PAGE_SIZE);
126 
127 	submit_bio_wait(bio);
128 	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
129 			       "IO error in try_read_btree_node() at %llu: %s",
130 			       offset, bch2_blk_status_to_str(bio->bi_status)))
131 		return;
132 
133 	if (le64_to_cpu(bn->magic) != bset_magic(c))
134 		return;
135 
136 	if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
137 		return;
138 
139 	rcu_read_lock();
140 	struct found_btree_node n = {
141 		.btree_id	= BTREE_NODE_ID(bn),
142 		.level		= BTREE_NODE_LEVEL(bn),
143 		.seq		= BTREE_NODE_SEQ(bn),
144 		.cookie		= le64_to_cpu(bn->keys.seq),
145 		.min_key	= bn->min_key,
146 		.max_key	= bn->max_key,
147 		.nr_ptrs	= 1,
148 		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
149 		.ptrs[0].offset	= offset,
150 		.ptrs[0].dev	= ca->dev_idx,
151 		.ptrs[0].gen	= *bucket_gen(ca, sector_to_bucket(ca, offset)),
152 	};
153 	rcu_read_unlock();
154 
155 	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
156 		mutex_lock(&f->lock);
157 		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
158 			bch_err(c, "try_read_btree_node() can't handle endian conversion");
159 			f->ret = -EINVAL;
160 			goto unlock;
161 		}
162 
163 		if (darray_push(&f->nodes, n))
164 			f->ret = -ENOMEM;
165 unlock:
166 		mutex_unlock(&f->lock);
167 	}
168 }
169 
170 static int read_btree_nodes_worker(void *p)
171 {
172 	struct find_btree_nodes_worker *w = p;
173 	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
174 	struct bch_dev *ca = w->ca;
175 	void *buf = (void *) __get_free_page(GFP_KERNEL);
176 	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
177 	unsigned long last_print = jiffies;
178 
179 	if (!buf || !bio) {
180 		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
181 		w->f->ret = -ENOMEM;
182 		goto err;
183 	}
184 
185 	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
186 		for (unsigned bucket_offset = 0;
187 		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
188 		     bucket_offset += btree_sectors(c)) {
189 			if (time_after(jiffies, last_print + HZ * 30)) {
190 				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
191 				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
192 
193 				bch_info(ca, "%s: %2u%% done", __func__,
194 					 (unsigned) div64_u64(cur_sector * 100, end_sector));
195 				last_print = jiffies;
196 			}
197 
198 			try_read_btree_node(w->f, ca, bio, buf,
199 					    bucket * ca->mi.bucket_size + bucket_offset);
200 		}
201 err:
202 	bio_put(bio);
203 	free_page((unsigned long) buf);
204 	percpu_ref_get(&ca->io_ref);
205 	closure_put(w->cl);
206 	kfree(w);
207 	return 0;
208 }
209 
210 static int read_btree_nodes(struct find_btree_nodes *f)
211 {
212 	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
213 	struct closure cl;
214 	int ret = 0;
215 
216 	closure_init_stack(&cl);
217 
218 	for_each_online_member(c, ca) {
219 		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
220 			continue;
221 
222 		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
223 		struct task_struct *t;
224 
225 		if (!w) {
226 			percpu_ref_put(&ca->io_ref);
227 			ret = -ENOMEM;
228 			goto err;
229 		}
230 
231 		percpu_ref_get(&ca->io_ref);
232 		closure_get(&cl);
233 		w->cl		= &cl;
234 		w->f		= f;
235 		w->ca		= ca;
236 
237 		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
238 		ret = IS_ERR_OR_NULL(t);
239 		if (ret) {
240 			percpu_ref_put(&ca->io_ref);
241 			closure_put(&cl);
242 			f->ret = ret;
243 			bch_err(c, "error starting kthread: %i", ret);
244 			break;
245 		}
246 	}
247 err:
248 	closure_sync(&cl);
249 	return f->ret ?: ret;
250 }
251 
252 static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
253 {
254 	while (n + 1 < end &&
255 	       found_btree_node_cmp_pos(n, n + 1) > 0) {
256 		swap(n[0], n[1]);
257 		n++;
258 	}
259 }
260 
261 static int handle_overwrites(struct bch_fs *c,
262 			     struct found_btree_node *start,
263 			     struct found_btree_node *end)
264 {
265 	struct found_btree_node *n;
266 again:
267 	for (n = start + 1;
268 	     n < end &&
269 	     n->btree_id	== start->btree_id &&
270 	     n->level		== start->level &&
271 	     bpos_lt(n->min_key, start->max_key);
272 	     n++)  {
273 		int cmp = found_btree_node_cmp_time(start, n);
274 
275 		if (cmp > 0) {
276 			if (bpos_cmp(start->max_key, n->max_key) >= 0)
277 				n->overwritten = true;
278 			else {
279 				n->range_updated = true;
280 				n->min_key = bpos_successor(start->max_key);
281 				n->range_updated = true;
282 				bubble_up(n, end);
283 				goto again;
284 			}
285 		} else if (cmp < 0) {
286 			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
287 
288 			start->max_key = bpos_predecessor(n->min_key);
289 			start->range_updated = true;
290 		} else {
291 			struct printbuf buf = PRINTBUF;
292 
293 			prt_str(&buf, "overlapping btree nodes with same seq! halting\n  ");
294 			found_btree_node_to_text(&buf, c, start);
295 			prt_str(&buf, "\n  ");
296 			found_btree_node_to_text(&buf, c, n);
297 			bch_err(c, "%s", buf.buf);
298 			printbuf_exit(&buf);
299 			return -BCH_ERR_fsck_repair_unimplemented;
300 		}
301 	}
302 
303 	return 0;
304 }
305 
306 int bch2_scan_for_btree_nodes(struct bch_fs *c)
307 {
308 	struct find_btree_nodes *f = &c->found_btree_nodes;
309 	struct printbuf buf = PRINTBUF;
310 	size_t dst;
311 	int ret = 0;
312 
313 	if (f->nodes.nr)
314 		return 0;
315 
316 	mutex_init(&f->lock);
317 
318 	ret = read_btree_nodes(f);
319 	if (ret)
320 		return ret;
321 
322 	if (!f->nodes.nr) {
323 		bch_err(c, "%s: no btree nodes found", __func__);
324 		ret = -EINVAL;
325 		goto err;
326 	}
327 
328 	if (0 && c->opts.verbose) {
329 		printbuf_reset(&buf);
330 		prt_printf(&buf, "%s: nodes found:\n", __func__);
331 		found_btree_nodes_to_text(&buf, c, f->nodes);
332 		bch2_print_string_as_lines(KERN_INFO, buf.buf);
333 	}
334 
335 	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
336 
337 	dst = 0;
338 	darray_for_each(f->nodes, i) {
339 		struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
340 
341 		if (prev &&
342 		    prev->cookie == i->cookie) {
343 			if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
344 				bch_err(c, "%s: found too many replicas for btree node", __func__);
345 				ret = -EINVAL;
346 				goto err;
347 			}
348 			prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
349 		} else {
350 			f->nodes.data[dst++] = *i;
351 		}
352 	}
353 	f->nodes.nr = dst;
354 
355 	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
356 
357 	if (0 && c->opts.verbose) {
358 		printbuf_reset(&buf);
359 		prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
360 		found_btree_nodes_to_text(&buf, c, f->nodes);
361 		bch2_print_string_as_lines(KERN_INFO, buf.buf);
362 	}
363 
364 	dst = 0;
365 	darray_for_each(f->nodes, i) {
366 		if (i->overwritten)
367 			continue;
368 
369 		ret = handle_overwrites(c, i, &darray_top(f->nodes));
370 		if (ret)
371 			goto err;
372 
373 		BUG_ON(i->overwritten);
374 		f->nodes.data[dst++] = *i;
375 	}
376 	f->nodes.nr = dst;
377 
378 	if (c->opts.verbose) {
379 		printbuf_reset(&buf);
380 		prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
381 		found_btree_nodes_to_text(&buf, c, f->nodes);
382 		bch2_print_string_as_lines(KERN_INFO, buf.buf);
383 	}
384 
385 	eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
386 err:
387 	printbuf_exit(&buf);
388 	return ret;
389 }
390 
391 static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
392 {
393 	const struct found_btree_node *l = _l;
394 	const struct found_btree_node *r = _r;
395 
396 	return  cmp_int(l->btree_id,	r->btree_id) ?:
397 	       -cmp_int(l->level,	r->level) ?:
398 		bpos_cmp(l->max_key,	r->min_key);
399 }
400 
/*
 * Iterate over the entries of (_f)->nodes that fall in the range described by
 * _search (a struct found_btree_node lvalue with btree_id, level, min_key and
 * max_key set).
 *
 * _idx is declared by the macro and holds the index of the current entry in
 * (_f)->nodes.data. Iteration starts at the entry eytzinger0_find_gt()
 * returns for _search under found_btree_node_range_start_cmp() and goes on,
 * in eytzinger order, while entries have the same btree and level and a
 * min_key below _search.max_key.
 *
 * _f and _search are evaluated more than once.
 */
#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,		\
					sizeof((_f)->nodes.data[0]),			\
					found_btree_node_range_start_cmp, &(_search));	\
	     _idx < (_f)->nodes.nr &&							\
	     (_f)->nodes.data[_idx].btree_id == (_search).btree_id &&			\
	     (_f)->nodes.data[_idx].level == (_search).level &&				\
	     bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key);		\
	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
410 
411 bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
412 {
413 	struct find_btree_nodes *f = &c->found_btree_nodes;
414 
415 	struct found_btree_node search = {
416 		.btree_id	= b->c.btree_id,
417 		.level		= b->c.level,
418 		.min_key	= b->data->min_key,
419 		.max_key	= b->key.k.p,
420 	};
421 
422 	for_each_found_btree_node_in_range(f, search, idx)
423 		if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
424 			return true;
425 	return false;
426 }
427 
428 bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
429 {
430 	struct found_btree_node search = {
431 		.btree_id	= btree,
432 		.level		= 0,
433 		.min_key	= POS_MIN,
434 		.max_key	= SPOS_MAX,
435 	};
436 
437 	for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
438 		return true;
439 	return false;
440 }
441 
442 int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
443 			   unsigned level, struct bpos node_min, struct bpos node_max)
444 {
445 	if (btree_id_is_alloc(btree))
446 		return 0;
447 
448 	struct find_btree_nodes *f = &c->found_btree_nodes;
449 
450 	int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
451 	if (ret)
452 		return ret;
453 
454 	if (c->opts.verbose) {
455 		struct printbuf buf = PRINTBUF;
456 
457 		prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
458 		bch2_bpos_to_text(&buf, node_min);
459 		prt_str(&buf, " - ");
460 		bch2_bpos_to_text(&buf, node_max);
461 
462 		bch_info(c, "%s(): %s", __func__, buf.buf);
463 		printbuf_exit(&buf);
464 	}
465 
466 	struct found_btree_node search = {
467 		.btree_id	= btree,
468 		.level		= level,
469 		.min_key	= node_min,
470 		.max_key	= node_max,
471 	};
472 
473 	for_each_found_btree_node_in_range(f, search, idx) {
474 		struct found_btree_node n = f->nodes.data[idx];
475 
476 		n.range_updated |= bpos_lt(n.min_key, node_min);
477 		n.min_key = bpos_max(n.min_key, node_min);
478 
479 		n.range_updated |= bpos_gt(n.max_key, node_max);
480 		n.max_key = bpos_min(n.max_key, node_max);
481 
482 		struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
483 
484 		found_btree_node_to_key(&tmp.k, &n);
485 
486 		struct printbuf buf = PRINTBUF;
487 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
488 		bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
489 		printbuf_exit(&buf);
490 
491 		BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
492 
493 		ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
494 		if (ret)
495 			return ret;
496 	}
497 
498 	return 0;
499 }
500 
501 void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
502 {
503 	darray_exit(&f->nodes);
504 }
505