xref: /linux/fs/bcachefs/btree_journal_iter.c (revision 8e1bb4a41aa78d6105e59186af3dcd545fc66e70)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "bset.h"
6 #include "btree_cache.h"
7 #include "btree_journal_iter.h"
8 #include "journal_io.h"
9 
10 #include <linux/sort.h>
11 
12 /*
13  * For managing keys we read from the journal: until journal replay works normal
14  * btree lookups need to be able to find and return keys from the journal where
15  * they overwrite what's in the btree, so we have a special iterator and
16  * operations for the regular btree iter code to use:
17  */
18 
19 static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
20 {
21 	size_t gap_size = keys->size - keys->nr;
22 
23 	if (idx >= keys->gap)
24 		idx += gap_size;
25 	return idx;
26 }
27 
28 static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
29 {
30 	return keys->data + idx_to_pos(keys, idx);
31 }
32 
33 static size_t __bch2_journal_key_search(struct journal_keys *keys,
34 					enum btree_id id, unsigned level,
35 					struct bpos pos)
36 {
37 	size_t l = 0, r = keys->nr, m;
38 
39 	while (l < r) {
40 		m = l + ((r - l) >> 1);
41 		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
42 			l = m + 1;
43 		else
44 			r = m;
45 	}
46 
47 	BUG_ON(l < keys->nr &&
48 	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
49 
50 	BUG_ON(l &&
51 	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
52 
53 	return l;
54 }
55 
56 static size_t bch2_journal_key_search(struct journal_keys *keys,
57 				      enum btree_id id, unsigned level,
58 				      struct bpos pos)
59 {
60 	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
61 }
62 
63 /* Returns first non-overwritten key >= search key: */
64 struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
65 					   unsigned level, struct bpos pos,
66 					   struct bpos end_pos, size_t *idx)
67 {
68 	struct journal_keys *keys = &c->journal_keys;
69 	unsigned iters = 0;
70 	struct journal_key *k;
71 
72 	BUG_ON(*idx > keys->nr);
73 search:
74 	if (!*idx)
75 		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
76 
77 	while (*idx &&
78 	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
79 		--(*idx);
80 		iters++;
81 		if (iters == 10) {
82 			*idx = 0;
83 			goto search;
84 		}
85 	}
86 
87 	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
88 		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
89 			return NULL;
90 
91 		if (k->overwritten) {
92 			(*idx)++;
93 			continue;
94 		}
95 
96 		if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
97 			return k->k;
98 
99 		(*idx)++;
100 		iters++;
101 		if (iters == 10) {
102 			*idx = 0;
103 			goto search;
104 		}
105 	}
106 
107 	return NULL;
108 }
109 
110 struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
111 					   unsigned level, struct bpos pos)
112 {
113 	size_t idx = 0;
114 
115 	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
116 }
117 
118 static void journal_iter_verify(struct journal_iter *iter)
119 {
120 	struct journal_keys *keys = iter->keys;
121 	size_t gap_size = keys->size - keys->nr;
122 
123 	BUG_ON(iter->idx >= keys->gap &&
124 	       iter->idx <  keys->gap + gap_size);
125 
126 	if (iter->idx < keys->size) {
127 		struct journal_key *k = keys->data + iter->idx;
128 
129 		int cmp = cmp_int(k->btree_id,	iter->btree_id) ?:
130 			  cmp_int(k->level,	iter->level);
131 		BUG_ON(cmp < 0);
132 	}
133 }
134 
135 static void journal_iters_fix(struct bch_fs *c)
136 {
137 	struct journal_keys *keys = &c->journal_keys;
138 	/* The key we just inserted is immediately before the gap: */
139 	size_t gap_end = keys->gap + (keys->size - keys->nr);
140 	struct journal_key *new_key = &keys->data[keys->gap - 1];
141 	struct journal_iter *iter;
142 
143 	/*
144 	 * If an iterator points one after the key we just inserted, decrement
145 	 * the iterator so it points at the key we just inserted - if the
146 	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
147 	 * handle that:
148 	 */
149 	list_for_each_entry(iter, &c->journal_iters, list) {
150 		journal_iter_verify(iter);
151 		if (iter->idx		== gap_end &&
152 		    new_key->btree_id	== iter->btree_id &&
153 		    new_key->level	== iter->level)
154 			iter->idx = keys->gap - 1;
155 		journal_iter_verify(iter);
156 	}
157 }
158 
159 static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
160 {
161 	struct journal_keys *keys = &c->journal_keys;
162 	struct journal_iter *iter;
163 	size_t gap_size = keys->size - keys->nr;
164 
165 	list_for_each_entry(iter, &c->journal_iters, list) {
166 		if (iter->idx > old_gap)
167 			iter->idx -= gap_size;
168 		if (iter->idx >= new_gap)
169 			iter->idx += gap_size;
170 	}
171 }
172 
173 int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
174 				 unsigned level, struct bkey_i *k)
175 {
176 	struct journal_key n = {
177 		.btree_id	= id,
178 		.level		= level,
179 		.k		= k,
180 		.allocated	= true,
181 		/*
182 		 * Ensure these keys are done last by journal replay, to unblock
183 		 * journal reclaim:
184 		 */
185 		.journal_seq	= U32_MAX,
186 	};
187 	struct journal_keys *keys = &c->journal_keys;
188 	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
189 
190 	BUG_ON(test_bit(BCH_FS_rw, &c->flags));
191 
192 	if (idx < keys->size &&
193 	    journal_key_cmp(&n, &keys->data[idx]) == 0) {
194 		if (keys->data[idx].allocated)
195 			kfree(keys->data[idx].k);
196 		keys->data[idx] = n;
197 		return 0;
198 	}
199 
200 	if (idx > keys->gap)
201 		idx -= keys->size - keys->nr;
202 
203 	size_t old_gap = keys->gap;
204 
205 	if (keys->nr == keys->size) {
206 		journal_iters_move_gap(c, old_gap, keys->size);
207 		old_gap = keys->size;
208 
209 		struct journal_keys new_keys = {
210 			.nr			= keys->nr,
211 			.size			= max_t(size_t, keys->size, 8) * 2,
212 		};
213 
214 		new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
215 		if (!new_keys.data) {
216 			bch_err(c, "%s: error allocating new key array (size %zu)",
217 				__func__, new_keys.size);
218 			return -BCH_ERR_ENOMEM_journal_key_insert;
219 		}
220 
221 		/* Since @keys was full, there was no gap: */
222 		memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
223 		kvfree(keys->data);
224 		keys->data	= new_keys.data;
225 		keys->nr	= new_keys.nr;
226 		keys->size	= new_keys.size;
227 
228 		/* And now the gap is at the end: */
229 		keys->gap	= keys->nr;
230 	}
231 
232 	journal_iters_move_gap(c, old_gap, idx);
233 
234 	move_gap(keys, idx);
235 
236 	keys->nr++;
237 	keys->data[keys->gap++] = n;
238 
239 	journal_iters_fix(c);
240 
241 	return 0;
242 }
243 
244 /*
245  * Can only be used from the recovery thread while we're still RO - can't be
246  * used once we've got RW, as journal_keys is at that point used by multiple
247  * threads:
248  */
249 int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
250 			    unsigned level, struct bkey_i *k)
251 {
252 	struct bkey_i *n;
253 	int ret;
254 
255 	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
256 	if (!n)
257 		return -BCH_ERR_ENOMEM_journal_key_insert;
258 
259 	bkey_copy(n, k);
260 	ret = bch2_journal_key_insert_take(c, id, level, n);
261 	if (ret)
262 		kfree(n);
263 	return ret;
264 }
265 
266 int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
267 			    unsigned level, struct bpos pos)
268 {
269 	struct bkey_i whiteout;
270 
271 	bkey_init(&whiteout.k);
272 	whiteout.k.p = pos;
273 
274 	return bch2_journal_key_insert(c, id, level, &whiteout);
275 }
276 
277 bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
278 				 unsigned level, struct bpos pos)
279 {
280 	struct journal_keys *keys = &trans->c->journal_keys;
281 	size_t idx = bch2_journal_key_search(keys, btree, level, pos);
282 
283 	if (!trans->journal_replay_not_finished)
284 		return false;
285 
286 	return (idx < keys->size &&
287 		keys->data[idx].btree_id	== btree &&
288 		keys->data[idx].level		== level &&
289 		bpos_eq(keys->data[idx].k->k.p, pos) &&
290 		bkey_deleted(&keys->data[idx].k->k));
291 }
292 
293 void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
294 				  unsigned level, struct bpos pos)
295 {
296 	struct journal_keys *keys = &c->journal_keys;
297 	size_t idx = bch2_journal_key_search(keys, btree, level, pos);
298 
299 	if (idx < keys->size &&
300 	    keys->data[idx].btree_id	== btree &&
301 	    keys->data[idx].level	== level &&
302 	    bpos_eq(keys->data[idx].k->k.p, pos))
303 		keys->data[idx].overwritten = true;
304 }
305 
306 static void bch2_journal_iter_advance(struct journal_iter *iter)
307 {
308 	if (iter->idx < iter->keys->size) {
309 		iter->idx++;
310 		if (iter->idx == iter->keys->gap)
311 			iter->idx += iter->keys->size - iter->keys->nr;
312 	}
313 }
314 
315 static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
316 {
317 	journal_iter_verify(iter);
318 
319 	while (iter->idx < iter->keys->size) {
320 		struct journal_key *k = iter->keys->data + iter->idx;
321 
322 		int cmp = cmp_int(k->btree_id,	iter->btree_id) ?:
323 			  cmp_int(k->level,	iter->level);
324 		if (cmp > 0)
325 			break;
326 		BUG_ON(cmp);
327 
328 		if (!k->overwritten)
329 			return bkey_i_to_s_c(k->k);
330 
331 		bch2_journal_iter_advance(iter);
332 	}
333 
334 	return bkey_s_c_null;
335 }
336 
337 static void bch2_journal_iter_exit(struct journal_iter *iter)
338 {
339 	list_del(&iter->list);
340 }
341 
342 static void bch2_journal_iter_init(struct bch_fs *c,
343 				   struct journal_iter *iter,
344 				   enum btree_id id, unsigned level,
345 				   struct bpos pos)
346 {
347 	iter->btree_id	= id;
348 	iter->level	= level;
349 	iter->keys	= &c->journal_keys;
350 	iter->idx	= bch2_journal_key_search(&c->journal_keys, id, level, pos);
351 
352 	journal_iter_verify(iter);
353 }
354 
355 static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
356 {
357 	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
358 						iter->b, &iter->unpacked);
359 }
360 
361 static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
362 {
363 	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
364 }
365 
366 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
367 {
368 	if (bpos_eq(iter->pos, SPOS_MAX))
369 		iter->at_end = true;
370 	else
371 		iter->pos = bpos_successor(iter->pos);
372 }
373 
374 static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
375 {
376 	struct btree_and_journal_iter iter = *_iter;
377 	struct bch_fs *c = iter.trans->c;
378 	unsigned level = iter.journal.level;
379 	struct bkey_buf tmp;
380 	unsigned nr = test_bit(BCH_FS_started, &c->flags)
381 		? (level > 1 ? 0 :  2)
382 		: (level > 1 ? 1 : 16);
383 
384 	iter.prefetch = false;
385 	bch2_bkey_buf_init(&tmp);
386 
387 	while (nr--) {
388 		bch2_btree_and_journal_iter_advance(&iter);
389 		struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
390 		if (!k.k)
391 			break;
392 
393 		bch2_bkey_buf_reassemble(&tmp, c, k);
394 		bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
395 	}
396 
397 	bch2_bkey_buf_exit(&tmp, c);
398 }
399 
400 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
401 {
402 	struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
403 
404 	if (iter->prefetch && iter->journal.level)
405 		btree_and_journal_iter_prefetch(iter);
406 again:
407 	if (iter->at_end)
408 		return bkey_s_c_null;
409 
410 	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
411 	       bpos_lt(btree_k.k->p, iter->pos))
412 		bch2_journal_iter_advance_btree(iter);
413 
414 	if (iter->trans->journal_replay_not_finished)
415 		while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
416 		       bpos_lt(journal_k.k->p, iter->pos))
417 			bch2_journal_iter_advance(&iter->journal);
418 
419 	ret = journal_k.k &&
420 		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
421 		? journal_k
422 		: btree_k;
423 
424 	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
425 		ret = bkey_s_c_null;
426 
427 	if (ret.k) {
428 		iter->pos = ret.k->p;
429 		if (bkey_deleted(ret.k)) {
430 			bch2_btree_and_journal_iter_advance(iter);
431 			goto again;
432 		}
433 	} else {
434 		iter->pos = SPOS_MAX;
435 		iter->at_end = true;
436 	}
437 
438 	return ret;
439 }
440 
441 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
442 {
443 	bch2_journal_iter_exit(&iter->journal);
444 }
445 
446 void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
447 						  struct btree_and_journal_iter *iter,
448 						  struct btree *b,
449 						  struct btree_node_iter node_iter,
450 						  struct bpos pos)
451 {
452 	memset(iter, 0, sizeof(*iter));
453 
454 	iter->trans = trans;
455 	iter->b = b;
456 	iter->node_iter = node_iter;
457 	iter->pos = b->data->min_key;
458 	iter->at_end = false;
459 	INIT_LIST_HEAD(&iter->journal.list);
460 
461 	if (trans->journal_replay_not_finished) {
462 		bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
463 		if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
464 			list_add(&iter->journal.list, &trans->c->journal_iters);
465 	}
466 }
467 
468 /*
469  * this version is used by btree_gc before filesystem has gone RW and
470  * multithreaded, so uses the journal_iters list:
471  */
472 void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
473 						struct btree_and_journal_iter *iter,
474 						struct btree *b)
475 {
476 	struct btree_node_iter node_iter;
477 
478 	bch2_btree_node_iter_init_from_start(&node_iter, b);
479 	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
480 }
481 
482 /* sort and dedup all keys in the journal: */
483 
484 void bch2_journal_entries_free(struct bch_fs *c)
485 {
486 	struct journal_replay **i;
487 	struct genradix_iter iter;
488 
489 	genradix_for_each(&c->journal_entries, iter, i)
490 		kvfree(*i);
491 	genradix_free(&c->journal_entries);
492 }
493 
494 /*
495  * When keys compare equal, oldest compares first:
496  */
497 static int journal_sort_key_cmp(const void *_l, const void *_r)
498 {
499 	const struct journal_key *l = _l;
500 	const struct journal_key *r = _r;
501 
502 	return  journal_key_cmp(l, r) ?:
503 		cmp_int(l->journal_seq, r->journal_seq) ?:
504 		cmp_int(l->journal_offset, r->journal_offset);
505 }
506 
507 void bch2_journal_keys_put(struct bch_fs *c)
508 {
509 	struct journal_keys *keys = &c->journal_keys;
510 
511 	BUG_ON(atomic_read(&keys->ref) <= 0);
512 
513 	if (!atomic_dec_and_test(&keys->ref))
514 		return;
515 
516 	move_gap(keys, keys->nr);
517 
518 	darray_for_each(*keys, i)
519 		if (i->allocated)
520 			kfree(i->k);
521 
522 	kvfree(keys->data);
523 	keys->data = NULL;
524 	keys->nr = keys->gap = keys->size = 0;
525 
526 	bch2_journal_entries_free(c);
527 }
528 
529 static void __journal_keys_sort(struct journal_keys *keys)
530 {
531 	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
532 
533 	struct journal_key *dst = keys->data;
534 
535 	darray_for_each(*keys, src) {
536 		/*
537 		 * We don't accumulate accounting keys here because we have to
538 		 * compare each individual accounting key against the version in
539 		 * the btree during replay:
540 		 */
541 		if (src->k->k.type != KEY_TYPE_accounting &&
542 		    src + 1 < &darray_top(*keys) &&
543 		    !journal_key_cmp(src, src + 1))
544 			continue;
545 
546 		*dst++ = *src;
547 	}
548 
549 	keys->nr = dst - keys->data;
550 }
551 
552 int bch2_journal_keys_sort(struct bch_fs *c)
553 {
554 	struct genradix_iter iter;
555 	struct journal_replay *i, **_i;
556 	struct journal_keys *keys = &c->journal_keys;
557 	size_t nr_read = 0;
558 
559 	genradix_for_each(&c->journal_entries, iter, _i) {
560 		i = *_i;
561 
562 		if (journal_replay_ignore(i))
563 			continue;
564 
565 		cond_resched();
566 
567 		for_each_jset_key(k, entry, &i->j) {
568 			struct journal_key n = (struct journal_key) {
569 				.btree_id	= entry->btree_id,
570 				.level		= entry->level,
571 				.k		= k,
572 				.journal_seq	= le64_to_cpu(i->j.seq),
573 				.journal_offset	= k->_data - i->j._data,
574 			};
575 
576 			if (darray_push(keys, n)) {
577 				__journal_keys_sort(keys);
578 
579 				if (keys->nr * 8 > keys->size * 7) {
580 					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
581 						keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
582 					return -BCH_ERR_ENOMEM_journal_keys_sort;
583 				}
584 
585 				BUG_ON(darray_push(keys, n));
586 			}
587 
588 			nr_read++;
589 		}
590 	}
591 
592 	__journal_keys_sort(keys);
593 	keys->gap = keys->nr;
594 
595 	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
596 	return 0;
597 }
598 
599 void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
600 				  unsigned level_min, unsigned level_max,
601 				  struct bpos start, struct bpos end)
602 {
603 	struct journal_keys *keys = &c->journal_keys;
604 	size_t dst = 0;
605 
606 	move_gap(keys, keys->nr);
607 
608 	darray_for_each(*keys, i)
609 		if (!(i->btree_id == btree &&
610 		      i->level >= level_min &&
611 		      i->level <= level_max &&
612 		      bpos_ge(i->k->k.p, start) &&
613 		      bpos_le(i->k->k.p, end)))
614 			keys->data[dst++] = *i;
615 	keys->nr = keys->gap = dst;
616 }
617 
618 void bch2_journal_keys_dump(struct bch_fs *c)
619 {
620 	struct journal_keys *keys = &c->journal_keys;
621 	struct printbuf buf = PRINTBUF;
622 
623 	pr_info("%zu keys:", keys->nr);
624 
625 	move_gap(keys, keys->nr);
626 
627 	darray_for_each(*keys, i) {
628 		printbuf_reset(&buf);
629 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
630 		pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
631 	}
632 	printbuf_exit(&buf);
633 }
634