// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"

#include <linux/sort.h>

/*
 * For managing keys we read from the journal: until journal replay works,
 * normal btree lookups need to be able to find and return keys from the
 * journal where they overwrite what's in the btree, so we have a special
 * iterator and operations for the regular btree iter code to use:
 */

static int __journal_key_cmp(enum btree_id l_btree_id,
			     unsigned l_level,
			     struct bpos l_pos,
			     const struct journal_key *r)
{
	return (cmp_int(l_btree_id, r->btree_id) ?:
		cmp_int(l_level, r->level) ?:
		bpos_cmp(l_pos, r->k->k.p));
}

static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}

static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
	size_t gap_size = keys->size - keys->nr;

	if (idx >= keys->gap)
		idx += gap_size;
	return idx;
}

static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
	return keys->data + idx_to_pos(keys, idx);
}

static size_t __bch2_journal_key_search(struct journal_keys *keys,
					enum btree_id id, unsigned level,
					struct bpos pos)
{
	size_t l = 0, r = keys->nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
			l = m + 1;
		else
			r = m;
	}

	BUG_ON(l < keys->nr &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);

	BUG_ON(l &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);

	return l;
}

static size_t bch2_journal_key_search(struct journal_keys *keys,
				      enum btree_id id, unsigned level,
				      struct bpos pos)
{
	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
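
/*
 * journal_keys is kept as a gap buffer: the unused slots (keys->size -
 * keys->nr of them) sit together at keys->gap, so an insert near the
 * previous insert position only has to shift a small range of keys.
 * __bch2_journal_key_search() works with logical indices that exclude the
 * gap; idx_to_pos()/idx_to_key() translate those into array positions.
 */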

/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos,
					   struct bpos end_pos, size_t *idx)
{
	struct journal_keys *keys = &c->journal_keys;
	unsigned iters = 0;
	struct journal_key *k;

	BUG_ON(*idx > keys->nr);
search:
	if (!*idx)
		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);

	while (*idx &&
	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
		--(*idx);
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
			return NULL;

		if (k->overwritten) {
			(*idx)++;
			continue;
		}

		if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
			return k->k;

		(*idx)++;
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	return NULL;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos)
{
	size_t idx = 0;

	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}

static void journal_iter_verify(struct journal_iter *iter)
{
	struct journal_keys *keys = iter->keys;
	size_t gap_size = keys->size - keys->nr;

	BUG_ON(iter->idx >= keys->gap &&
	       iter->idx < keys->gap + gap_size);

	if (iter->idx < keys->size) {
		struct journal_key *k = keys->data + iter->idx;

		int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
			cmp_int(k->level, iter->level);
		BUG_ON(cmp < 0);
	}
}

static void journal_iters_fix(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	/* The key we just inserted is immediately before the gap: */
	size_t gap_end = keys->gap + (keys->size - keys->nr);
	struct journal_key *new_key = &keys->data[keys->gap - 1];
	struct journal_iter *iter;

	/*
	 * If an iterator points one after the key we just inserted, decrement
	 * the iterator so it points at the key we just inserted - if the
	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
	 * handle that:
	 */
	list_for_each_entry(iter, &c->journal_iters, list) {
		journal_iter_verify(iter);
		if (iter->idx == gap_end &&
		    new_key->btree_id == iter->btree_id &&
		    new_key->level == iter->level)
			iter->idx = keys->gap - 1;
		journal_iter_verify(iter);
	}
}

static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_iter *iter;
	size_t gap_size = keys->size - keys->nr;

	list_for_each_entry(iter, &c->journal_iters, list) {
		if (iter->idx > old_gap)
			iter->idx -= gap_size;
		if (iter->idx >= new_gap)
			iter->idx += gap_size;
	}
}
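
/*
 * bch2_journal_key_insert_take(): insert @k into the journal keys gap
 * buffer, taking ownership of it - @k is freed when journal_keys is freed,
 * or when a later insert replaces the same position.  The array is doubled
 * when full, and live journal iterators are repointed via
 * journal_iters_move_gap()/journal_iters_fix().
 */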

int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
				 unsigned level, struct bkey_i *k)
{
	struct journal_key n = {
		.btree_id	= id,
		.level		= level,
		.k		= k,
		.allocated	= true,
		/*
		 * Ensure these keys are done last by journal replay, to unblock
		 * journal reclaim:
		 */
		.journal_seq	= U32_MAX,
	};
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);

	BUG_ON(test_bit(BCH_FS_rw, &c->flags));

	if (idx < keys->size &&
	    journal_key_cmp(&n, &keys->data[idx]) == 0) {
		if (keys->data[idx].allocated)
			kfree(keys->data[idx].k);
		keys->data[idx] = n;
		return 0;
	}

	if (idx > keys->gap)
		idx -= keys->size - keys->nr;

	size_t old_gap = keys->gap;

	if (keys->nr == keys->size) {
		journal_iters_move_gap(c, old_gap, keys->size);
		old_gap = keys->size;

		struct journal_keys new_keys = {
			.nr	= keys->nr,
			.size	= max_t(size_t, keys->size, 8) * 2,
		};

		new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
		if (!new_keys.data) {
			bch_err(c, "%s: error allocating new key array (size %zu)",
				__func__, new_keys.size);
			return -BCH_ERR_ENOMEM_journal_key_insert;
		}

		/* Since @keys was full, there was no gap: */
		memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
		kvfree(keys->data);
		keys->data	= new_keys.data;
		keys->nr	= new_keys.nr;
		keys->size	= new_keys.size;

		/* And now the gap is at the end: */
		keys->gap	= keys->nr;
	}

	journal_iters_move_gap(c, old_gap, idx);

	move_gap(keys, idx);

	keys->nr++;
	keys->data[keys->gap++] = n;

	journal_iters_fix(c);

	return 0;
}

/*
 * Can only be used from the recovery thread while we're still RO - can't be
 * used once we've got RW, as journal_keys is at that point used by multiple
 * threads:
 */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bkey_i *k)
{
	struct bkey_i *n;
	int ret;

	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_key_insert;

	bkey_copy(n, k);
	ret = bch2_journal_key_insert_take(c, id, level, n);
	if (ret)
		kfree(n);
	return ret;
}

int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bpos pos)
{
	struct bkey_i whiteout;

	bkey_init(&whiteout.k);
	whiteout.k.p = pos;

	return bch2_journal_key_insert(c, id, level, &whiteout);
}

bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
				 unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &trans->c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (!trans->journal_replay_not_finished)
		return false;

	return (idx < keys->size &&
		keys->data[idx].btree_id == btree &&
		keys->data[idx].level == level &&
		bpos_eq(keys->data[idx].k->k.p, pos) &&
		bkey_deleted(&keys->data[idx].k->k));
}

void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
				  unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (idx < keys->size &&
	    keys->data[idx].btree_id == btree &&
	    keys->data[idx].level == level &&
	    bpos_eq(keys->data[idx].k->k.p, pos))
		keys->data[idx].overwritten = true;
}

static void bch2_journal_iter_advance(struct journal_iter *iter)
{
	if (iter->idx < iter->keys->size) {
		iter->idx++;
		if (iter->idx == iter->keys->gap)
			iter->idx += iter->keys->size - iter->keys->nr;
	}
}

static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
	journal_iter_verify(iter);

	while (iter->idx < iter->keys->size) {
		struct journal_key *k = iter->keys->data + iter->idx;

		int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
			cmp_int(k->level, iter->level);
		if (cmp > 0)
			break;
		BUG_ON(cmp);

		if (!k->overwritten)
			return bkey_i_to_s_c(k->k);

		bch2_journal_iter_advance(iter);
	}

	return bkey_s_c_null;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
{
	list_del(&iter->list);
}
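
/*
 * A journal_iter walks the sorted journal keys for a single btree and
 * level; it's embedded in the btree_and_journal_iter below, which merges
 * it with a btree node iterator so that lookups done before journal
 * replay has finished see keys from both sources.
 */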

static void bch2_journal_iter_init(struct bch_fs *c,
				   struct journal_iter *iter,
				   enum btree_id id, unsigned level,
				   struct bpos pos)
{
	iter->btree_id	= id;
	iter->level	= level;
	iter->keys	= &c->journal_keys;
	iter->idx	= bch2_journal_key_search(&c->journal_keys, id, level, pos);

	journal_iter_verify(iter);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
						iter->b, &iter->unpacked);
}

static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
	if (bpos_eq(iter->pos, SPOS_MAX))
		iter->at_end = true;
	else
		iter->pos = bpos_successor(iter->pos);
}

static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
{
	struct btree_and_journal_iter iter = *_iter;
	struct bch_fs *c = iter.trans->c;
	unsigned level = iter.journal.level;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_started, &c->flags)
		? (level > 1 ? 0 : 2)
		: (level > 1 ? 1 : 16);

	iter.prefetch = false;
	bch2_bkey_buf_init(&tmp);

	while (nr--) {
		bch2_btree_and_journal_iter_advance(&iter);
		struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
	}

	bch2_bkey_buf_exit(&tmp, c);
}
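
/*
 * Return the next key at or after iter->pos, taking whichever of the next
 * btree node key and the next journal key comes first; on a tie the
 * journal key wins, since journal keys override what's in the btree until
 * replay has finished.  Deleted keys (whiteouts) advance the iterator and
 * retry.
 */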

struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
	struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;

	if (iter->prefetch && iter->journal.level)
		btree_and_journal_iter_prefetch(iter);
again:
	if (iter->at_end)
		return bkey_s_c_null;

	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
	       bpos_lt(btree_k.k->p, iter->pos))
		bch2_journal_iter_advance_btree(iter);

	if (iter->trans->journal_replay_not_finished)
		while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
		       bpos_lt(journal_k.k->p, iter->pos))
			bch2_journal_iter_advance(&iter->journal);

	ret = journal_k.k &&
		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
		? journal_k
		: btree_k;

	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
		ret = bkey_s_c_null;

	if (ret.k) {
		iter->pos = ret.k->p;
		if (bkey_deleted(ret.k)) {
			bch2_btree_and_journal_iter_advance(iter);
			goto again;
		}
	} else {
		iter->pos = SPOS_MAX;
		iter->at_end = true;
	}

	return ret;
}

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
	bch2_journal_iter_exit(&iter->journal);
}

void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						  struct btree_and_journal_iter *iter,
						  struct btree *b,
						  struct btree_node_iter node_iter,
						  struct bpos pos)
{
	memset(iter, 0, sizeof(*iter));

	iter->trans = trans;
	iter->b = b;
	iter->node_iter = node_iter;
	iter->pos = b->data->min_key;
	iter->at_end = false;
	INIT_LIST_HEAD(&iter->journal.list);

	if (trans->journal_replay_not_finished) {
		bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
		if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
			list_add(&iter->journal.list, &trans->c->journal_iters);
	}
}

/*
 * this version is used by btree_gc before the filesystem has gone RW and
 * multithreaded, so it uses the journal_iters list:
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						struct btree_and_journal_iter *iter,
						struct btree *b)
{
	struct btree_node_iter node_iter;

	bch2_btree_node_iter_init_from_start(&node_iter, b);
	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
}

/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
	struct journal_replay **i;
	struct genradix_iter iter;

	genradix_for_each(&c->journal_entries, iter, i)
		kvfree(*i);
	genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first:
 */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = _l;
	const struct journal_key *r = _r;

	return journal_key_cmp(l, r) ?:
		cmp_int(l->journal_seq, r->journal_seq) ?:
		cmp_int(l->journal_offset, r->journal_offset);
}

void bch2_journal_keys_put(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;

	BUG_ON(atomic_read(&keys->ref) <= 0);

	if (!atomic_dec_and_test(&keys->ref))
		return;

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i)
		if (i->allocated)
			kfree(i->k);

	kvfree(keys->data);
	keys->data = NULL;
	keys->nr = keys->gap = keys->size = 0;

	bch2_journal_entries_free(c);
}

static void __journal_keys_sort(struct journal_keys *keys)
{
	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);

	struct journal_key *dst = keys->data;

	darray_for_each(*keys, src) {
		if (src + 1 < &darray_top(*keys) &&
		    !journal_key_cmp(src, src + 1))
			continue;

		*dst++ = *src;
	}

	keys->nr = dst - keys->data;
}
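
/*
 * Read every key from the journal entries we have in memory, sort them,
 * and drop all but the newest version of each key.  When the keys array
 * fills up it's sorted and compacted in place; if that still leaves it
 * more than 7/8 full we return -BCH_ERR_ENOMEM_journal_keys_sort.
 */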

int bch2_journal_keys_sort(struct bch_fs *c)
{
	struct genradix_iter iter;
	struct journal_replay *i, **_i;
	struct journal_keys *keys = &c->journal_keys;
	size_t nr_read = 0;

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		cond_resched();

		for_each_jset_key(k, entry, &i->j) {
			struct journal_key n = (struct journal_key) {
				.btree_id	= entry->btree_id,
				.level		= entry->level,
				.k		= k,
				.journal_seq	= le64_to_cpu(i->j.seq),
				.journal_offset	= k->_data - i->j._data,
			};

			if (darray_push(keys, n)) {
				__journal_keys_sort(keys);

				if (keys->nr * 8 > keys->size * 7) {
					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
						keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
					return -BCH_ERR_ENOMEM_journal_keys_sort;
				}

				BUG_ON(darray_push(keys, n));
			}

			nr_read++;
		}
	}

	__journal_keys_sort(keys);
	keys->gap = keys->nr;

	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
	return 0;
}

void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
				  unsigned level_min, unsigned level_max,
				  struct bpos start, struct bpos end)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t dst = 0;

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i)
		if (!(i->btree_id == btree &&
		      i->level >= level_min &&
		      i->level <= level_max &&
		      bpos_ge(i->k->k.p, start) &&
		      bpos_le(i->k->k.p, end)))
			keys->data[dst++] = *i;
	keys->nr = keys->gap = dst;
}

void bch2_journal_keys_dump(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	struct printbuf buf = PRINTBUF;

	pr_info("%zu keys:", keys->nr);

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i) {
		printbuf_reset(&buf);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
		pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
	}
	printbuf_exit(&buf);
}