// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"

#include <linux/sort.h>

/*
 * For managing keys we read from the journal: until journal replay works, normal
 * btree lookups need to be able to find and return keys from the journal where
 * they overwrite what's in the btree, so we have a special iterator and
 * operations for the regular btree iter code to use:
 */
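
/*
 * journal_keys is laid out as a gap buffer: keys->data has keys->size slots,
 * keys->nr of which are in use, and the unused slots form a gap of
 * (size - nr) entries starting at array position keys->gap. A logical index
 * below the gap maps directly to an array position; one at or above it is
 * shifted past the gap - see idx_to_pos(). Keeping the gap at the insertion
 * point means bch2_journal_key_insert_take() only has to move the gap, not
 * memmove the whole array.
 */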

static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
	size_t gap_size = keys->size - keys->nr;

	if (idx >= keys->gap)
		idx += gap_size;
	return idx;
}

static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
	return keys->data + idx_to_pos(keys, idx);
}

static size_t __bch2_journal_key_search(struct journal_keys *keys,
					enum btree_id id, unsigned level,
					struct bpos pos)
{
	size_t l = 0, r = keys->nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
			l = m + 1;
		else
			r = m;
	}

	BUG_ON(l < keys->nr &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);

	BUG_ON(l &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);

	return l;
}

static size_t bch2_journal_key_search(struct journal_keys *keys,
				      enum btree_id id, unsigned level,
				      struct bpos pos)
{
	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}

/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos,
					   struct bpos end_pos, size_t *idx)
{
	struct journal_keys *keys = &c->journal_keys;
	unsigned iters = 0;
	struct journal_key *k;

	BUG_ON(*idx > keys->nr);
search:
	if (!*idx)
		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);

	while (*idx &&
	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
		--(*idx);
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
			return NULL;

		if (k->overwritten) {
			(*idx)++;
			continue;
		}

		if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
			return k->k;

		(*idx)++;
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	return NULL;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos)
{
	size_t idx = 0;

	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
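
/*
 * Rough usage sketch (hypothetical caller, not taken from real callers):
 * @idx acts as a cursor, so repeated lookups over an ascending range avoid
 * redoing the full binary search each time:
 *
 *	size_t idx = 0;
 *	struct bkey_i *k;
 *
 *	while ((k = bch2_journal_keys_peek_upto(c, btree, level, pos, end, &idx))) {
 *		... use k ...
 *		pos = bpos_successor(k->k.p);
 *	}
 */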

static void journal_iter_verify(struct journal_iter *iter)
{
	struct journal_keys *keys = iter->keys;
	size_t gap_size = keys->size - keys->nr;

	BUG_ON(iter->idx >= keys->gap &&
	       iter->idx < keys->gap + gap_size);

	if (iter->idx < keys->size) {
		struct journal_key *k = keys->data + iter->idx;

		int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
			  cmp_int(k->level, iter->level);
		BUG_ON(cmp < 0);
	}
}

static void journal_iters_fix(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	/* The key we just inserted is immediately before the gap: */
	size_t gap_end = keys->gap + (keys->size - keys->nr);
	struct journal_key *new_key = &keys->data[keys->gap - 1];
	struct journal_iter *iter;

	/*
	 * If an iterator points one after the key we just inserted, decrement
	 * the iterator so it points at the key we just inserted - if the
	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
	 * handle that:
	 */
	list_for_each_entry(iter, &c->journal_iters, list) {
		journal_iter_verify(iter);
		if (iter->idx == gap_end &&
		    new_key->btree_id == iter->btree_id &&
		    new_key->level == iter->level)
			iter->idx = keys->gap - 1;
		journal_iter_verify(iter);
	}
}

static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_iter *iter;
	size_t gap_size = keys->size - keys->nr;

	list_for_each_entry(iter, &c->journal_iters, list) {
		if (iter->idx > old_gap)
			iter->idx -= gap_size;
		if (iter->idx >= new_gap)
			iter->idx += gap_size;
	}
}

int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
				 unsigned level, struct bkey_i *k)
{
	struct journal_key n = {
		.btree_id	= id,
		.level		= level,
		.k		= k,
		.allocated	= true,
		/*
		 * Ensure these keys are done last by journal replay, to unblock
		 * journal reclaim:
		 */
		.journal_seq	= U32_MAX,
	};
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);

	BUG_ON(test_bit(BCH_FS_rw, &c->flags));

	if (idx < keys->size &&
	    journal_key_cmp(&n, &keys->data[idx]) == 0) {
		if (keys->data[idx].allocated)
			kfree(keys->data[idx].k);
		keys->data[idx] = n;
		return 0;
	}

	if (idx > keys->gap)
		idx -= keys->size - keys->nr;

	size_t old_gap = keys->gap;

	if (keys->nr == keys->size) {
		journal_iters_move_gap(c, old_gap, keys->size);
		old_gap = keys->size;

		struct journal_keys new_keys = {
			.nr	= keys->nr,
			.size	= max_t(size_t, keys->size, 8) * 2,
		};

		new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
		if (!new_keys.data) {
			bch_err(c, "%s: error allocating new key array (size %zu)",
				__func__, new_keys.size);
			return -BCH_ERR_ENOMEM_journal_key_insert;
		}

		/* Since @keys was full, there was no gap: */
		memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
		kvfree(keys->data);
		keys->data	= new_keys.data;
		keys->nr	= new_keys.nr;
		keys->size	= new_keys.size;

		/* And now the gap is at the end: */
		keys->gap	= keys->nr;
	}

	journal_iters_move_gap(c, old_gap, idx);

	move_gap(keys, idx);

	keys->nr++;
	keys->data[keys->gap++] = n;

	journal_iters_fix(c);

	return 0;
}
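
/*
 * Note on ownership: bch2_journal_key_insert_take() above takes ownership of
 * @k (the key is marked .allocated, so it's kfreed when the journal keys are
 * released); bch2_journal_key_insert() below copies the key first and frees
 * its copy on error.
 */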

/*
 * Can only be used from the recovery thread while we're still RO - can't be
 * used once we've got RW, as journal_keys is at that point used by multiple
 * threads:
 */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bkey_i *k)
{
	struct bkey_i *n;
	int ret;

	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_key_insert;

	bkey_copy(n, k);
	ret = bch2_journal_key_insert_take(c, id, level, n);
	if (ret)
		kfree(n);
	return ret;
}

int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bpos pos)
{
	struct bkey_i whiteout;

	bkey_init(&whiteout.k);
	whiteout.k.p = pos;

	return bch2_journal_key_insert(c, id, level, &whiteout);
}

bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
				 unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &trans->c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (!trans->journal_replay_not_finished)
		return false;

	return (idx < keys->size &&
		keys->data[idx].btree_id == btree &&
		keys->data[idx].level == level &&
		bpos_eq(keys->data[idx].k->k.p, pos) &&
		bkey_deleted(&keys->data[idx].k->k));
}

void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
				  unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (idx < keys->size &&
	    keys->data[idx].btree_id == btree &&
	    keys->data[idx].level == level &&
	    bpos_eq(keys->data[idx].k->k.p, pos))
		keys->data[idx].overwritten = true;
}

static void bch2_journal_iter_advance(struct journal_iter *iter)
{
	if (iter->idx < iter->keys->size) {
		iter->idx++;
		if (iter->idx == iter->keys->gap)
			iter->idx += iter->keys->size - iter->keys->nr;
	}
}

static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
	journal_iter_verify(iter);

	while (iter->idx < iter->keys->size) {
		struct journal_key *k = iter->keys->data + iter->idx;

		int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
			  cmp_int(k->level, iter->level);
		if (cmp > 0)
			break;
		BUG_ON(cmp);

		if (!k->overwritten)
			return bkey_i_to_s_c(k->k);

		bch2_journal_iter_advance(iter);
	}

	return bkey_s_c_null;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
{
	list_del(&iter->list);
}

static void bch2_journal_iter_init(struct bch_fs *c,
				   struct journal_iter *iter,
				   enum btree_id id, unsigned level,
				   struct bpos pos)
{
	iter->btree_id	= id;
	iter->level	= level;
	iter->keys	= &c->journal_keys;
	iter->idx	= bch2_journal_key_search(&c->journal_keys, id, level, pos);

	journal_iter_verify(iter);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
						iter->b, &iter->unpacked);
}

static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
	if (bpos_eq(iter->pos, SPOS_MAX))
		iter->at_end = true;
	else
		iter->pos = bpos_successor(iter->pos);
}

static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
{
	struct btree_and_journal_iter iter = *_iter;
	struct bch_fs *c = iter.trans->c;
	unsigned level = iter.journal.level;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_started, &c->flags)
		? (level > 1 ? 0 :  2)
		: (level > 1 ? 1 : 16);

	iter.prefetch = false;
	bch2_bkey_buf_init(&tmp);

	while (nr--) {
		bch2_btree_and_journal_iter_advance(&iter);
		struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
	}

	bch2_bkey_buf_exit(&tmp, c);
}
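
/*
 * Peek the next key from the merged view: keys from the journal and from the
 * btree node are compared by position, and on a tie the journal key wins,
 * since it's the newer version. Overwritten journal keys and deleted keys
 * (whiteouts) are skipped, and anything past the node's max_key is dropped.
 */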

struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
	struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;

	if (iter->prefetch && iter->journal.level)
		btree_and_journal_iter_prefetch(iter);
again:
	if (iter->at_end)
		return bkey_s_c_null;

	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
	       bpos_lt(btree_k.k->p, iter->pos))
		bch2_journal_iter_advance_btree(iter);

	if (iter->trans->journal_replay_not_finished)
		while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
		       bpos_lt(journal_k.k->p, iter->pos))
			bch2_journal_iter_advance(&iter->journal);

	ret = journal_k.k &&
		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
		? journal_k
		: btree_k;

	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
		ret = bkey_s_c_null;

	if (ret.k) {
		iter->pos = ret.k->p;
		if (bkey_deleted(ret.k)) {
			bch2_btree_and_journal_iter_advance(iter);
			goto again;
		}
	} else {
		iter->pos = SPOS_MAX;
		iter->at_end = true;
	}

	return ret;
}

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
	bch2_journal_iter_exit(&iter->journal);
}

void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						  struct btree_and_journal_iter *iter,
						  struct btree *b,
						  struct btree_node_iter node_iter,
						  struct bpos pos)
{
	memset(iter, 0, sizeof(*iter));

	iter->trans = trans;
	iter->b = b;
	iter->node_iter = node_iter;
	iter->pos = b->data->min_key;
	iter->at_end = false;
	INIT_LIST_HEAD(&iter->journal.list);

	if (trans->journal_replay_not_finished) {
		bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
		if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
			list_add(&iter->journal.list, &trans->c->journal_iters);
	}
}

/*
 * this version is used by btree_gc before filesystem has gone RW and
 * multithreaded, so uses the journal_iters list:
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						struct btree_and_journal_iter *iter,
						struct btree *b)
{
	struct btree_node_iter node_iter;

	bch2_btree_node_iter_init_from_start(&node_iter, b);
	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
}

/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
	struct journal_replay **i;
	struct genradix_iter iter;

	genradix_for_each(&c->journal_entries, iter, i)
		kvfree(*i);
	genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first:
 */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = _l;
	const struct journal_key *r = _r;

	return journal_key_cmp(l, r) ?:
		cmp_int(l->journal_seq, r->journal_seq) ?:
		cmp_int(l->journal_offset, r->journal_offset);
}

void bch2_journal_keys_put(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;

	BUG_ON(atomic_read(&keys->ref) <= 0);

	if (!atomic_dec_and_test(&keys->ref))
		return;

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i)
		if (i->allocated)
			kfree(i->k);

	kvfree(keys->data);
	keys->data = NULL;
	keys->nr = keys->gap = keys->size = 0;

	bch2_journal_entries_free(c);
}

static void __journal_keys_sort(struct journal_keys *keys)
{
	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);

	cond_resched();

	struct journal_key *dst = keys->data;

	darray_for_each(*keys, src) {
		/*
		 * We don't accumulate accounting keys here because we have to
		 * compare each individual accounting key against the version in
		 * the btree during replay:
		 */
		if (src->k->k.type != KEY_TYPE_accounting &&
		    src + 1 < &darray_top(*keys) &&
		    !journal_key_cmp(src, src + 1))
			continue;

		*dst++ = *src;
	}

	keys->nr = dst - keys->data;
}
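
/*
 * Read every key from every journal entry into journal_keys, then sort and
 * dedup. If the darray can't be grown (allocation failure), sort and compact
 * what we have so far and retry; if compacting didn't free up at least 1/8th
 * of the buffer, give up:
 */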

int bch2_journal_keys_sort(struct bch_fs *c)
{
	struct genradix_iter iter;
	struct journal_replay *i, **_i;
	struct journal_keys *keys = &c->journal_keys;
	size_t nr_read = 0;

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		cond_resched();

		for_each_jset_key(k, entry, &i->j) {
			struct journal_key n = (struct journal_key) {
				.btree_id	= entry->btree_id,
				.level		= entry->level,
				.k		= k,
				.journal_seq	= le64_to_cpu(i->j.seq),
				.journal_offset	= k->_data - i->j._data,
			};

			if (darray_push(keys, n)) {
				__journal_keys_sort(keys);

				if (keys->nr * 8 > keys->size * 7) {
					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
						keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
					return -BCH_ERR_ENOMEM_journal_keys_sort;
				}

				BUG_ON(darray_push(keys, n));
			}

			nr_read++;
		}
	}

	__journal_keys_sort(keys);
	keys->gap = keys->nr;

	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
	return 0;
}

void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
				  unsigned level_min, unsigned level_max,
				  struct bpos start, struct bpos end)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t dst = 0;

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i)
		if (!(i->btree_id == btree &&
		      i->level >= level_min &&
		      i->level <= level_max &&
		      bpos_ge(i->k->k.p, start) &&
		      bpos_le(i->k->k.p, end)))
			keys->data[dst++] = *i;
	keys->nr = keys->gap = dst;
}

void bch2_journal_keys_dump(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	struct printbuf buf = PRINTBUF;

	pr_info("%zu keys:", keys->nr);

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i) {
		printbuf_reset(&buf);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
		pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
	}
	printbuf_exit(&buf);
}