// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"

#include <linux/sort.h>

/*
 * For managing keys we read from the journal: until journal replay has
 * finished, normal btree lookups need to be able to find and return keys from
 * the journal where they overwrite what's in the btree, so we have a special
 * iterator and operations for the regular btree iter code to use:
 */

static int __journal_key_cmp(enum btree_id l_btree_id,
			     unsigned l_level,
			     struct bpos l_pos,
			     const struct journal_key *r)
{
	return (cmp_int(l_btree_id, r->btree_id) ?:
		cmp_int(l_level, r->level) ?:
		bpos_cmp(l_pos, r->k->k.p));
}

static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}

/*
 * journal_keys is a gap buffer: the (size - nr) unused slots form a gap
 * starting at array position 'gap', keeping repeated inserts at nearby
 * positions cheap (the gap is moved to the insert point by move_gap()).
 * idx_to_pos() maps a dense index in [0, nr) to the actual array position by
 * skipping over the gap.
 *
 * (Illustrative example: with size == 8, nr == 5 and gap == 2, dense indices
 * 0 and 1 map to array positions 0 and 1, while 2, 3 and 4 map to 5, 6 and 7.)
 */
static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
	size_t gap_size = keys->size - keys->nr;

	if (idx >= keys->gap)
		idx += gap_size;
	return idx;
}

static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
	return keys->data + idx_to_pos(keys, idx);
}

static size_t __bch2_journal_key_search(struct journal_keys *keys,
					enum btree_id id, unsigned level,
					struct bpos pos)
{
	size_t l = 0, r = keys->nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
			l = m + 1;
		else
			r = m;
	}

	BUG_ON(l < keys->nr &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);

	BUG_ON(l &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);

	return l;
}

static size_t bch2_journal_key_search(struct journal_keys *keys,
				      enum btree_id id, unsigned level,
				      struct bpos pos)
{
	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}

/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos,
					   struct bpos end_pos, size_t *idx)
{
	struct journal_keys *keys = &c->journal_keys;
	unsigned iters = 0;
	struct journal_key *k;

	BUG_ON(*idx > keys->nr);
search:
	if (!*idx)
		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);

	while (*idx &&
	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
		--(*idx);
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	while ((k = *idx < keys->nr ?
			idx_to_key(keys, *idx) : NULL)) {
		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
			return NULL;

		if (k->overwritten) {
			(*idx)++;
			continue;
		}

		if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
			return k->k;

		(*idx)++;
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	return NULL;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos)
{
	size_t idx = 0;

	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}

static void journal_iters_fix(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	/* The key we just inserted is immediately before the gap: */
	size_t gap_end = keys->gap + (keys->size - keys->nr);
	struct btree_and_journal_iter *iter;

	/*
	 * If an iterator points one after the key we just inserted, decrement
	 * the iterator so it points at the key we just inserted - if the
	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
	 * handle that:
	 */
	list_for_each_entry(iter, &c->journal_iters, journal.list)
		if (iter->journal.idx == gap_end)
			iter->journal.idx = keys->gap - 1;
}

static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_iter *iter;
	size_t gap_size = keys->size - keys->nr;

	list_for_each_entry(iter, &c->journal_iters, list) {
		if (iter->idx > old_gap)
			iter->idx -= gap_size;
		if (iter->idx >= new_gap)
			iter->idx += gap_size;
	}
}

int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
				 unsigned level, struct bkey_i *k)
{
	struct journal_key n = {
		.btree_id	= id,
		.level		= level,
		.k		= k,
		.allocated	= true,
		/*
		 * Ensure these keys are done last by journal replay, to unblock
		 * journal reclaim:
		 */
		.journal_seq	= U32_MAX,
	};
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);

	BUG_ON(test_bit(BCH_FS_rw, &c->flags));

	if (idx < keys->size &&
	    journal_key_cmp(&n, &keys->data[idx]) == 0) {
		if (keys->data[idx].allocated)
			kfree(keys->data[idx].k);
		keys->data[idx] = n;
		return 0;
	}

	if (idx > keys->gap)
		idx -= keys->size - keys->nr;

	if (keys->nr == keys->size) {
		struct journal_keys new_keys = {
			.nr	= keys->nr,
			.size	= max_t(size_t, keys->size, 8) * 2,
		};

		new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
		if (!new_keys.data) {
			bch_err(c, "%s: error allocating new key array (size %zu)",
				__func__, new_keys.size);
			return -BCH_ERR_ENOMEM_journal_key_insert;
		}

		/* Since @keys was full, there was no gap: */
		memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
		kvfree(keys->data);
		keys->data	= new_keys.data;
		keys->nr	= new_keys.nr;
		keys->size	= new_keys.size;

		/* And now the gap is at the end: */
		keys->gap	= keys->nr;
	}

	journal_iters_move_gap(c, keys->gap, idx);

	move_gap(keys, idx);

	keys->nr++;
	keys->data[keys->gap++] = n;

	journal_iters_fix(c);

	return 0;
}

/*
 * Can only be used from the recovery thread while we're still RO - can't be
 * used once we've got RW, as journal_keys is at
 * that point used by multiple threads:
 */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bkey_i *k)
{
	struct bkey_i *n;
	int ret;

	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_key_insert;

	bkey_copy(n, k);
	ret = bch2_journal_key_insert_take(c, id, level, n);
	if (ret)
		kfree(n);
	return ret;
}

int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bpos pos)
{
	struct bkey_i whiteout;

	bkey_init(&whiteout.k);
	whiteout.k.p = pos;

	return bch2_journal_key_insert(c, id, level, &whiteout);
}

bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
				 unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &trans->c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (!trans->journal_replay_not_finished)
		return false;

	return (idx < keys->size &&
		keys->data[idx].btree_id == btree &&
		keys->data[idx].level == level &&
		bpos_eq(keys->data[idx].k->k.p, pos) &&
		bkey_deleted(&keys->data[idx].k->k));
}

void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
				  unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (idx < keys->size &&
	    keys->data[idx].btree_id == btree &&
	    keys->data[idx].level == level &&
	    bpos_eq(keys->data[idx].k->k.p, pos))
		keys->data[idx].overwritten = true;
}

static void bch2_journal_iter_advance(struct journal_iter *iter)
{
	if (iter->idx < iter->keys->size) {
		iter->idx++;
		if (iter->idx == iter->keys->gap)
			iter->idx += iter->keys->size - iter->keys->nr;
	}
}

static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
	struct journal_key *k = iter->keys->data + iter->idx;

	while (k < iter->keys->data + iter->keys->size &&
	       k->btree_id == iter->btree_id &&
	       k->level == iter->level) {
		if (!k->overwritten)
			return bkey_i_to_s_c(k->k);

		bch2_journal_iter_advance(iter);
		k = iter->keys->data + iter->idx;
	}

	return bkey_s_c_null;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
{
	list_del(&iter->list);
}

static void bch2_journal_iter_init(struct bch_fs *c,
				   struct journal_iter *iter,
				   enum btree_id id, unsigned level,
				   struct bpos pos)
{
	iter->btree_id	= id;
	iter->level	= level;
	iter->keys	= &c->journal_keys;
	iter->idx	= bch2_journal_key_search(&c->journal_keys, id, level, pos);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
						iter->b, &iter->unpacked);
}

static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
	if (bpos_eq(iter->pos, SPOS_MAX))
		iter->at_end = true;
	else
		iter->pos = bpos_successor(iter->pos);
}

static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
{
	struct btree_and_journal_iter iter = *_iter;
	struct bch_fs *c =
		iter.trans->c;
	unsigned level = iter.journal.level;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_started, &c->flags)
		? (level > 1 ? 0 : 2)
		: (level > 1 ? 1 : 16);

	iter.prefetch = false;
	bch2_bkey_buf_init(&tmp);

	while (nr--) {
		bch2_btree_and_journal_iter_advance(&iter);
		struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
	}

	bch2_bkey_buf_exit(&tmp, c);
}

struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
	struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;

	if (iter->prefetch && iter->journal.level)
		btree_and_journal_iter_prefetch(iter);
again:
	if (iter->at_end)
		return bkey_s_c_null;

	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
	       bpos_lt(btree_k.k->p, iter->pos))
		bch2_journal_iter_advance_btree(iter);

	if (iter->trans->journal_replay_not_finished)
		while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
		       bpos_lt(journal_k.k->p, iter->pos))
			bch2_journal_iter_advance(&iter->journal);

	ret = journal_k.k &&
	      (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
		? journal_k
		: btree_k;

	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
		ret = bkey_s_c_null;

	if (ret.k) {
		iter->pos = ret.k->p;
		if (bkey_deleted(ret.k)) {
			bch2_btree_and_journal_iter_advance(iter);
			goto again;
		}
	} else {
		iter->pos = SPOS_MAX;
		iter->at_end = true;
	}

	return ret;
}

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
	bch2_journal_iter_exit(&iter->journal);
}

void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						  struct btree_and_journal_iter *iter,
						  struct btree *b,
						  struct btree_node_iter node_iter,
						  struct bpos pos)
{
	memset(iter, 0, sizeof(*iter));

	iter->trans = trans;
	iter->b = b;
	iter->node_iter = node_iter;
	bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
	INIT_LIST_HEAD(&iter->journal.list);
	iter->pos = b->data->min_key;
	iter->at_end = false;
}

/*
 * this version is used by btree_gc before filesystem has gone RW and
 * multithreaded, so uses the journal_iters list:
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						struct btree_and_journal_iter *iter,
						struct btree *b)
{
	struct btree_node_iter node_iter;

	bch2_btree_node_iter_init_from_start(&node_iter, b);
	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
	if (trans->journal_replay_not_finished &&
	    !test_bit(BCH_FS_may_go_rw, &trans->c->flags))
		list_add(&iter->journal.list, &trans->c->journal_iters);
}

/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
	struct journal_replay **i;
	struct genradix_iter iter;

	genradix_for_each(&c->journal_entries, iter, i)
		kvfree(*i);
	genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first:
 */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = _l;
	const struct journal_key *r = _r;

	return journal_key_cmp(l, r) ?:
	       cmp_int(l->journal_seq, r->journal_seq) ?:
	       cmp_int(l->journal_offset, r->journal_offset);
}

void bch2_journal_keys_put(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;

	BUG_ON(atomic_read(&keys->ref) <= 0);

	if (!atomic_dec_and_test(&keys->ref))
		return;

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i)
		if (i->allocated)
			kfree(i->k);

	kvfree(keys->data);
	keys->data = NULL;
	keys->nr = keys->gap = keys->size = 0;

	bch2_journal_entries_free(c);
}

static void __journal_keys_sort(struct journal_keys *keys)
{
	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);

	struct journal_key *dst = keys->data;

	darray_for_each(*keys, src) {
		if (src + 1 < &darray_top(*keys) &&
		    !journal_key_cmp(src, src + 1))
			continue;

		*dst++ = *src;
	}

	keys->nr = dst - keys->data;
}

int bch2_journal_keys_sort(struct bch_fs *c)
{
	struct genradix_iter iter;
	struct journal_replay *i, **_i;
	struct journal_keys *keys = &c->journal_keys;
	size_t nr_read = 0;

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		cond_resched();

		for_each_jset_key(k, entry, &i->j) {
			struct journal_key n = (struct journal_key) {
				.btree_id	= entry->btree_id,
				.level		= entry->level,
				.k		= k,
				.journal_seq	= le64_to_cpu(i->j.seq),
				.journal_offset	= k->_data - i->j._data,
			};

			if (darray_push(keys, n)) {
				__journal_keys_sort(keys);

				if (keys->nr * 8 > keys->size * 7) {
					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
						keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
					return -BCH_ERR_ENOMEM_journal_keys_sort;
				}

				BUG_ON(darray_push(keys, n));
			}

			nr_read++;
		}
	}

	__journal_keys_sort(keys);
	keys->gap = keys->nr;

	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
	return 0;
}

void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
				  unsigned level_min, unsigned level_max,
				  struct bpos start, struct bpos end)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t dst = 0;

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i)
		if (!(i->btree_id == btree &&
		      i->level >= level_min &&
		      i->level <= level_max &&
		      bpos_ge(i->k->k.p, start) &&
		      bpos_le(i->k->k.p, end)))
			keys->data[dst++] = *i;
	keys->nr = keys->gap = dst;
}
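
/*
 * Usage sketch: one plausible way to walk a btree node merged with pending
 * journal keys, in the style of the btree_gc caller mentioned above at
 * bch2_btree_and_journal_iter_init_node_iter().  'trans' and 'b' are assumed
 * to be a valid btree_trans and btree node; this is illustrative only, not a
 * caller defined in this file:
 *
 *	struct btree_and_journal_iter iter;
 *	struct bkey_s_c k;
 *
 *	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
 *	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 *		// process k - journal keys shadow btree node keys at the same position
 *		bch2_btree_and_journal_iter_advance(&iter);
 *	}
 *	bch2_btree_and_journal_iter_exit(&iter);
 */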