// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"

#include <linux/sort.h>

/*
 * For managing keys we read from the journal: until journal replay has
 * finished, normal btree lookups need to be able to find and return keys from
 * the journal where they overwrite what's in the btree, so we have a special
 * iterator and operations for the regular btree iter code to use:
 */

static int __journal_key_cmp(enum btree_id l_btree_id,
			     unsigned l_level,
			     struct bpos l_pos,
			     const struct journal_key *r)
{
	return (cmp_int(l_btree_id, r->btree_id) ?:
		cmp_int(l_level, r->level) ?:
		bpos_cmp(l_pos, r->k->k.p));
}

static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
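
/*
 * journal_keys is kept sorted as a gap buffer: keys->data has keys->size
 * slots, keys->nr of which hold valid keys, and the (size - nr) unused slots
 * form a single gap starting at keys->gap. Keeping the gap near the most
 * recent insertion point means inserting in sorted order only has to shift
 * the keys between the old gap position and the insertion point (see
 * move_gap() in bch2_journal_key_insert_take()). idx_to_pos() maps a logical
 * index in 0..nr to its physical slot by skipping over the gap:
 */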

static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
	size_t gap_size = keys->size - keys->nr;

	if (idx >= keys->gap)
		idx += gap_size;
	return idx;
}

static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
	return keys->data + idx_to_pos(keys, idx);
}

static size_t __bch2_journal_key_search(struct journal_keys *keys,
					enum btree_id id, unsigned level,
					struct bpos pos)
{
	size_t l = 0, r = keys->nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
			l = m + 1;
		else
			r = m;
	}

	BUG_ON(l < keys->nr &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);

	BUG_ON(l &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);

	return l;
}

static size_t bch2_journal_key_search(struct journal_keys *keys,
				      enum btree_id id, unsigned level,
				      struct bpos pos)
{
	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}

/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos,
					   struct bpos end_pos, size_t *idx)
{
	struct journal_keys *keys = &c->journal_keys;
	unsigned iters = 0;
	struct journal_key *k;

	BUG_ON(*idx > keys->nr);
search:
	if (!*idx)
		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);

	while (*idx &&
	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
		--(*idx);
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
			return NULL;

		if (k->overwritten) {
			(*idx)++;
			continue;
		}

		if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
			return k->k;

		(*idx)++;
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	return NULL;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos)
{
	size_t idx = 0;

	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
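
/*
 * Illustrative sketch, not called from anywhere in this file: a recovery-path
 * lookup that wants journal keys to shadow the btree might do something like
 * the following (@c, @pos, @end_pos and the btree id are assumed to come from
 * the caller; BTREE_ID_extents is just an example):
 *
 *	size_t idx = 0;
 *	struct bkey_i *k = bch2_journal_keys_peek_upto(c, BTREE_ID_extents, 0,
 *						       pos, end_pos, &idx);
 *	if (k)
 *		... use the journal version in preference to the btree's ...
 *
 * @idx acts as a cached search position, so repeated calls with increasing
 * @pos don't have to redo the binary search from scratch.
 */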

static void journal_iters_fix(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	/* The key we just inserted is immediately before the gap: */
	size_t gap_end = keys->gap + (keys->size - keys->nr);
	struct btree_and_journal_iter *iter;

	/*
	 * If an iterator points one after the key we just inserted, decrement
	 * the iterator so it points at the key we just inserted - if the
	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
	 * handle that:
	 */
	list_for_each_entry(iter, &c->journal_iters, journal.list)
		if (iter->journal.idx == gap_end)
			iter->journal.idx = keys->gap - 1;
}

static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_iter *iter;
	size_t gap_size = keys->size - keys->nr;

	list_for_each_entry(iter, &c->journal_iters, list) {
		if (iter->idx > old_gap)
			iter->idx -= gap_size;
		if (iter->idx >= new_gap)
			iter->idx += gap_size;
	}
}

int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
				 unsigned level, struct bkey_i *k)
{
	struct journal_key n = {
		.btree_id = id,
		.level = level,
		.k = k,
		.allocated = true,
		/*
		 * Ensure these keys are done last by journal replay, to unblock
		 * journal reclaim:
		 */
		.journal_seq = U32_MAX,
	};
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);

	BUG_ON(test_bit(BCH_FS_rw, &c->flags));

	if (idx < keys->size &&
	    journal_key_cmp(&n, &keys->data[idx]) == 0) {
		if (keys->data[idx].allocated)
			kfree(keys->data[idx].k);
		keys->data[idx] = n;
		return 0;
	}

	if (idx > keys->gap)
		idx -= keys->size - keys->nr;

	if (keys->nr == keys->size) {
		struct journal_keys new_keys = {
			.nr = keys->nr,
			.size = max_t(size_t, keys->size, 8) * 2,
		};

		new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
		if (!new_keys.data) {
			bch_err(c, "%s: error allocating new key array (size %zu)",
				__func__, new_keys.size);
			return -BCH_ERR_ENOMEM_journal_key_insert;
		}

		/* Since @keys was full, there was no gap: */
		memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
		kvfree(keys->data);
		keys->data = new_keys.data;
		keys->nr = new_keys.nr;
		keys->size = new_keys.size;

		/* And now the gap is at the end: */
		keys->gap = keys->nr;
	}

	journal_iters_move_gap(c, keys->gap, idx);

	move_gap(keys, idx);

	keys->nr++;
	keys->data[keys->gap++] = n;

	journal_iters_fix(c);

	return 0;
}

/*
 * Can only be used from the recovery thread while we're still RO - can't be
 * used once we've gone RW, as journal_keys is at that point used by multiple
 * threads:
 */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bkey_i *k)
{
	struct bkey_i *n;
	int ret;

	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_key_insert;

	bkey_copy(n, k);
	ret = bch2_journal_key_insert_take(c, id, level, n);
	if (ret)
		kfree(n);
	return ret;
}

int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bpos pos)
{
	struct bkey_i whiteout;

	bkey_init(&whiteout.k);
	whiteout.k.p = pos;

	return bch2_journal_key_insert(c, id, level, &whiteout);
}

void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
				  unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (idx < keys->size &&
	    keys->data[idx].btree_id == btree &&
	    keys->data[idx].level == level &&
	    bpos_eq(keys->data[idx].k->k.p, pos))
		keys->data[idx].overwritten = true;
}

static void bch2_journal_iter_advance(struct journal_iter *iter)
{
	if (iter->idx < iter->keys->size) {
		iter->idx++;
		if (iter->idx == iter->keys->gap)
			iter->idx += iter->keys->size - iter->keys->nr;
	}
}

static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
	struct journal_key *k = iter->keys->data + iter->idx;

	while (k < iter->keys->data + iter->keys->size &&
	       k->btree_id == iter->btree_id &&
	       k->level == iter->level) {
		if (!k->overwritten)
			return bkey_i_to_s_c(k->k);

		bch2_journal_iter_advance(iter);
		k = iter->keys->data + iter->idx;
	}

	return bkey_s_c_null;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
{
	list_del(&iter->list);
}

static void bch2_journal_iter_init(struct bch_fs *c,
				   struct journal_iter *iter,
				   enum btree_id id, unsigned level,
				   struct bpos pos)
{
	iter->btree_id = id;
	iter->level = level;
	iter->keys = &c->journal_keys;
	iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
						iter->b, &iter->unpacked);
}

static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
	if (bpos_eq(iter->pos, SPOS_MAX))
		iter->at_end = true;
	else
		iter->pos = bpos_successor(iter->pos);
}

static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
{
	struct btree_and_journal_iter iter = *_iter;
	struct bch_fs *c = iter.trans->c;
	unsigned level = iter.journal.level;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_started, &c->flags)
		? (level > 1 ? 0 : 2)
		: (level > 1 ? 1 : 16);

	iter.prefetch = false;
	bch2_bkey_buf_init(&tmp);

	while (nr--) {
		bch2_btree_and_journal_iter_advance(&iter);
		struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
	}

	bch2_bkey_buf_exit(&tmp, c);
}
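
/*
 * Merge the btree node iterator with the journal keys for that node: at each
 * position, peek both sources and return whichever key sorts first, preferring
 * the journal key on ties so journal updates override what's in the btree.
 * Deleted keys (whiteouts) are skipped, and keys past the node's max_key are
 * ignored:
 */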
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
	struct bkey_s_c btree_k, journal_k, ret;

	if (iter->prefetch && iter->journal.level)
		btree_and_journal_iter_prefetch(iter);
again:
	if (iter->at_end)
		return bkey_s_c_null;

	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
	       bpos_lt(btree_k.k->p, iter->pos))
		bch2_journal_iter_advance_btree(iter);

	while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
	       bpos_lt(journal_k.k->p, iter->pos))
		bch2_journal_iter_advance(&iter->journal);

	ret = journal_k.k &&
		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
		? journal_k
		: btree_k;

	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
		ret = bkey_s_c_null;

	if (ret.k) {
		iter->pos = ret.k->p;
		if (bkey_deleted(ret.k)) {
			bch2_btree_and_journal_iter_advance(iter);
			goto again;
		}
	} else {
		iter->pos = SPOS_MAX;
		iter->at_end = true;
	}

	return ret;
}

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
	bch2_journal_iter_exit(&iter->journal);
}

void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						  struct btree_and_journal_iter *iter,
						  struct btree *b,
						  struct btree_node_iter node_iter,
						  struct bpos pos)
{
	memset(iter, 0, sizeof(*iter));

	iter->trans = trans;
	iter->b = b;
	iter->node_iter = node_iter;
	bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
	INIT_LIST_HEAD(&iter->journal.list);
	iter->pos = b->data->min_key;
	iter->at_end = false;
}

/*
 * This version is used by btree_gc before the filesystem has gone RW and
 * multithreaded, so it uses the journal_iters list:
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
						struct btree_and_journal_iter *iter,
						struct btree *b)
{
	struct btree_node_iter node_iter;

	bch2_btree_node_iter_init_from_start(&node_iter, b);
	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
	list_add(&iter->journal.list, &trans->c->journal_iters);
}
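
/*
 * Illustrative sketch of how the combined iterator is used (assumes @trans and
 * @b are a valid btree_trans and btree node; this mirrors the gc/recovery
 * pattern of walking a node before the filesystem goes RW):
 *
 *	struct btree_and_journal_iter iter;
 *	struct bkey_s_c k;
 *
 *	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
 *	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 *		... process k, journal keys overriding btree keys ...
 *		bch2_btree_and_journal_iter_advance(&iter);
 *	}
 *	bch2_btree_and_journal_iter_exit(&iter);
 */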

/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
	struct journal_replay **i;
	struct genradix_iter iter;

	genradix_for_each(&c->journal_entries, iter, i)
		kvfree(*i);
	genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first, so that dedup in
 * __journal_keys_sort() keeps the newest version of each key:
 */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = _l;
	const struct journal_key *r = _r;

	return journal_key_cmp(l, r) ?:
		cmp_int(l->journal_seq, r->journal_seq) ?:
		cmp_int(l->journal_offset, r->journal_offset);
}

void bch2_journal_keys_put(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;

	BUG_ON(atomic_read(&keys->ref) <= 0);

	if (!atomic_dec_and_test(&keys->ref))
		return;

	move_gap(keys, keys->nr);

	darray_for_each(*keys, i)
		if (i->allocated)
			kfree(i->k);

	kvfree(keys->data);
	keys->data = NULL;
	keys->nr = keys->gap = keys->size = 0;

	bch2_journal_entries_free(c);
}

static void __journal_keys_sort(struct journal_keys *keys)
{
	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);

	struct journal_key *dst = keys->data;

	darray_for_each(*keys, src) {
		if (src + 1 < &darray_top(*keys) &&
		    !journal_key_cmp(src, src + 1))
			continue;

		*dst++ = *src;
	}

	keys->nr = dst - keys->data;
}

int bch2_journal_keys_sort(struct bch_fs *c)
{
	struct genradix_iter iter;
	struct journal_replay *i, **_i;
	struct journal_keys *keys = &c->journal_keys;
	size_t nr_read = 0;

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		cond_resched();

		for_each_jset_key(k, entry, &i->j) {
			struct journal_key n = (struct journal_key) {
				.btree_id = entry->btree_id,
				.level = entry->level,
				.k = k,
				.journal_seq = le64_to_cpu(i->j.seq),
				.journal_offset = k->_data - i->j._data,
			};

			if (darray_push(keys, n)) {
				__journal_keys_sort(keys);

				if (keys->nr * 8 > keys->size * 7) {
					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
						keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
					return -BCH_ERR_ENOMEM_journal_keys_sort;
				}

				BUG_ON(darray_push(keys, n));
			}

			nr_read++;
		}
	}

	__journal_keys_sort(keys);
	keys->gap = keys->nr;

	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
	return 0;
}