// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bset.h"
#include "btree_journal_iter.h"
#include "journal_io.h"

#include <linux/sort.h>

/*
 * For managing keys we read from the journal: until journal replay has
 * finished, normal btree lookups need to be able to find and return keys from
 * the journal where they overwrite what's in the btree, so we have a special
 * iterator and operations for the regular btree iter code to use:
 */

static int __journal_key_cmp(enum btree_id l_btree_id,
			     unsigned l_level,
			     struct bpos l_pos,
			     const struct journal_key *r)
{
	return (cmp_int(l_btree_id, r->btree_id) ?:
		cmp_int(l_level, r->level) ?:
		bpos_cmp(l_pos, r->k->k.p));
}

static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}

static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
	size_t gap_size = keys->size - keys->nr;

	if (idx >= keys->gap)
		idx += gap_size;
	return idx;
}

static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
	return keys->d + idx_to_pos(keys, idx);
}

static size_t __bch2_journal_key_search(struct journal_keys *keys,
					enum btree_id id, unsigned level,
					struct bpos pos)
{
	size_t l = 0, r = keys->nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
			l = m + 1;
		else
			r = m;
	}

	BUG_ON(l < keys->nr &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);

	BUG_ON(l &&
	       __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);

	return l;
}

static size_t bch2_journal_key_search(struct journal_keys *keys,
				      enum btree_id id, unsigned level,
				      struct bpos pos)
{
	return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}

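/*
 * Worked example of the gap buffer (values illustrative, not from the source):
 * with size == 8, nr == 6 and gap == 3, the backing array is laid out as
 *
 *	d: [k0][k1][k2][  ][  ][k3][k4][k5]
 *	     0   1   2 <--gap--> 3   4   5	<- logical index
 *
 * gap_size == size - nr == 2, so idx_to_pos() maps logical index 3 to array
 * position 5.  __bch2_journal_key_search() binary searches over logical
 * indices 0..nr and only touches the array through idx_to_key(), so the gap
 * is invisible to it.
 */
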
/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos,
					   struct bpos end_pos, size_t *idx)
{
	struct journal_keys *keys = &c->journal_keys;
	unsigned iters = 0;
	struct journal_key *k;

	BUG_ON(*idx > keys->nr);
search:
	if (!*idx)
		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);

	/*
	 * *idx may be a cached position from a previous lookup: walk it back
	 * while the preceding key is still at or past end_pos, falling back
	 * to a full search if that takes too long:
	 */
	while (*idx &&
	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
		--(*idx);
		iters++;
		if (iters == 10) {
			*idx = 0;
			goto search;
		}
	}

	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
		if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
			return NULL;

		if (k->overwritten) {
			(*idx)++;
			continue;
		}

		if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
			return k->k;

		(*idx)++;
		iters++;
		if (iters == 10) {
			/* Cached position was far off; redo the search: */
			*idx = 0;
			goto search;
		}
	}

	return NULL;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
					   unsigned level, struct bpos pos)
{
	size_t idx = 0;

	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}

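/*
 * Usage sketch (hypothetical caller; process() is assumed): @idx caches the
 * last position, so repeated lookups with a monotonically advancing @pos
 * usually skip the binary search:
 *
 *	size_t idx = 0;
 *	struct bkey_i *k;
 *
 *	while ((k = bch2_journal_keys_peek_upto(c, BTREE_ID_extents, 0,
 *						pos, end, &idx))) {
 *		process(k);
 *		pos = bpos_successor(k->k.p);
 *	}
 */
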
static void journal_iters_fix(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	/* The key we just inserted is immediately before the gap: */
	size_t gap_end = keys->gap + (keys->size - keys->nr);
	struct btree_and_journal_iter *iter;

	/*
	 * If an iterator points one after the key we just inserted, decrement
	 * the iterator so it points at the key we just inserted - if the
	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
	 * handle that:
	 */
	list_for_each_entry(iter, &c->journal_iters, journal.list)
		if (iter->journal.idx == gap_end)
			iter->journal.idx = keys->gap - 1;
}

static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_iter *iter;
	size_t gap_size = keys->size - keys->nr;

	list_for_each_entry(iter, &c->journal_iters, list) {
		if (iter->idx > old_gap)
			iter->idx -= gap_size;
		if (iter->idx >= new_gap)
			iter->idx += gap_size;
	}
}

int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
				 unsigned level, struct bkey_i *k)
{
	struct journal_key n = {
		.btree_id	= id,
		.level		= level,
		.k		= k,
		.allocated	= true,
		/*
		 * Ensure these keys are done last by journal replay, to unblock
		 * journal reclaim:
		 */
		.journal_seq	= U32_MAX,
	};
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);

	BUG_ON(test_bit(BCH_FS_rw, &c->flags));

	if (idx < keys->size &&
	    journal_key_cmp(&n, &keys->d[idx]) == 0) {
		if (keys->d[idx].allocated)
			kfree(keys->d[idx].k);
		keys->d[idx] = n;
		return 0;
	}

	if (idx > keys->gap)
		idx -= keys->size - keys->nr;

	if (keys->nr == keys->size) {
		struct journal_keys new_keys = {
			.nr	= keys->nr,
			.size	= max_t(size_t, keys->size, 8) * 2,
		};

		new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
		if (!new_keys.d) {
			bch_err(c, "%s: error allocating new key array (size %zu)",
				__func__, new_keys.size);
			return -BCH_ERR_ENOMEM_journal_key_insert;
		}

		/* Since @keys was full, there was no gap: */
		memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
		kvfree(keys->d);
		keys->d		= new_keys.d;
		keys->nr	= new_keys.nr;
		keys->size	= new_keys.size;

		/* And now the gap is at the end: */
		keys->gap	= keys->nr;
	}

	journal_iters_move_gap(c, keys->gap, idx);

	move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
	keys->gap = idx;

	keys->nr++;
	keys->d[keys->gap++] = n;

	journal_iters_fix(c);

	return 0;
}

/*
 * Can only be used from the recovery thread while we're still RO - can't be
 * used once we've gone RW, as journal_keys is at that point used by multiple
 * threads:
 */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bkey_i *k)
{
	struct bkey_i *n;
	int ret;

	n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_key_insert;

	bkey_copy(n, k);
	ret = bch2_journal_key_insert_take(c, id, level, n);
	if (ret)
		kfree(n);
	return ret;
}

int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bpos pos)
{
	struct bkey_i whiteout;

	bkey_init(&whiteout.k);
	whiteout.k.p = pos;

	return bch2_journal_key_insert(c, id, level, &whiteout);
}

void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
				  unsigned level, struct bpos pos)
{
	struct journal_keys *keys = &c->journal_keys;
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	if (idx < keys->size &&
	    keys->d[idx].btree_id == btree &&
	    keys->d[idx].level == level &&
	    bpos_eq(keys->d[idx].k->k.p, pos))
		keys->d[idx].overwritten = true;
}

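/*
 * Usage sketch (hypothetical fsck-style fixup, run from the recovery thread
 * while still RO; new_key and bad_pos are assumed): replace the version of a
 * key that replay will write, or drop one with a whiteout:
 *
 *	ret = bch2_journal_key_insert(c, BTREE_ID_alloc, 0, new_key) ?:
 *	      bch2_journal_key_delete(c, BTREE_ID_alloc, 0, bad_pos);
 *
 * Subsequent journal lookups see the new key immediately, and journal replay
 * eventually writes it to the btree.
 */
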
static void bch2_journal_iter_advance(struct journal_iter *iter)
{
	if (iter->idx < iter->keys->size) {
		iter->idx++;
		if (iter->idx == iter->keys->gap)
			iter->idx += iter->keys->size - iter->keys->nr;
	}
}

static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
	struct journal_key *k = iter->keys->d + iter->idx;

	while (k < iter->keys->d + iter->keys->size &&
	       k->btree_id == iter->btree_id &&
	       k->level == iter->level) {
		if (!k->overwritten)
			return bkey_i_to_s_c(k->k);

		bch2_journal_iter_advance(iter);
		k = iter->keys->d + iter->idx;
	}

	return bkey_s_c_null;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
{
	list_del(&iter->list);
}

static void bch2_journal_iter_init(struct bch_fs *c,
				   struct journal_iter *iter,
				   enum btree_id id, unsigned level,
				   struct bpos pos)
{
	iter->btree_id	= id;
	iter->level	= level;
	iter->keys	= &c->journal_keys;
	iter->idx	= bch2_journal_key_search(&c->journal_keys, id, level, pos);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
						iter->b, &iter->unpacked);
}

static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}

void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
{
	if (bpos_eq(iter->pos, SPOS_MAX))
		iter->at_end = true;
	else
		iter->pos = bpos_successor(iter->pos);
}

struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
	struct bkey_s_c btree_k, journal_k, ret;
again:
	if (iter->at_end)
		return bkey_s_c_null;

	while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
	       bpos_lt(btree_k.k->p, iter->pos))
		bch2_journal_iter_advance_btree(iter);

	while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
	       bpos_lt(journal_k.k->p, iter->pos))
		bch2_journal_iter_advance(&iter->journal);

	/* On a tie, the journal key overrides the btree key: */
	ret = journal_k.k &&
		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
		? journal_k
		: btree_k;

	if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
		ret = bkey_s_c_null;

	if (ret.k) {
		iter->pos = ret.k->p;
		if (bkey_deleted(ret.k)) {
			bch2_btree_and_journal_iter_advance(iter);
			goto again;
		}
	} else {
		iter->pos = SPOS_MAX;
		iter->at_end = true;
	}

	return ret;
}

void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
{
	bch2_journal_iter_exit(&iter->journal);
}

void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
						  struct bch_fs *c,
						  struct btree *b,
						  struct btree_node_iter node_iter,
						  struct bpos pos)
{
	memset(iter, 0, sizeof(*iter));

	iter->b = b;
	iter->node_iter = node_iter;
	bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
	INIT_LIST_HEAD(&iter->journal.list);
	iter->pos = b->data->min_key;
	iter->at_end = false;
}

/*
 * This version is used by btree_gc before the filesystem has gone RW and
 * multithreaded, so it uses the journal_iters list:
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
						struct bch_fs *c,
						struct btree *b)
{
	struct btree_node_iter node_iter;

	bch2_btree_node_iter_init_from_start(&node_iter, b);
	__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
	list_add(&iter->journal.list, &c->journal_iters);
}

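/*
 * Usage sketch (the pattern btree_gc follows; process() is assumed): walk a
 * btree node with any unreplayed journal keys overlaid on top of the node's
 * own keys:
 *
 *	struct btree_and_journal_iter iter;
 *	struct bkey_s_c k;
 *
 *	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
 *	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 *		process(k);
 *		bch2_btree_and_journal_iter_advance(&iter);
 *	}
 *	bch2_btree_and_journal_iter_exit(&iter);
 */
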
/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
	struct journal_replay **i;
	struct genradix_iter iter;

	genradix_for_each(&c->journal_entries, iter, i)
		if (*i)
			kvpfree(*i, offsetof(struct journal_replay, j) +
				vstruct_bytes(&(*i)->j));
	genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first:
 */
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = _l;
	const struct journal_key *r = _r;

	return journal_key_cmp(l, r) ?:
		cmp_int(l->journal_seq, r->journal_seq) ?:
		cmp_int(l->journal_offset, r->journal_offset);
}

void bch2_journal_keys_put(struct bch_fs *c)
{
	struct journal_keys *keys = &c->journal_keys;
	struct journal_key *i;

	BUG_ON(atomic_read(&keys->ref) <= 0);

	if (!atomic_dec_and_test(&keys->ref))
		return;

	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
	keys->gap = keys->nr;

	for (i = keys->d; i < keys->d + keys->nr; i++)
		if (i->allocated)
			kfree(i->k);

	kvfree(keys->d);
	keys->d = NULL;
	keys->nr = keys->gap = keys->size = 0;

	bch2_journal_entries_free(c);
}

static void __journal_keys_sort(struct journal_keys *keys)
{
	struct journal_key *src, *dst;

	sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);

	src = dst = keys->d;
	while (src < keys->d + keys->nr) {
		while (src + 1 < keys->d + keys->nr &&
		       !journal_key_cmp(src, src + 1))
			src++;

		*dst++ = *src++;
	}

	keys->nr = dst - keys->d;
}

int bch2_journal_keys_sort(struct bch_fs *c)
{
	struct genradix_iter iter;
	struct journal_replay *i, **_i;
	struct jset_entry *entry;
	struct bkey_i *k;
	struct journal_keys *keys = &c->journal_keys;
	size_t nr_keys = 0, nr_read = 0;

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		for_each_jset_key(k, entry, &i->j)
			nr_keys++;
	}

	if (!nr_keys)
		return 0;

	keys->size = roundup_pow_of_two(nr_keys);

	keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
	if (!keys->d) {
		bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
			nr_keys);

		do {
			keys->size >>= 1;
			keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
		} while (!keys->d && keys->size > nr_keys / 8);

		if (!keys->d) {
			bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
				keys->size);
			return -BCH_ERR_ENOMEM_journal_keys_sort;
		}
	}

	genradix_for_each(&c->journal_entries, iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		cond_resched();

		for_each_jset_key(k, entry, &i->j) {
			if (keys->nr == keys->size) {
				__journal_keys_sort(keys);

				if (keys->nr > keys->size * 7 / 8) {
					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
						keys->nr, keys->size, nr_read, nr_keys);
					return -BCH_ERR_ENOMEM_journal_keys_sort;
				}
			}

			keys->d[keys->nr++] = (struct journal_key) {
				.btree_id	= entry->btree_id,
				.level		= entry->level,
				.k		= k,
				.journal_seq	= le64_to_cpu(i->j.seq),
				.journal_offset	= k->_data - i->j._data,
			};

			nr_read++;
		}
	}

	__journal_keys_sort(keys);
	keys->gap = keys->nr;

	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
	return 0;
}
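
/*
 * Dedup example (values illustrative): journal_sort_key_cmp() orders equal
 * keys oldest first, and the inner loop of __journal_keys_sort() skips to the
 * last entry of each equal run, so given
 *
 *	(extents, level 0, pos 5, journal_seq 10)
 *	(extents, level 0, pos 5, journal_seq 12)
 *
 * only the journal_seq 12 entry survives: the newest version of each key
 * wins.
 */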