// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "replicas.h"
#include "snapshot.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/prefetch.h>

static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
				       struct btree_path *);

static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
#ifdef TRACK_PATH_ALLOCATED
	return iter->ip_allocated;
#else
	return 0;
#endif
}

static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);

static inline int __btree_path_cmp(const struct btree_path *l,
				   enum btree_id r_btree_id,
				   bool r_cached,
				   struct bpos r_pos,
				   unsigned r_level)
{
	/*
	 * Must match lock ordering as defined by __bch2_btree_node_lock:
	 */
	return cmp_int(l->btree_id, r_btree_id) ?:
	       cmp_int((int) l->cached, (int) r_cached) ?:
	       bpos_cmp(l->pos, r_pos) ?:
	      -cmp_int(l->level, r_level);
}

static inline int btree_path_cmp(const struct btree_path *l,
				 const struct btree_path *r)
{
	return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
}

static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_successor(p);
	} else {
		p = bpos_nosnap_successor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}

static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_predecessor(p);
	} else {
		p = bpos_nosnap_predecessor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}

static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
	struct bpos pos = iter->pos;

	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	    !bkey_eq(pos, POS_MAX))
		pos = bkey_successor(iter, pos);
	return pos;
}

static inline bool btree_path_pos_before_node(struct btree_path *path,
					      struct btree *b)
{
	return bpos_lt(path->pos, b->data->min_key);
}

static inline bool btree_path_pos_after_node(struct btree_path *path,
					     struct btree *b)
{
	return bpos_gt(path->pos, b->key.k.p);
}

static inline bool btree_path_pos_in_node(struct btree_path *path,
					  struct btree *b)
{
	return path->btree_id == b->c.btree_id &&
		!btree_path_pos_before_node(path, b) &&
		!btree_path_pos_after_node(path, b);
}

/* Btree iterator: */

#ifdef CONFIG_BCACHEFS_DEBUG

static void bch2_btree_path_verify_cached(struct btree_trans *trans,
					  struct btree_path *path)
{
	struct bkey_cached *ck;
	bool locked = btree_node_locked(path, 0);

	if (!bch2_btree_node_relock(trans, path, 0))
		return;

	ck = (void *) path->l[0].b;
	BUG_ON(ck->key.btree_id != path->btree_id ||
	       !bkey_eq(ck->key.pos, path->pos));

	if (!locked)
		btree_node_unlock(trans, path, 0);
}

static void bch2_btree_path_verify_level(struct btree_trans *trans,
					 struct btree_path *path, unsigned level)
{
	struct btree_path_level *l;
	struct btree_node_iter tmp;
	bool locked;
	struct bkey_packed *p, *k;
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	struct printbuf buf3 = PRINTBUF;
	const char *msg;

	if (!bch2_debug_check_iterators)
		return;

	l = &path->l[level];
	tmp = l->iter;
	locked = btree_node_locked(path, level);

	if (path->cached) {
		if (!level)
			bch2_btree_path_verify_cached(trans, path);
		return;
	}

	if (!btree_path_node(path, level))
		return;

	if (!bch2_btree_node_relock_notrace(trans, path, level))
		return;

	BUG_ON(!btree_path_pos_in_node(path, l->b));

	bch2_btree_node_iter_verify(&l->iter, l->b);

	/*
	 * For interior nodes, the iterator will have skipped past deleted keys:
	 */
	p = level
		? bch2_btree_node_iter_prev(&tmp, l->b)
		: bch2_btree_node_iter_prev_all(&tmp, l->b);
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);

	if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
		msg = "before";
		goto err;
	}

	if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		msg = "after";
		goto err;
	}

	if (!locked)
		btree_node_unlock(trans, path, level);
	return;
err:
	bch2_bpos_to_text(&buf1, path->pos);

	if (p) {
		struct bkey uk = bkey_unpack_key(l->b, p);

		bch2_bkey_to_text(&buf2, &uk);
	} else {
		prt_printf(&buf2, "(none)");
	}

	if (k) {
		struct bkey uk = bkey_unpack_key(l->b, k);

		bch2_bkey_to_text(&buf3, &uk);
	} else {
		prt_printf(&buf3, "(none)");
	}

	panic("path should be %s key at level %u:\n"
	      "path pos %s\n"
	      "prev key %s\n"
	      "cur key %s\n",
	      msg, level, buf1.buf, buf2.buf, buf3.buf);
}

static void bch2_btree_path_verify(struct btree_trans *trans,
				   struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	unsigned i;

	EBUG_ON(path->btree_id >= BTREE_ID_NR);

	for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
		if (!path->l[i].b) {
			BUG_ON(!path->cached &&
			       bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
			break;
		}

		bch2_btree_path_verify_level(trans, path, i);
	}

	bch2_btree_path_verify_locks(path);
}

void bch2_trans_verify_paths(struct btree_trans *trans)
{
	struct btree_path *path;

	trans_for_each_path(trans, path)
		bch2_btree_path_verify(trans, path);
}

static void bch2_btree_iter_verify(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;

	BUG_ON(iter->btree_id >= BTREE_ID_NR);

	BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);

	BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));

	BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       !btree_type_has_snapshots(iter->btree_id));

	if (iter->update_path)
		bch2_btree_path_verify(trans, iter->update_path);
	bch2_btree_path_verify(trans, iter->path);
}

static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
	BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
	       !iter->pos.snapshot);

	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       iter->pos.snapshot != iter->snapshot);

	BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
	       bkey_gt(iter->pos, iter->k.p));
}

static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
{
	struct btree_trans *trans = iter->trans;
	struct btree_iter copy;
	struct bkey_s_c prev;
	int ret = 0;

	if (!bch2_debug_check_iterators)
		return 0;

	if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
		return 0;

	if (bkey_err(k) || !k.k)
		return 0;

	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
					  iter->snapshot,
					  k.k->p.snapshot));

	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
			     BTREE_ITER_NOPRESERVE|
			     BTREE_ITER_ALL_SNAPSHOTS);
	prev = bch2_btree_iter_prev(&copy);
	if (!prev.k)
		goto out;

	ret = bkey_err(prev);
	if (ret)
		goto out;

	if (bkey_eq(prev.k->p, k.k->p) &&
	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
				      prev.k->p.snapshot) > 0) {
		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

		bch2_bkey_to_text(&buf1, k.k);
		bch2_bkey_to_text(&buf2, prev.k);

		panic("iter snap %u\n"
		      "k %s\n"
		      "prev %s\n",
		      iter->snapshot,
		      buf1.buf, buf2.buf);
	}
out:
	bch2_trans_iter_exit(trans, &copy);
	return ret;
}

void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
			    struct bpos pos, bool key_cache)
{
	struct btree_path *path;
	unsigned idx;
	struct printbuf buf = PRINTBUF;

	btree_trans_sort_paths(trans);

	trans_for_each_path_inorder(trans, path, idx) {
		int cmp = cmp_int(path->btree_id, id) ?:
			cmp_int(path->cached, key_cache);

		if (cmp > 0)
			break;
		if (cmp < 0)
			continue;

		if (!btree_node_locked(path, 0) ||
		    !path->should_be_locked)
			continue;

		if (!key_cache) {
			if (bkey_ge(pos, path->l[0].b->data->min_key) &&
			    bkey_le(pos, path->l[0].b->key.k.p))
				return;
		} else {
			if (bkey_eq(pos, path->pos))
				return;
		}
	}

	bch2_dump_trans_paths_updates(trans);
	bch2_bpos_to_text(&buf, pos);

	panic("not locked: %s %s%s\n",
	      bch2_btree_ids[id], buf.buf,
	      key_cache ? " cached" : "");
}

#else

static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
						struct btree_path *path, unsigned l) {}
static inline void bch2_btree_path_verify(struct btree_trans *trans,
					  struct btree_path *path) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }

#endif

/* Btree path: fixups after btree updates */

static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
					struct btree *b,
					struct bset_tree *t,
					struct bkey_packed *k)
{
	struct btree_node_iter_set *set;

	btree_node_iter_for_each(iter, set)
		if (set->end == t->end_offset) {
			set->k = __btree_node_key_to_offset(b, k);
			bch2_btree_node_iter_sort(iter, b);
			return;
		}

	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}

static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
					       struct btree *b,
					       struct bkey_packed *where)
{
	struct btree_path_level *l = &path->l[b->c.level];

	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
		return;

	if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);
}

void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
				      struct btree *b,
				      struct bkey_packed *where)
{
	struct btree_path *path;

	trans_for_each_path_with_node(trans, b, path) {
		__bch2_btree_path_fix_key_modified(path, b, where);
		bch2_btree_path_verify_level(trans, path, b->c.level);
	}
}

static void __bch2_btree_node_iter_fix(struct btree_path *path,
				       struct btree *b,
				       struct btree_node_iter *node_iter,
				       struct bset_tree *t,
				       struct bkey_packed *where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	unsigned old_end = t->end_offset - shift;
	unsigned orig_iter_pos = node_iter->data[0].k;
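	/*
	 * Was the node iterator pointing into the region of the bset that just
	 * got overwritten? If so, the fixup below may need to rewind it:
	 */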
	bool iter_current_key_modified =
		orig_iter_pos >= offset &&
		orig_iter_pos <= offset + clobber_u64s;

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to readd it: */
	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		bch2_btree_node_iter_push(node_iter, b, where, end);
		goto fixup_done;
	} else {
		/* Iterator is after key that changed */
		return;
	}
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		/* Iterator is after key that changed */
		set->k = (int) set->k + shift;
		return;
	}

	bch2_btree_node_iter_sort(node_iter, b);
fixup_done:
	if (node_iter->data[0].k != orig_iter_pos)
		iter_current_key_modified = true;

	/*
	 * When a new key is added, and the node iterator now points to that
	 * key, the iterator might have skipped past deleted keys that should
	 * come after the key the iterator now points to. We have to rewind to
	 * before those deleted keys - otherwise
	 * bch2_btree_node_iter_prev_all() breaks:
	 */
	if (!bch2_btree_node_iter_end(node_iter) &&
	    iter_current_key_modified &&
	    b->c.level) {
		struct bkey_packed *k, *k2, *p;

		k = bch2_btree_node_iter_peek_all(node_iter, b);

		for_each_bset(b, t) {
			bool set_pos = false;

			if (node_iter->data[0].end == t->end_offset)
				continue;

			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);

			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
			       bkey_iter_cmp(b, k, p) < 0) {
				k2 = p;
				set_pos = true;
			}

			if (set_pos)
				btree_node_iter_set_set_pos(node_iter,
							    b, t, k2);
		}
	}
}

void bch2_btree_node_iter_fix(struct btree_trans *trans,
			      struct btree_path *path,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_path *linked;

	if (node_iter != &path->l[b->c.level].iter) {
		__bch2_btree_node_iter_fix(path, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

		if (bch2_debug_check_iterators)
			bch2_btree_node_iter_verify(node_iter, b);
	}

	trans_for_each_path_with_node(trans, b, linked) {
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->c.level].iter, t,
					   where, clobber_u64s, new_u64s);
		bch2_btree_path_verify_level(trans, linked, b->c.level);
	}
}

/* Btree path level: pointer to a particular btree node and node iter */

static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
						  struct btree_path_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	return bkey_disassemble(l->b, k, u);
}

static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
							struct btree_path_level *l,
							struct bkey *u)
{
	return __btree_iter_unpack(c, l, u,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,
						    struct bkey *u)
{
	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
			bch2_btree_node_iter_peek(&l->iter, l->b));

	path->pos = k.k ? k.k->p : l->b->key.k.p;
	trans->paths_sorted = false;
	bch2_btree_path_verify_level(trans, path, l - path->l);
	return k;
}

static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,
						    struct bkey *u)
{
	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
			bch2_btree_node_iter_prev(&l->iter, l->b));

	path->pos = k.k ? k.k->p : l->b->data->min_key;
	trans->paths_sorted = false;
	bch2_btree_path_verify_level(trans, path, l - path->l);
	return k;
}

static inline bool btree_path_advance_to_pos(struct btree_path *path,
					     struct btree_path_level *l,
					     int max_advance)
{
	struct bkey_packed *k;
	int nr_advanced = 0;

	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
	       bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;

		bch2_btree_node_iter_advance(&l->iter, l->b);
		nr_advanced++;
	}

	return true;
}

static inline void __btree_path_level_init(struct btree_path *path,
					   unsigned level)
{
	struct btree_path_level *l = &path->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);

	/*
	 * Iterators to interior nodes should always be pointed at the first non
	 * whiteout:
	 */
	if (level)
		bch2_btree_node_iter_peek(&l->iter, l->b);
}

void bch2_btree_path_level_init(struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b)
{
	BUG_ON(path->cached);

	EBUG_ON(!btree_path_pos_in_node(path, b));

	path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
	path->l[b->c.level].b = b;
	__btree_path_level_init(path, b->c.level);
}

/* Btree path: fixups after btree node updates: */

static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;

	trans_for_each_update(trans, i)
		if (!i->cached &&
		    i->level == b->c.level &&
		    i->btree_id == b->c.btree_id &&
		    bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
		    bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
			i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;

			if (unlikely(trans->journal_replay_not_finished)) {
				struct bkey_i *j_k =
					bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
								    i->k->k.p);

				if (j_k) {
					i->old_k = j_k->k;
					i->old_v = &j_k->v;
				}
			}
		}
}

/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 */
void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
{
	struct btree_path *path;

	trans_for_each_path(trans, path)
		if (path->uptodate == BTREE_ITER_UPTODATE &&
		    !path->cached &&
		    btree_path_pos_in_node(path, b)) {
			enum btree_node_locked_type t =
				btree_lock_want(path, b->c.level);

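			/*
			 * If this path had the old node locked at this level,
			 * carry that lock over to the new node (whose lock the
			 * caller already holds) before repointing the path:
			 */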
			if (t != BTREE_NODE_UNLOCKED) {
				btree_node_unlock(trans, path, b->c.level);
				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
				mark_btree_node_locked(trans, path, b->c.level, t);
			}

			bch2_btree_path_level_init(trans, path, b);
		}

	bch2_trans_revalidate_updates_in_node(trans, b);
}

/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
	struct btree_path *path;

	trans_for_each_path_with_node(trans, b, path)
		__btree_path_level_init(path, b->c.level);

	bch2_trans_revalidate_updates_in_node(trans, b);
}

/* Btree path: traverse, set_pos: */

static inline int btree_path_lock_root(struct btree_trans *trans,
				       struct btree_path *path,
				       unsigned depth_want,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
	enum six_lock_type lock_type;
	unsigned i;
	int ret;

	EBUG_ON(path->nodes_locked);

	while (1) {
		b = READ_ONCE(*rootp);
		path->level = READ_ONCE(b->c.level);

		if (unlikely(path->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			path->level = depth_want;
			for (i = path->level; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(path, path->level);
		ret = btree_node_lock(trans, path, &b->c,
				      path->level, lock_type, trace_ip);
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				return ret;
			BUG();
		}

		if (likely(b == READ_ONCE(*rootp) &&
			   b->c.level == path->level &&
			   !race_fault())) {
			for (i = 0; i < path->level; i++)
				path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
			path->l[path->level].b = b;
			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;

			mark_btree_node_locked(trans, path, path->level,
					       (enum btree_node_locked_type) lock_type);
			bch2_btree_path_level_init(trans, path, b);
			return 0;
		}

		six_unlock_type(&b->c.lock, lock_type);
	}
}

noinline
static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (path->level > 1 ? 0 : 2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_node_iter_advance(&node_iter, l->b);
		k = bch2_btree_node_iter_peek(&node_iter, l->b);
		if (!k)
			break;

		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(trans, path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
				 struct btree_and_journal_iter *jiter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (path->level > 1 ? 0 : 2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_and_journal_iter_advance(jiter);
		k = bch2_btree_and_journal_iter_peek(jiter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(trans, path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
					    struct btree_path *path,
					    unsigned plevel, struct btree *b)
{
	struct btree_path_level *l = &path->l[plevel];
	bool locked = btree_node_locked(path, plevel);
	struct bkey_packed *k;
	struct bch_btree_ptr_v2 *bp;

	if (!bch2_btree_node_relock(trans, path, plevel))
		return;

	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);

	bp = (void *) bkeyp_val(&l->b->format, k);
	bp->mem_ptr = (unsigned long)b;

	if (!locked)
		btree_node_unlock(trans, path, plevel);
}

static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
						     struct btree_path *path,
						     unsigned flags,
						     struct bkey_buf *out)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_and_journal_iter jiter;
	struct bkey_s_c k;
	int ret = 0;

	__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);

	k = bch2_btree_and_journal_iter_peek(&jiter);

	bch2_bkey_buf_reassemble(out, c, k);

	if (flags & BTREE_ITER_PREFETCH)
		ret = btree_path_prefetch_j(trans, path, &jiter);

	bch2_btree_and_journal_iter_exit(&jiter);
	return ret;
}

static __always_inline int btree_path_down(struct btree_trans *trans,
					   struct btree_path *path,
					   unsigned flags,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree *b;
	unsigned level = path->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(path, level);
	struct bkey_buf tmp;
	int ret;

	EBUG_ON(!btree_node_locked(path, path->level));

	bch2_bkey_buf_init(&tmp);

	if (unlikely(trans->journal_replay_not_finished)) {
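		/*
		 * Journal replay hasn't finished: the key pointing to the child
		 * node may only exist in the journal, so peek the node iterator
		 * and the journal keys together:
		 */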
		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
		if (ret)
			goto err;
	} else {
		bch2_bkey_buf_unpack(&tmp, c, l->b,
				     bch2_btree_node_iter_peek(&l->iter, l->b));

		if (flags & BTREE_ITER_PREFETCH) {
			ret = btree_path_prefetch(trans, path);
			if (ret)
				goto err;
		}
	}

	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
	ret = PTR_ERR_OR_ZERO(b);
	if (unlikely(ret))
		goto err;

	if (likely(!trans->journal_replay_not_finished &&
		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
	    unlikely(b != btree_node_mem_ptr(tmp.k)))
		btree_node_mem_ptr_set(trans, path, level + 1, b);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	mark_btree_node_locked(trans, path, level,
			       (enum btree_node_locked_type) lock_type);
	path->level = level;
	bch2_btree_path_level_init(trans, path, b);

	bch2_btree_path_verify_locks(path);
err:
	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}


static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_path *path;
	unsigned long trace_ip = _RET_IP_;
	int i, ret = 0;

	if (trans->in_traverse_all)
		return -BCH_ERR_transaction_restart_in_traverse_all;

	trans->in_traverse_all = true;
retry_all:
	trans->restarted = 0;
	trans->last_restarted_ip = 0;

	trans_for_each_path(trans, path)
		path->should_be_locked = false;

	btree_trans_sort_paths(trans);

	bch2_trans_unlock(trans);
	cond_resched();

	if (unlikely(trans->memory_allocation_failure)) {
		struct closure cl;

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
			closure_sync(&cl);
		} while (ret);
	}

	/* Now, redo traversals in correct order: */
	i = 0;
	while (i < trans->nr_sorted) {
		path = trans->paths + trans->sorted[i];

		/*
		 * Traversing a path can cause another path to be added at about
		 * the same position:
		 */
		if (path->uptodate) {
			__btree_path_get(path, false);
			ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
			__btree_path_put(path, false);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret, ENOMEM))
				goto retry_all;
			if (ret)
				goto err;
		} else {
			i++;
		}
	}

	/*
	 * We used to assert that all paths had been traversed here
	 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
	 * path->should_be_locked is not set yet, we might have unlocked and
	 * then failed to relock a path - that's fine.
	 */
err:
	bch2_btree_cache_cannibalize_unlock(c);

	trans->in_traverse_all = false;

	trace_and_count(c, trans_traverse_all, trans, trace_ip);
	return ret;
}

static inline bool btree_path_check_pos_in_node(struct btree_path *path,
						unsigned l, int check_pos)
{
	if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
		return false;
	if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
		return false;
	return true;
}

static inline bool btree_path_good_node(struct btree_trans *trans,
					struct btree_path *path,
					unsigned l, int check_pos)
{
	return is_btree_node(path, l) &&
		bch2_btree_node_relock(trans, path, l) &&
		btree_path_check_pos_in_node(path, l, check_pos);
}

static void btree_path_set_level_down(struct btree_trans *trans,
				      struct btree_path *path,
				      unsigned new_level)
{
	unsigned l;

	path->level = new_level;

	for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
		if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(trans, path, l);

	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
	bch2_btree_path_verify(trans, path);
}

static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
							 struct btree_path *path,
							 int check_pos)
{
	unsigned i, l = path->level;
again:
	while (btree_path_node(path, l) &&
	       !btree_path_good_node(trans, path, l, check_pos))
		__btree_path_set_level_up(trans, path, l++);

	/* If we need intent locks, take them too: */
	for (i = l + 1;
	     i < path->locks_want && btree_path_node(path, i);
	     i++)
		if (!bch2_btree_node_relock(trans, path, i)) {
			while (l <= i)
				__btree_path_set_level_up(trans, path, l++);
			goto again;
		}

	return l;
}

static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
						     struct btree_path *path,
						     int check_pos)
{
	return likely(btree_node_locked(path, path->level) &&
		      btree_path_check_pos_in_node(path, path->level, check_pos))
		? path->level
		: __btree_path_up_until_good_node(trans, path, check_pos);
}

/*
 * This is the main state machine for walking down the btree - walks down to a
 * specified depth
 *
 * Returns 0 on success, -EIO on error (error reading in a btree node).
 *
 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
 * stashed in the iterator and returned from bch2_trans_exit().
 */
int bch2_btree_path_traverse_one(struct btree_trans *trans,
				 struct btree_path *path,
				 unsigned flags,
				 unsigned long trace_ip)
{
	unsigned depth_want = path->level;
	int ret = -((int) trans->restarted);

	if (unlikely(ret))
		goto out;

	/*
	 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
	 * and re-traverse the path without a transaction restart:
	 */
	if (path->should_be_locked) {
		ret = bch2_btree_path_relock(trans, path, trace_ip);
		goto out;
	}

	if (path->cached) {
		ret = bch2_btree_path_traverse_cached(trans, path, flags);
		goto out;
	}

	if (unlikely(path->level >= BTREE_MAX_DEPTH))
		goto out;

	path->level = btree_path_up_until_good_node(trans, path, 0);

	EBUG_ON(btree_path_node(path, path->level) &&
		!btree_node_locked(path, path->level));

	/*
	 * Note: path->nodes[path->level] may be temporarily NULL here - that
	 * would indicate to other code that we got to the end of the btree,
	 * here it indicates that relocking the root failed - it's critical that
	 * btree_path_lock_root() comes next and that it can't fail
	 */
	while (path->level > depth_want) {
		ret = btree_path_node(path, path->level)
			? btree_path_down(trans, path, flags, trace_ip)
			: btree_path_lock_root(trans, path, depth_want, trace_ip);
		if (unlikely(ret)) {
			if (ret == 1) {
				/*
				 * No nodes at this level - got to the end of
				 * the btree:
				 */
				ret = 0;
				goto out;
			}

			__bch2_btree_path_unlock(trans, path);
			path->level = depth_want;
			path->l[path->level].b = ERR_PTR(ret);
			goto out;
		}
	}

	path->uptodate = BTREE_ITER_UPTODATE;
out:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
		panic("ret %s (%i) trans->restarted %s (%i)\n",
		      bch2_err_str(ret), ret,
		      bch2_err_str(trans->restarted), trans->restarted);
	bch2_btree_path_verify(trans, path);
	return ret;
}

static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
				   struct btree_path *src)
{
	unsigned i, offset = offsetof(struct btree_path, pos);

	memcpy((void *) dst + offset,
	       (void *) src + offset,
	       sizeof(struct btree_path) - offset);

	for (i = 0; i < BTREE_MAX_DEPTH; i++) {
		unsigned t = btree_node_locked_type(dst, i);

		if (t != BTREE_NODE_UNLOCKED)
			six_lock_increment(&dst->l[i].b->c.lock, t);
	}
}

static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
					   bool intent)
{
	struct btree_path *new = btree_path_alloc(trans, src);

	btree_path_copy(trans, new, src);
	__btree_path_get(new, intent);
	return new;
}

__flatten
struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
					      struct btree_path *path, bool intent,
					      unsigned long ip)
{
	__btree_path_put(path, intent);
	path = btree_path_clone(trans, path, intent);
	path->preserve = false;
	return path;
}

struct btree_path * __must_check
__bch2_btree_path_set_pos(struct btree_trans *trans,
			  struct btree_path *path, struct bpos new_pos,
			  bool intent, unsigned long ip, int cmp)
{
	unsigned level = path->level;

	bch2_trans_verify_not_in_restart(trans);
	EBUG_ON(!path->ref);

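	/*
	 * Get a version of this path we're allowed to modify, cloning it if
	 * it's also in use elsewhere:
	 */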
	path = bch2_btree_path_make_mut(trans, path, intent, ip);

	path->pos = new_pos;
	trans->paths_sorted = false;

	if (unlikely(path->cached)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		goto out;
	}

	level = btree_path_up_until_good_node(trans, path, cmp);

	if (btree_path_node(path, level)) {
		struct btree_path_level *l = &path->l[level];

		BUG_ON(!btree_node_locked(path, level));
		/*
		 * We might have to skip over many keys, or just a few: try
		 * advancing the node iterator, and if we have to skip over too
		 * many keys just reinit it (or if we're rewinding, since that
		 * is expensive).
		 */
		if (cmp < 0 ||
		    !btree_path_advance_to_pos(path, l, 8))
			bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);

		/*
		 * Iterators to interior nodes should always be pointed at the first non
		 * whiteout:
		 */
		if (unlikely(level))
			bch2_btree_node_iter_peek(&l->iter, l->b);
	}

	if (unlikely(level != path->level)) {
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		__bch2_btree_path_unlock(trans, path);
	}
out:
	bch2_btree_path_verify(trans, path);
	return path;
}

/* Btree path: main interface: */

static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
{
	struct btree_path *sib;

	sib = prev_btree_path(trans, path);
	if (sib && !btree_path_cmp(sib, path))
		return sib;

	sib = next_btree_path(trans, path);
	if (sib && !btree_path_cmp(sib, path))
		return sib;

	return NULL;
}

static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
{
	struct btree_path *sib;

	sib = prev_btree_path(trans, path);
	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
		return sib;

	sib = next_btree_path(trans, path);
	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
		return sib;

	return NULL;
}

static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
{
	__bch2_btree_path_unlock(trans, path);
	btree_path_list_remove(trans, path);
	trans->paths_allocated &= ~(1ULL << path->idx);
}

void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
{
	struct btree_path *dup;

	EBUG_ON(trans->paths + path->idx != path);
	EBUG_ON(!path->ref);

	if (!__btree_path_put(path, intent))
		return;

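	/*
	 * That was the last ref: look for another path at the same position
	 * (or on the same node) that can inherit this path's preserve /
	 * should_be_locked state before we free it.
	 */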
	dup = path->preserve
		? have_path_at_pos(trans, path)
		: have_node_at_pos(trans, path);

	if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
		return;

	if (path->should_be_locked &&
	    !trans->restarted &&
	    (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
		return;

	if (dup) {
		dup->preserve |= path->preserve;
		dup->should_be_locked |= path->should_be_locked;
	}

	__bch2_path_free(trans, path);
}

static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
				 bool intent)
{
	EBUG_ON(trans->paths + path->idx != path);
	EBUG_ON(!path->ref);

	if (!__btree_path_put(path, intent))
		return;

	__bch2_path_free(trans, path);
}

void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
{
	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
	      trans->restart_count, restart_count,
	      (void *) trans->last_begin_ip);
}

void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
	panic("in transaction restart: %s, last restarted by %pS\n",
	      bch2_err_str(trans->restarted),
	      (void *) trans->last_restarted_ip);
}

noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
	struct btree_insert_entry *i;
	struct btree_write_buffered_key *wb;

	prt_printf(buf, "transaction updates for %s journal seq %llu",
		   trans->fn, trans->journal_res.seq);
	prt_newline(buf);
	printbuf_indent_add(buf, 2);

	trans_for_each_update(trans, i) {
		struct bkey_s_c old = { &i->old_k, i->old_v };

		prt_printf(buf, "update: btree=%s cached=%u %pS",
			   bch2_btree_ids[i->btree_id],
			   i->cached,
			   (void *) i->ip_allocated);
		prt_newline(buf);

		prt_printf(buf, " old ");
		bch2_bkey_val_to_text(buf, trans->c, old);
		prt_newline(buf);

		prt_printf(buf, " new ");
		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
		prt_newline(buf);
	}

	trans_for_each_wb_update(trans, wb) {
		prt_printf(buf, "update: btree=%s wb=1 %pS",
			   bch2_btree_ids[wb->btree],
			   (void *) i->ip_allocated);
		prt_newline(buf);

		prt_printf(buf, " new ");
		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
		prt_newline(buf);
	}

	printbuf_indent_sub(buf, 2);
}

noinline __cold
void bch2_dump_trans_updates(struct btree_trans *trans)
{
	struct printbuf buf = PRINTBUF;

	bch2_trans_updates_to_text(&buf, trans);
	bch2_print_string_as_lines(KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}

noinline __cold
void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
{
	prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
		   path->idx, path->ref, path->intent_ref,
		   path->preserve ? 'P' : ' ',
		   path->should_be_locked ? 'S' : ' ',
		   bch2_btree_ids[path->btree_id],
		   path->level);
	bch2_bpos_to_text(out, path->pos);

	prt_printf(out, " locks %u", path->nodes_locked);
#ifdef TRACK_PATH_ALLOCATED
	prt_printf(out, " %pS", (void *) path->ip_allocated);
#endif
	prt_newline(out);
}

static noinline __cold
void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
				bool nosort)
{
	struct btree_path *path;
	unsigned idx;

	if (!nosort)
		btree_trans_sort_paths(trans);

	trans_for_each_path_inorder(trans, path, idx)
		bch2_btree_path_to_text(out, path);
}

noinline __cold
void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
{
	__bch2_trans_paths_to_text(out, trans, false);
}

static noinline __cold
void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
{
	struct printbuf buf = PRINTBUF;

	__bch2_trans_paths_to_text(&buf, trans, nosort);
	bch2_trans_updates_to_text(&buf, trans);

	bch2_print_string_as_lines(KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}

noinline __cold
void bch2_dump_trans_paths_updates(struct btree_trans *trans)
{
	__bch2_dump_trans_paths_updates(trans, false);
}

noinline __cold
static void bch2_trans_update_max_paths(struct btree_trans *trans)
{
	struct btree_transaction_stats *s = btree_trans_stats(trans);
	struct printbuf buf = PRINTBUF;

	if (!s)
		return;

	bch2_trans_paths_to_text(&buf, trans);

	if (!buf.allocation_failure) {
		mutex_lock(&s->lock);
		if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
			s->nr_max_paths = trans->nr_max_paths =
				hweight64(trans->paths_allocated);
			swap(s->max_paths_text, buf.buf);
		}
		mutex_unlock(&s->lock);
	}

	printbuf_exit(&buf);

	trans->nr_max_paths = hweight64(trans->paths_allocated);
}

static noinline void btree_path_overflow(struct btree_trans *trans)
{
	bch2_dump_trans_paths_updates(trans);
	panic("trans path overflow\n");
}

static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
						  struct btree_path *pos)
{
	struct btree_path *path;
	unsigned idx;

	if (unlikely(trans->paths_allocated ==
		     ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
		btree_path_overflow(trans);

	idx = __ffs64(~trans->paths_allocated);

	/*
	 * Do this before marking the new path as allocated, since it won't be
	 * initialized yet:
	 */
	if (unlikely(idx > trans->nr_max_paths))
		bch2_trans_update_max_paths(trans);

	trans->paths_allocated |= 1ULL << idx;

	path = &trans->paths[idx];
	path->idx = idx;
	path->ref = 0;
	path->intent_ref = 0;
	path->nodes_locked = 0;

	btree_path_list_add(trans, pos, path);
	trans->paths_sorted = false;
	return path;
}

struct btree_path *bch2_path_get(struct btree_trans *trans,
				 enum btree_id btree_id, struct bpos pos,
				 unsigned locks_want, unsigned level,
				 unsigned flags, unsigned long ip)
{
	struct btree_path *path, *path_pos = NULL;
	bool cached = flags & BTREE_ITER_CACHED;
	bool intent = flags & BTREE_ITER_INTENT;
	int i;

	bch2_trans_verify_not_in_restart(trans);
	bch2_trans_verify_locks(trans);

	btree_trans_sort_paths(trans);

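	/*
	 * Find the last existing path that sorts at or before the requested
	 * position, so we can either reuse it or use it as the insertion point
	 * in the sorted path list:
	 */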
	trans_for_each_path_inorder(trans, path, i) {
		if (__btree_path_cmp(path,
				     btree_id,
				     cached,
				     pos,
				     level) > 0)
			break;

		path_pos = path;
	}

	if (path_pos &&
	    path_pos->cached == cached &&
	    path_pos->btree_id == btree_id &&
	    path_pos->level == level) {
		__btree_path_get(path_pos, intent);
		path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
	} else {
		path = btree_path_alloc(trans, path_pos);
		path_pos = NULL;

		__btree_path_get(path, intent);
		path->pos = pos;
		path->btree_id = btree_id;
		path->cached = cached;
		path->uptodate = BTREE_ITER_NEED_TRAVERSE;
		path->should_be_locked = false;
		path->level = level;
		path->locks_want = locks_want;
		path->nodes_locked = 0;
		for (i = 0; i < ARRAY_SIZE(path->l); i++)
			path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef TRACK_PATH_ALLOCATED
		path->ip_allocated = ip;
#endif
		trans->paths_sorted = false;
	}

	if (!(flags & BTREE_ITER_NOPRESERVE))
		path->preserve = true;

	if (path->intent_ref)
		locks_want = max(locks_want, level + 1);

	/*
	 * If the path has locks_want greater than requested, we don't downgrade
	 * it here - on transaction restart because btree node split needs to
	 * upgrade locks, we might be putting/getting the iterator again.
	 * Downgrading iterators only happens via bch2_trans_downgrade(), after
	 * a successful transaction commit.
	 */

	locks_want = min(locks_want, BTREE_MAX_DEPTH);
	if (locks_want > path->locks_want)
		bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);

	return path;
}

struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
{

	struct btree_path_level *l = path_l(path);
	struct bkey_packed *_k;
	struct bkey_s_c k;

	if (unlikely(!l->b))
		return bkey_s_c_null;

	EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
	EBUG_ON(!btree_node_locked(path, path->level));

	if (!path->cached) {
		_k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
		k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;

		EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos));

		if (!k.k || !bpos_eq(path->pos, k.k->p))
			goto hole;
	} else {
		struct bkey_cached *ck = (void *) path->l[0].b;

		EBUG_ON(ck &&
			(path->btree_id != ck->key.btree_id ||
			 !bkey_eq(path->pos, ck->key.pos)));
		if (!ck || !ck->valid)
			return bkey_s_c_null;

		*u = ck->k->k;
		k = bkey_i_to_s_c(ck->k);
	}

	return k;
hole:
	bkey_init(u);
	u->p = path->pos;
	return (struct bkey_s_c) { u, NULL };
}

/* Btree iterators: */

int __must_check
__bch2_btree_iter_traverse(struct btree_iter *iter)
{
	return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
}

int __must_check
bch2_btree_iter_traverse(struct btree_iter *iter)
{
	int ret;

	iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
					btree_iter_search_key(iter),
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

	ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
	if (ret)
		return ret;

	btree_path_set_should_be_locked(iter->path);
	return 0;
}

/* Iterate across nodes (leaf and interior nodes) */

struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	struct btree *b = NULL;
	int ret;

	EBUG_ON(iter->path->cached);
	bch2_btree_iter_verify(iter);

	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
	if (ret)
		goto err;

	b = btree_path_node(iter->path, iter->path->level);
	if (!b)
		goto out;

	BUG_ON(bpos_lt(b->key.k.p, iter->pos));

	bkey_init(&iter->k);
	iter->k.p = iter->pos = b->key.k.p;

	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
					     iter->flags & BTREE_ITER_INTENT,
					     btree_iter_ip_allocated(iter));
	btree_path_set_should_be_locked(iter->path);
out:
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);

	return b;
err:
	b = ERR_PTR(ret);
	goto out;
}

struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
{
	struct btree *b;

	while (b = bch2_btree_iter_peek_node(iter),
	       bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
		bch2_trans_begin(iter->trans);

	return b;
}

struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	struct btree_path *path = iter->path;
	struct btree *b = NULL;
	int ret;

	bch2_trans_verify_not_in_restart(trans);
	EBUG_ON(iter->path->cached);
	bch2_btree_iter_verify(iter);

	/* already at end? */
	if (!btree_path_node(path, path->level))
		return NULL;

	/* got to end? */
	if (!btree_path_node(path, path->level + 1)) {
		btree_path_set_level_up(trans, path);
		return NULL;
	}

	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
		__bch2_btree_path_unlock(trans, path);
		path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
		path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
		goto err;
	}

	b = btree_path_node(path, path->level + 1);

	if (bpos_eq(iter->pos, b->key.k.p)) {
		__btree_path_set_level_up(trans, path, path->level++);
	} else {
		/*
		 * Haven't gotten to the end of the parent node: go back down to
		 * the next child node
		 */
		path = iter->path =
			bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
					   iter->flags & BTREE_ITER_INTENT,
					   btree_iter_ip_allocated(iter));

		btree_path_set_level_down(trans, path, iter->min_depth);

		ret = bch2_btree_path_traverse(trans, path, iter->flags);
		if (ret)
			goto err;

		b = path->l[path->level].b;
	}

	bkey_init(&iter->k);
	iter->k.p = iter->pos = b->key.k.p;

	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
					     iter->flags & BTREE_ITER_INTENT,
					     btree_iter_ip_allocated(iter));
	btree_path_set_should_be_locked(iter->path);
	BUG_ON(iter->path->uptodate);
out:
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);

	return b;
err:
	b = ERR_PTR(ret);
	goto out;
}

/* Iterate across keys (in leaf nodes only) */

inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
	if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
		struct bpos pos = iter->k.p;
		bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
			     ? bpos_eq(pos, SPOS_MAX)
			     : bkey_eq(pos, SPOS_MAX));

		if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
			pos = bkey_successor(iter, pos);
		bch2_btree_iter_set_pos(iter, pos);
		return ret;
	} else {
		if (!btree_path_node(iter->path, iter->path->level))
			return true;

		iter->advanced = true;
		return false;
	}
}

inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
	struct bpos pos = bkey_start_pos(&iter->k);
	bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
		     ? bpos_eq(pos, POS_MIN)
		     : bkey_eq(pos, POS_MIN));

	if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
		pos = bkey_predecessor(iter, pos);
	bch2_btree_iter_set_pos(iter, pos);
	return ret;
}

static noinline
struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
{
	struct btree_insert_entry *i;
	struct bkey_i *ret = NULL;

	trans_for_each_update(iter->trans, i) {
		if (i->btree_id < iter->btree_id)
			continue;
		if (i->btree_id > iter->btree_id)
			break;
		if (bpos_lt(i->k->k.p, iter->path->pos))
			continue;
		if (i->key_cache_already_flushed)
			continue;
		if (!ret || bpos_lt(i->k->k.p, ret->k.p))
			ret = i->k;
	}

	return ret;
}

static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
{
	return iter->flags & BTREE_ITER_WITH_UPDATES
		? __bch2_btree_trans_peek_updates(iter)
		: NULL;
}

static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
					      struct btree_iter *iter,
					      struct bpos end_pos)
{
	struct bkey_i *k;

	if (bpos_lt(iter->path->pos, iter->journal_pos))
		iter->journal_idx = 0;

	k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
					iter->path->level,
					iter->path->pos,
					end_pos,
					&iter->journal_idx);

	iter->journal_pos = k ? k->k.p : end_pos;
	return k;
}

static noinline
struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
					      struct btree_iter *iter)
{
	struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);

	if (k) {
		iter->k = k->k;
		return bkey_i_to_s_c(k);
	} else {
		return bkey_s_c_null;
	}
}

static noinline
struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
					 struct btree_iter *iter,
					 struct bkey_s_c k)
{
	struct bkey_i *next_journal =
		bch2_btree_journal_peek(trans, iter,
				k.k ? k.k->p : path_l(iter->path)->b->key.k.p);

	if (next_journal) {
		iter->k = next_journal->k;
		k = bkey_i_to_s_c(next_journal);
	}

	return k;
}

/*
 * Checks btree key cache for key at iter->pos and returns it if present, or
 * bkey_s_c_null:
 */
static noinline
struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
{
	struct btree_trans *trans = iter->trans;
	struct bch_fs *c = trans->c;
	struct bkey u;
	struct bkey_s_c k;
	int ret;

	if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) &&
	    bpos_eq(iter->pos, pos))
		return bkey_s_c_null;

	if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
		return bkey_s_c_null;

	if (!iter->key_cache_path)
		iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
						     iter->flags & BTREE_ITER_INTENT, 0,
						     iter->flags|BTREE_ITER_CACHED|
						     BTREE_ITER_CACHED_NOFILL,
						     _THIS_IP_);

	iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

	ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
				       iter->flags|BTREE_ITER_CACHED) ?:
	      bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
	if (unlikely(ret))
		return bkey_s_c_err(ret);

	btree_path_set_should_be_locked(iter->key_cache_path);

	k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
	if (k.k && !bkey_err(k)) {
		iter->k = u;
		k.k = &iter->k;
	}
	return k;
}

static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
	struct btree_trans *trans = iter->trans;
	struct bkey_i *next_update;
	struct bkey_s_c k, k2;
	int ret;

	EBUG_ON(iter->path->cached);
	bch2_btree_iter_verify(iter);

	while (1) {
		struct btree_path_level *l;

		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
		if (unlikely(ret)) {
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
			k = bkey_s_c_err(ret);
			goto out;
		}

		l = path_l(iter->path);

		if (unlikely(!l->b)) {
			/* No btree nodes at requested level: */
			bch2_btree_iter_set_pos(iter, SPOS_MAX);
			k = bkey_s_c_null;
			goto out;
		}

		btree_path_set_should_be_locked(iter->path);

		k = btree_path_level_peek_all(trans->c, l, &iter->k);

		if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
		    k.k &&
		    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
			k = k2;
			ret = bkey_err(k);
			if (ret) {
				bch2_btree_iter_set_pos(iter, iter->pos);
				goto out;
			}
		}

		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
			k = btree_trans_peek_journal(trans, iter, k);

		next_update = btree_trans_peek_updates(iter);

		if (next_update &&
		    bpos_le(next_update->k.p,
			    k.k ? k.k->p : l->b->key.k.p)) {
			iter->k = next_update->k;
			k = bkey_i_to_s_c(next_update);
		}

		if (k.k && bkey_deleted(k.k)) {
			/*
			 * If we've got a whiteout, and it's after the search
			 * key, advance the search key to the whiteout instead
			 * of just after the whiteout - it might be a btree
			 * whiteout, with a real key at the same position, since
			 * in the btree deleted keys sort before non deleted.
			 */
			search_key = !bpos_eq(search_key, k.k->p)
				? k.k->p
				: bpos_successor(k.k->p);
			continue;
		}

		if (likely(k.k)) {
			break;
		} else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) {
			/* Advance to next leaf node: */
			search_key = bpos_successor(l->b->key.k.p);
		} else {
			/* End of btree: */
			bch2_btree_iter_set_pos(iter, SPOS_MAX);
			k = bkey_s_c_null;
			goto out;
		}
	}
out:
	bch2_btree_iter_verify(iter);

	return k;
}

/**
 * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
 * iterator's current position
 * @iter:	iterator to peek from
 * @end:	search limit: returns keys less than or equal to @end
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
{
	struct btree_trans *trans = iter->trans;
	struct bpos search_key = btree_iter_search_key(iter);
	struct bkey_s_c k;
	struct bpos iter_pos;
	int ret;

	EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
	EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));

	if (iter->update_path) {
		bch2_path_put_nokeep(trans, iter->update_path,
				     iter->flags & BTREE_ITER_INTENT);
		iter->update_path = NULL;
	}

	bch2_btree_iter_verify_entry_exit(iter);

	while (1) {
		k = __bch2_btree_iter_peek(iter, search_key);
		if (unlikely(!k.k))
			goto end;
		if (unlikely(bkey_err(k)))
			goto out_no_locked;

		/*
		 * iter->pos should be monotonically increasing, and always be
		 * equal to the key we just returned - except extents can
		 * straddle iter->pos:
		 */
		if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
			iter_pos = k.k->p;
		else
			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));

		if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
bkey_gt(iter_pos, end) 2095 : bkey_ge(iter_pos, end))) 2096 goto end; 2097 2098 if (iter->update_path && 2099 !bkey_eq(iter->update_path->pos, k.k->p)) { 2100 bch2_path_put_nokeep(trans, iter->update_path, 2101 iter->flags & BTREE_ITER_INTENT); 2102 iter->update_path = NULL; 2103 } 2104 2105 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && 2106 (iter->flags & BTREE_ITER_INTENT) && 2107 !(iter->flags & BTREE_ITER_IS_EXTENTS) && 2108 !iter->update_path) { 2109 struct bpos pos = k.k->p; 2110 2111 if (pos.snapshot < iter->snapshot) { 2112 search_key = bpos_successor(k.k->p); 2113 continue; 2114 } 2115 2116 pos.snapshot = iter->snapshot; 2117 2118 /* 2119 * advance, same as on exit for iter->path, but only up 2120 * to snapshot 2121 */ 2122 __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); 2123 iter->update_path = iter->path; 2124 2125 iter->update_path = bch2_btree_path_set_pos(trans, 2126 iter->update_path, pos, 2127 iter->flags & BTREE_ITER_INTENT, 2128 _THIS_IP_); 2129 ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); 2130 if (unlikely(ret)) { 2131 k = bkey_s_c_err(ret); 2132 goto out_no_locked; 2133 } 2134 } 2135 2136 /* 2137 * We can never have a key in a leaf node at POS_MAX, so 2138 * we don't have to check these successor() calls: 2139 */ 2140 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && 2141 !bch2_snapshot_is_ancestor(trans->c, 2142 iter->snapshot, 2143 k.k->p.snapshot)) { 2144 search_key = bpos_successor(k.k->p); 2145 continue; 2146 } 2147 2148 if (bkey_whiteout(k.k) && 2149 !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { 2150 search_key = bkey_successor(iter, k.k->p); 2151 continue; 2152 } 2153 2154 break; 2155 } 2156 2157 iter->pos = iter_pos; 2158 2159 iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, 2160 iter->flags & BTREE_ITER_INTENT, 2161 btree_iter_ip_allocated(iter)); 2162 2163 btree_path_set_should_be_locked(iter->path); 2164 out_no_locked: 2165 if (iter->update_path) { 2166 ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); 2167 if (unlikely(ret)) 2168 k = bkey_s_c_err(ret); 2169 else 2170 btree_path_set_should_be_locked(iter->update_path); 2171 } 2172 2173 if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) 2174 iter->pos.snapshot = iter->snapshot; 2175 2176 ret = bch2_btree_iter_verify_ret(iter, k); 2177 if (unlikely(ret)) { 2178 bch2_btree_iter_set_pos(iter, iter->pos); 2179 k = bkey_s_c_err(ret); 2180 } 2181 2182 bch2_btree_iter_verify_entry_exit(iter); 2183 2184 return k; 2185 end: 2186 bch2_btree_iter_set_pos(iter, end); 2187 k = bkey_s_c_null; 2188 goto out_no_locked; 2189 } 2190 2191 /** 2192 * bch2_btree_iter_peek_all_levels() - returns the first key greater than or 2193 * equal to iterator's current position, returning keys from every level of the 2194 * btree. For keys at different levels of the btree that compare equal, the key 2195 * from the lower level (leaf) is returned first. 2196 * @iter: iterator to peek from 2197 * 2198 * Returns: key if found, or an error extractable with bkey_err(). 
2199 */ 2200 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) 2201 { 2202 struct btree_trans *trans = iter->trans; 2203 struct bkey_s_c k; 2204 int ret; 2205 2206 EBUG_ON(iter->path->cached); 2207 bch2_btree_iter_verify(iter); 2208 BUG_ON(iter->path->level < iter->min_depth); 2209 BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); 2210 EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); 2211 2212 while (1) { 2213 iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, 2214 iter->flags & BTREE_ITER_INTENT, 2215 btree_iter_ip_allocated(iter)); 2216 2217 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); 2218 if (unlikely(ret)) { 2219 /* ensure that iter->k is consistent with iter->pos: */ 2220 bch2_btree_iter_set_pos(iter, iter->pos); 2221 k = bkey_s_c_err(ret); 2222 goto out_no_locked; 2223 } 2224 2225 /* Already at end? */ 2226 if (!btree_path_node(iter->path, iter->path->level)) { 2227 k = bkey_s_c_null; 2228 goto out_no_locked; 2229 } 2230 2231 k = btree_path_level_peek_all(trans->c, 2232 &iter->path->l[iter->path->level], &iter->k); 2233 2234 /* Check if we should go up to the parent node: */ 2235 if (!k.k || 2236 (iter->advanced && 2237 bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { 2238 iter->pos = path_l(iter->path)->b->key.k.p; 2239 btree_path_set_level_up(trans, iter->path); 2240 iter->advanced = false; 2241 continue; 2242 } 2243 2244 /* 2245 * Check if we should go back down to a leaf: 2246 * If we're not in a leaf node, we only return the current key 2247 * if it exactly matches iter->pos - otherwise we first have to 2248 * go back to the leaf: 2249 */ 2250 if (iter->path->level != iter->min_depth && 2251 (iter->advanced || 2252 !k.k || 2253 !bpos_eq(iter->pos, k.k->p))) { 2254 btree_path_set_level_down(trans, iter->path, iter->min_depth); 2255 iter->pos = bpos_successor(iter->pos); 2256 iter->advanced = false; 2257 continue; 2258 } 2259 2260 /* Check if we should go to the next key: */ 2261 if (iter->path->level == iter->min_depth && 2262 iter->advanced && 2263 k.k && 2264 bpos_eq(iter->pos, k.k->p)) { 2265 iter->pos = bpos_successor(iter->pos); 2266 iter->advanced = false; 2267 continue; 2268 } 2269 2270 if (iter->advanced && 2271 iter->path->level == iter->min_depth && 2272 !bpos_eq(k.k->p, iter->pos)) 2273 iter->advanced = false; 2274 2275 BUG_ON(iter->advanced); 2276 BUG_ON(!k.k); 2277 break; 2278 } 2279 2280 iter->pos = k.k->p; 2281 btree_path_set_should_be_locked(iter->path); 2282 out_no_locked: 2283 bch2_btree_iter_verify(iter); 2284 2285 return k; 2286 } 2287 2288 /** 2289 * bch2_btree_iter_next() - returns first key greater than iterator's current 2290 * position 2291 * @iter: iterator to peek from 2292 * 2293 * Returns: key if found, or an error extractable with bkey_err(). 2294 */ 2295 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) 2296 { 2297 if (!bch2_btree_iter_advance(iter)) 2298 return bkey_s_c_null; 2299 2300 return bch2_btree_iter_peek(iter); 2301 } 2302 2303 /** 2304 * bch2_btree_iter_peek_prev() - returns first key less than or equal to 2305 * iterator's current position 2306 * @iter: iterator to peek from 2307 * 2308 * Returns: key if found, or an error extractable with bkey_err(). 
2309 */ 2310 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) 2311 { 2312 struct btree_trans *trans = iter->trans; 2313 struct bpos search_key = iter->pos; 2314 struct btree_path *saved_path = NULL; 2315 struct bkey_s_c k; 2316 struct bkey saved_k; 2317 const struct bch_val *saved_v; 2318 int ret; 2319 2320 EBUG_ON(iter->path->cached || iter->path->level); 2321 EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); 2322 2323 if (iter->flags & BTREE_ITER_WITH_JOURNAL) 2324 return bkey_s_c_err(-EIO); 2325 2326 bch2_btree_iter_verify(iter); 2327 bch2_btree_iter_verify_entry_exit(iter); 2328 2329 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) 2330 search_key.snapshot = U32_MAX; 2331 2332 while (1) { 2333 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, 2334 iter->flags & BTREE_ITER_INTENT, 2335 btree_iter_ip_allocated(iter)); 2336 2337 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); 2338 if (unlikely(ret)) { 2339 /* ensure that iter->k is consistent with iter->pos: */ 2340 bch2_btree_iter_set_pos(iter, iter->pos); 2341 k = bkey_s_c_err(ret); 2342 goto out_no_locked; 2343 } 2344 2345 k = btree_path_level_peek(trans, iter->path, 2346 &iter->path->l[0], &iter->k); 2347 if (!k.k || 2348 ((iter->flags & BTREE_ITER_IS_EXTENTS) 2349 ? bpos_ge(bkey_start_pos(k.k), search_key) 2350 : bpos_gt(k.k->p, search_key))) 2351 k = btree_path_level_prev(trans, iter->path, 2352 &iter->path->l[0], &iter->k); 2353 2354 if (likely(k.k)) { 2355 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { 2356 if (k.k->p.snapshot == iter->snapshot) 2357 goto got_key; 2358 2359 /* 2360 * If we have a saved candidate, and we're no 2361 * longer at the same _key_ (not pos), return 2362 * that candidate 2363 */ 2364 if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { 2365 bch2_path_put_nokeep(trans, iter->path, 2366 iter->flags & BTREE_ITER_INTENT); 2367 iter->path = saved_path; 2368 saved_path = NULL; 2369 iter->k = saved_k; 2370 k.v = saved_v; 2371 goto got_key; 2372 } 2373 2374 if (bch2_snapshot_is_ancestor(iter->trans->c, 2375 iter->snapshot, 2376 k.k->p.snapshot)) { 2377 if (saved_path) 2378 bch2_path_put_nokeep(trans, saved_path, 2379 iter->flags & BTREE_ITER_INTENT); 2380 saved_path = btree_path_clone(trans, iter->path, 2381 iter->flags & BTREE_ITER_INTENT); 2382 saved_k = *k.k; 2383 saved_v = k.v; 2384 } 2385 2386 search_key = bpos_predecessor(k.k->p); 2387 continue; 2388 } 2389 got_key: 2390 if (bkey_whiteout(k.k) && 2391 !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { 2392 search_key = bkey_predecessor(iter, k.k->p); 2393 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) 2394 search_key.snapshot = U32_MAX; 2395 continue; 2396 } 2397 2398 break; 2399 } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { 2400 /* Advance to previous leaf node: */ 2401 search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); 2402 } else { 2403 /* Start of btree: */ 2404 bch2_btree_iter_set_pos(iter, POS_MIN); 2405 k = bkey_s_c_null; 2406 goto out_no_locked; 2407 } 2408 } 2409 2410 EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); 2411 2412 /* Extents can straddle iter->pos: */ 2413 if (bkey_lt(k.k->p, iter->pos)) 2414 iter->pos = k.k->p; 2415 2416 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) 2417 iter->pos.snapshot = iter->snapshot; 2418 2419 btree_path_set_should_be_locked(iter->path); 2420 out_no_locked: 2421 if (saved_path) 2422 bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); 2423 2424 bch2_btree_iter_verify_entry_exit(iter); 2425 
bch2_btree_iter_verify(iter); 2426 2427 return k; 2428 } 2429 2430 /** 2431 * bch2_btree_iter_prev() - returns first key less than iterator's current 2432 * position 2433 * @iter: iterator to peek from 2434 * 2435 * Returns: key if found, or an error extractable with bkey_err(). 2436 */ 2437 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) 2438 { 2439 if (!bch2_btree_iter_rewind(iter)) 2440 return bkey_s_c_null; 2441 2442 return bch2_btree_iter_peek_prev(iter); 2443 } 2444 2445 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) 2446 { 2447 struct btree_trans *trans = iter->trans; 2448 struct bpos search_key; 2449 struct bkey_s_c k; 2450 int ret; 2451 2452 bch2_btree_iter_verify(iter); 2453 bch2_btree_iter_verify_entry_exit(iter); 2454 EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); 2455 EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); 2456 2457 /* extents can't span inode numbers: */ 2458 if ((iter->flags & BTREE_ITER_IS_EXTENTS) && 2459 unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { 2460 if (iter->pos.inode == KEY_INODE_MAX) 2461 return bkey_s_c_null; 2462 2463 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); 2464 } 2465 2466 search_key = btree_iter_search_key(iter); 2467 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, 2468 iter->flags & BTREE_ITER_INTENT, 2469 btree_iter_ip_allocated(iter)); 2470 2471 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); 2472 if (unlikely(ret)) { 2473 k = bkey_s_c_err(ret); 2474 goto out_no_locked; 2475 } 2476 2477 if ((iter->flags & BTREE_ITER_CACHED) || 2478 !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { 2479 struct bkey_i *next_update; 2480 2481 if ((next_update = btree_trans_peek_updates(iter)) && 2482 bpos_eq(next_update->k.p, iter->pos)) { 2483 iter->k = next_update->k; 2484 k = bkey_i_to_s_c(next_update); 2485 goto out; 2486 } 2487 2488 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && 2489 (k = btree_trans_peek_slot_journal(trans, iter)).k) 2490 goto out; 2491 2492 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && 2493 (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { 2494 if (!bkey_err(k)) 2495 iter->k = *k.k; 2496 /* We're not returning a key from iter->path: */ 2497 goto out_no_locked; 2498 } 2499 2500 k = bch2_btree_path_peek_slot(iter->path, &iter->k); 2501 if (unlikely(!k.k)) 2502 goto out_no_locked; 2503 } else { 2504 struct bpos next; 2505 struct bpos end = iter->pos; 2506 2507 if (iter->flags & BTREE_ITER_IS_EXTENTS) 2508 end.offset = U64_MAX; 2509 2510 EBUG_ON(iter->path->level); 2511 2512 if (iter->flags & BTREE_ITER_INTENT) { 2513 struct btree_iter iter2; 2514 2515 bch2_trans_copy_iter(&iter2, iter); 2516 k = bch2_btree_iter_peek_upto(&iter2, end); 2517 2518 if (k.k && !bkey_err(k)) { 2519 iter->k = iter2.k; 2520 k.k = &iter->k; 2521 } 2522 bch2_trans_iter_exit(trans, &iter2); 2523 } else { 2524 struct bpos pos = iter->pos; 2525 2526 k = bch2_btree_iter_peek_upto(iter, end); 2527 if (unlikely(bkey_err(k))) 2528 bch2_btree_iter_set_pos(iter, pos); 2529 else 2530 iter->pos = pos; 2531 } 2532 2533 if (unlikely(bkey_err(k))) 2534 goto out_no_locked; 2535 2536 next = k.k ? bkey_start_pos(k.k) : POS_MAX; 2537 2538 if (bkey_lt(iter->pos, next)) { 2539 bkey_init(&iter->k); 2540 iter->k.p = iter->pos; 2541 2542 if (iter->flags & BTREE_ITER_IS_EXTENTS) { 2543 bch2_key_resize(&iter->k, 2544 min_t(u64, KEY_SIZE_MAX, 2545 (next.inode == iter->pos.inode 2546 ? 
next.offset 2547 : KEY_OFFSET_MAX) - 2548 iter->pos.offset)); 2549 EBUG_ON(!iter->k.size); 2550 } 2551 2552 k = (struct bkey_s_c) { &iter->k, NULL }; 2553 } 2554 } 2555 out: 2556 btree_path_set_should_be_locked(iter->path); 2557 out_no_locked: 2558 bch2_btree_iter_verify_entry_exit(iter); 2559 bch2_btree_iter_verify(iter); 2560 ret = bch2_btree_iter_verify_ret(iter, k); 2561 if (unlikely(ret)) 2562 return bkey_s_c_err(ret); 2563 2564 return k; 2565 } 2566 2567 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) 2568 { 2569 if (!bch2_btree_iter_advance(iter)) 2570 return bkey_s_c_null; 2571 2572 return bch2_btree_iter_peek_slot(iter); 2573 } 2574 2575 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) 2576 { 2577 if (!bch2_btree_iter_rewind(iter)) 2578 return bkey_s_c_null; 2579 2580 return bch2_btree_iter_peek_slot(iter); 2581 } 2582 2583 struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) 2584 { 2585 struct bkey_s_c k; 2586 2587 while (btree_trans_too_many_iters(iter->trans) || 2588 (k = bch2_btree_iter_peek_type(iter, iter->flags), 2589 bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) 2590 bch2_trans_begin(iter->trans); 2591 2592 return k; 2593 } 2594 2595 /* new transactional stuff: */ 2596 2597 #ifdef CONFIG_BCACHEFS_DEBUG 2598 static void btree_trans_verify_sorted_refs(struct btree_trans *trans) 2599 { 2600 struct btree_path *path; 2601 unsigned i; 2602 2603 BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); 2604 2605 trans_for_each_path(trans, path) { 2606 BUG_ON(path->sorted_idx >= trans->nr_sorted); 2607 BUG_ON(trans->sorted[path->sorted_idx] != path->idx); 2608 } 2609 2610 for (i = 0; i < trans->nr_sorted; i++) { 2611 unsigned idx = trans->sorted[i]; 2612 2613 EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); 2614 BUG_ON(trans->paths[idx].sorted_idx != i); 2615 } 2616 } 2617 2618 static void btree_trans_verify_sorted(struct btree_trans *trans) 2619 { 2620 struct btree_path *path, *prev = NULL; 2621 unsigned i; 2622 2623 if (!bch2_debug_check_iterators) 2624 return; 2625 2626 trans_for_each_path_inorder(trans, path, i) { 2627 if (prev && btree_path_cmp(prev, path) > 0) { 2628 __bch2_dump_trans_paths_updates(trans, true); 2629 panic("trans paths out of order!\n"); 2630 } 2631 prev = path; 2632 } 2633 } 2634 #else 2635 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} 2636 static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} 2637 #endif 2638 2639 void __bch2_btree_trans_sort_paths(struct btree_trans *trans) 2640 { 2641 int i, l = 0, r = trans->nr_sorted, inc = 1; 2642 bool swapped; 2643 2644 btree_trans_verify_sorted_refs(trans); 2645 2646 if (trans->paths_sorted) 2647 goto out; 2648 2649 /* 2650 * Cocktail shaker sort: this is efficient because iterators will be 2651 * mostly sorted. 2652 */ 2653 do { 2654 swapped = false; 2655 2656 for (i = inc > 0 ? 
l : r - 2; 2657 i + 1 < r && i >= l; 2658 i += inc) { 2659 if (btree_path_cmp(trans->paths + trans->sorted[i], 2660 trans->paths + trans->sorted[i + 1]) > 0) { 2661 swap(trans->sorted[i], trans->sorted[i + 1]); 2662 trans->paths[trans->sorted[i]].sorted_idx = i; 2663 trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; 2664 swapped = true; 2665 } 2666 } 2667 2668 if (inc > 0) 2669 --r; 2670 else 2671 l++; 2672 inc = -inc; 2673 } while (swapped); 2674 2675 trans->paths_sorted = true; 2676 out: 2677 btree_trans_verify_sorted(trans); 2678 } 2679 2680 static inline void btree_path_list_remove(struct btree_trans *trans, 2681 struct btree_path *path) 2682 { 2683 unsigned i; 2684 2685 EBUG_ON(path->sorted_idx >= trans->nr_sorted); 2686 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 2687 trans->nr_sorted--; 2688 memmove_u64s_down_small(trans->sorted + path->sorted_idx, 2689 trans->sorted + path->sorted_idx + 1, 2690 DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); 2691 #else 2692 array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); 2693 #endif 2694 for (i = path->sorted_idx; i < trans->nr_sorted; i++) 2695 trans->paths[trans->sorted[i]].sorted_idx = i; 2696 2697 path->sorted_idx = U8_MAX; 2698 } 2699 2700 static inline void btree_path_list_add(struct btree_trans *trans, 2701 struct btree_path *pos, 2702 struct btree_path *path) 2703 { 2704 unsigned i; 2705 2706 path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; 2707 2708 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 2709 memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, 2710 trans->sorted + path->sorted_idx, 2711 DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); 2712 trans->nr_sorted++; 2713 trans->sorted[path->sorted_idx] = path->idx; 2714 #else 2715 array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); 2716 #endif 2717 2718 for (i = path->sorted_idx; i < trans->nr_sorted; i++) 2719 trans->paths[trans->sorted[i]].sorted_idx = i; 2720 2721 btree_trans_verify_sorted_refs(trans); 2722 } 2723 2724 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) 2725 { 2726 if (iter->update_path) 2727 bch2_path_put_nokeep(trans, iter->update_path, 2728 iter->flags & BTREE_ITER_INTENT); 2729 if (iter->path) 2730 bch2_path_put(trans, iter->path, 2731 iter->flags & BTREE_ITER_INTENT); 2732 if (iter->key_cache_path) 2733 bch2_path_put(trans, iter->key_cache_path, 2734 iter->flags & BTREE_ITER_INTENT); 2735 iter->path = NULL; 2736 iter->update_path = NULL; 2737 iter->key_cache_path = NULL; 2738 } 2739 2740 void bch2_trans_iter_init_outlined(struct btree_trans *trans, 2741 struct btree_iter *iter, 2742 enum btree_id btree_id, struct bpos pos, 2743 unsigned flags) 2744 { 2745 bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, 2746 bch2_btree_iter_flags(trans, btree_id, flags), 2747 _RET_IP_); 2748 } 2749 2750 void bch2_trans_node_iter_init(struct btree_trans *trans, 2751 struct btree_iter *iter, 2752 enum btree_id btree_id, 2753 struct bpos pos, 2754 unsigned locks_want, 2755 unsigned depth, 2756 unsigned flags) 2757 { 2758 flags |= BTREE_ITER_NOT_EXTENTS; 2759 flags |= __BTREE_ITER_ALL_SNAPSHOTS; 2760 flags |= BTREE_ITER_ALL_SNAPSHOTS; 2761 2762 bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, 2763 __bch2_btree_iter_flags(trans, btree_id, flags), 2764 _RET_IP_); 2765 2766 iter->min_depth = depth; 2767 2768 BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); 2769 BUG_ON(iter->path->level != depth); 2770 
	BUG_ON(iter->min_depth	!= depth);
}

void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
{
	*dst = *src;
	if (src->path)
		__btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
	if (src->update_path)
		__btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
	dst->key_cache_path = NULL;
}

void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
	unsigned new_top = trans->mem_top + size;
	size_t old_bytes = trans->mem_bytes;
	size_t new_bytes = roundup_pow_of_two(new_top);
	int ret;
	void *new_mem;
	void *p;

	trans->mem_max = max(trans->mem_max, new_top);

	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);

	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
	if (unlikely(!new_mem)) {
		bch2_trans_unlock(trans);

		new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
			new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
			new_bytes = BTREE_TRANS_MEM_MAX;
			kfree(trans->mem);
		}

		if (!new_mem)
			return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);

		trans->mem = new_mem;
		trans->mem_bytes = new_bytes;

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);
	}

	trans->mem = new_mem;
	trans->mem_bytes = new_bytes;

	if (old_bytes) {
		trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
	}

	p = trans->mem + trans->mem_top;
	trans->mem_top += size;
	memset(p, 0, size);
	return p;
}

static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_path *path;

	trans_for_each_path(trans, path)
		if (path->cached && !btree_node_locked(path, 0))
			path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);

	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	trans->srcu_lock_time	= jiffies;
}
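
/*
 * Illustrative sketch, not part of the interface: the restart counter returned
 * by bch2_trans_begin() is intended to be consumed from a retry loop of
 * roughly the following shape, mirroring bch2_btree_iter_peek_and_restart_outlined()
 * below.  bch2_trans_iter_init() (defined in the header) and the start/end
 * positions are assumptions of this example only:
 *
 *	bch2_trans_iter_init(trans, &iter, btree_id, start, 0);
 *	do {
 *		bch2_trans_begin(trans);
 *		k = bch2_btree_iter_peek_upto(&iter, end);
 *		ret = bkey_err(k);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 *	bch2_trans_iter_exit(trans, &iter);
 *
 * Any error other than a transaction restart is returned to the caller; only
 * restarts loop back through bch2_trans_begin().
 */
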
/**
 * bch2_trans_begin() - reset a transaction after an interrupted attempt
 * @trans: transaction to reset
 *
 * Returns: current restart counter, to be used with trans_was_restarted()
 *
 * While iterating over nodes or updating nodes, an attempt to lock a btree
 * node may return BCH_ERR_transaction_restart when the trylock fails. When
 * this occurs bch2_trans_begin() should be called and the transaction retried.
 */
u32 bch2_trans_begin(struct btree_trans *trans)
{
	struct btree_path *path;
	u64 now;

	bch2_trans_reset_updates(trans);

	trans->restart_count++;
	trans->mem_top			= 0;

	trans_for_each_path(trans, path) {
		path->should_be_locked = false;

		/*
		 * If the transaction wasn't restarted, we're presuming to be
		 * doing something new: don't keep iterators except the ones
		 * that are in use - except for the subvolumes btree:
		 */
		if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
			path->preserve = false;

		/*
		 * XXX: we probably shouldn't be doing this if the transaction
		 * was restarted, but currently we still overflow transaction
		 * iterators if we do that
		 */
		if (!path->ref && !path->preserve)
			__bch2_path_free(trans, path);
		else
			path->preserve = false;
	}

	now = local_clock();
	if (!trans->restarted &&
	    (need_resched() ||
	     now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
		drop_locks_do(trans, (cond_resched(), 0));
		now = local_clock();
	}
	trans->last_begin_time = now;

	if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
		bch2_trans_reset_srcu_lock(trans);

	trans->last_begin_ip = _RET_IP_;
	if (trans->restarted) {
		bch2_btree_path_traverse_all(trans);
		trans->notrace_relock_fail = false;
	}

	return trans->restart_count;
}

static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
{
	struct btree_trans *trans;

	if (IS_ENABLED(__KERNEL__)) {
		trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
		if (trans)
			return trans;
	}

	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
	/*
	 * paths need to be zeroed, bch2_check_for_deadlock looks at
	 * paths in other threads
	 */
	memset(&trans->paths, 0, sizeof(trans->paths));
	return trans;
}

const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];

unsigned bch2_trans_get_fn_idx(const char *fn)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
		if (!bch2_btree_transaction_fns[i] ||
		    bch2_btree_transaction_fns[i] == fn) {
			bch2_btree_transaction_fns[i] = fn;
			return i;
		}

	pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
	return i;
}
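
/*
 * Allocation/initialization path for a btree_trans; the matching teardown is
 * bch2_trans_put() below.  The SRCU read lock taken here is held for the
 * lifetime of the transaction (periodically refreshed by
 * bch2_trans_reset_srcu_lock()) and released in bch2_trans_put().  Callers are
 * normally expected to go through a wrapper in the header that supplies
 * fn_idx via bch2_trans_get_fn_idx() rather than calling this directly; the
 * exact wrapper name is not defined in this file.
 */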
struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
	__acquires(&c->btree_trans_barrier)
{
	struct btree_trans *trans;
	struct btree_transaction_stats *s;

	trans = bch2_trans_alloc(c);

	memset(trans, 0, sizeof(*trans));
	trans->c		= c;
	trans->fn		= fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
		? bch2_btree_transaction_fns[fn_idx] : NULL;
	trans->last_begin_time	= local_clock();
	trans->fn_idx		= fn_idx;
	trans->locking_wait.task = current;
	trans->journal_replay_not_finished =
		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
	closure_init_stack(&trans->ref);

	s = btree_trans_stats(trans);
	if (s && s->max_mem) {
		unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);

		trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);

		if (unlikely(!trans->mem)) {
			trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
			trans->mem_bytes = BTREE_TRANS_MEM_MAX;
		} else {
			trans->mem_bytes = expected_mem_bytes;
		}
	}

	if (s) {
		trans->nr_max_paths = s->nr_max_paths;
		trans->wb_updates_size = s->wb_updates_size;
	}

	trans->srcu_idx		= srcu_read_lock(&c->btree_trans_barrier);
	trans->srcu_lock_time	= jiffies;

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
		struct btree_trans *pos;

		seqmutex_lock(&c->btree_trans_lock);
		list_for_each_entry(pos, &c->btree_trans_list, list) {
			/*
			 * We'd much prefer to be stricter here and completely
			 * disallow multiple btree_trans in the same thread -
			 * but the data move path calls bch2_write when we
			 * already have a btree_trans initialized.
			 */
			BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
			       bch2_trans_locked(pos));

			if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
				list_add_tail(&trans->list, &pos->list);
				goto list_add_done;
			}
		}
		list_add_tail(&trans->list, &c->btree_trans_list);
list_add_done:
		seqmutex_unlock(&c->btree_trans_lock);
	}

	return trans;
}

static void check_btree_paths_leaked(struct btree_trans *trans)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bch_fs *c = trans->c;
	struct btree_path *path;

	trans_for_each_path(trans, path)
		if (path->ref)
			goto leaked;
	return;
leaked:
	bch_err(c, "btree paths leaked from %s!", trans->fn);
	trans_for_each_path(trans, path)
		if (path->ref)
			printk(KERN_ERR "  btree %s %pS\n",
			       bch2_btree_ids[path->btree_id],
			       (void *) path->ip_allocated);
	/* Be noisy about this: */
	bch2_fatal_error(c);
#endif
}

void bch2_trans_put(struct btree_trans *trans)
	__releases(&c->btree_trans_barrier)
{
	struct btree_insert_entry *i;
	struct bch_fs *c = trans->c;
	struct btree_transaction_stats *s = btree_trans_stats(trans);

	bch2_trans_unlock(trans);

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
		seqmutex_lock(&c->btree_trans_lock);
		list_del(&trans->list);
		seqmutex_unlock(&c->btree_trans_lock);
	}

	closure_sync(&trans->ref);

	if (s)
		s->max_mem = max(s->max_mem, trans->mem_max);

	trans_for_each_update(trans, i)
		__btree_path_put(i->path, true);
	trans->nr_updates	= 0;

	check_btree_paths_leaked(trans);

	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);

	bch2_journal_preres_put(&c->journal, &trans->journal_preres);

	kfree(trans->extra_journal_entries.data);

	if (trans->fs_usage_deltas) {
		if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
		    REPLICAS_DELTA_LIST_MAX)
			mempool_free(trans->fs_usage_deltas,
				     &c->replicas_delta_pool);
		else
kfree(trans->fs_usage_deltas); 3074 } 3075 3076 if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) 3077 mempool_free(trans->mem, &c->btree_trans_mem_pool); 3078 else 3079 kfree(trans->mem); 3080 3081 /* Userspace doesn't have a real percpu implementation: */ 3082 if (IS_ENABLED(__KERNEL__)) 3083 trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); 3084 if (trans) 3085 mempool_free(trans, &c->btree_trans_pool); 3086 } 3087 3088 static void __maybe_unused 3089 bch2_btree_bkey_cached_common_to_text(struct printbuf *out, 3090 struct btree_bkey_cached_common *b) 3091 { 3092 struct six_lock_count c = six_lock_counts(&b->lock); 3093 struct task_struct *owner; 3094 pid_t pid; 3095 3096 rcu_read_lock(); 3097 owner = READ_ONCE(b->lock.owner); 3098 pid = owner ? owner->pid : 0; 3099 rcu_read_unlock(); 3100 3101 prt_tab(out); 3102 prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', 3103 b->level, bch2_btree_ids[b->btree_id]); 3104 bch2_bpos_to_text(out, btree_node_pos(b)); 3105 3106 prt_tab(out); 3107 prt_printf(out, " locks %u:%u:%u held by pid %u", 3108 c.n[0], c.n[1], c.n[2], pid); 3109 } 3110 3111 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) 3112 { 3113 struct btree_path *path; 3114 struct btree_bkey_cached_common *b; 3115 static char lock_types[] = { 'r', 'i', 'w' }; 3116 unsigned l, idx; 3117 3118 if (!out->nr_tabstops) { 3119 printbuf_tabstop_push(out, 16); 3120 printbuf_tabstop_push(out, 32); 3121 } 3122 3123 prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); 3124 3125 trans_for_each_path_safe(trans, path, idx) { 3126 if (!path->nodes_locked) 3127 continue; 3128 3129 prt_printf(out, " path %u %c l=%u %s:", 3130 path->idx, 3131 path->cached ? 'c' : 'b', 3132 path->level, 3133 bch2_btree_ids[path->btree_id]); 3134 bch2_bpos_to_text(out, path->pos); 3135 prt_newline(out); 3136 3137 for (l = 0; l < BTREE_MAX_DEPTH; l++) { 3138 if (btree_node_locked(path, l) && 3139 !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { 3140 prt_printf(out, " %c l=%u ", 3141 lock_types[btree_node_locked_type(path, l)], l); 3142 bch2_btree_bkey_cached_common_to_text(out, b); 3143 prt_newline(out); 3144 } 3145 } 3146 } 3147 3148 b = READ_ONCE(trans->locking); 3149 if (b) { 3150 prt_printf(out, " blocked for %lluus on", 3151 div_u64(local_clock() - trans->locking_wait.start_time, 3152 1000)); 3153 prt_newline(out); 3154 prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); 3155 bch2_btree_bkey_cached_common_to_text(out, b); 3156 prt_newline(out); 3157 } 3158 } 3159 3160 void bch2_fs_btree_iter_exit(struct bch_fs *c) 3161 { 3162 struct btree_transaction_stats *s; 3163 struct btree_trans *trans; 3164 int cpu; 3165 3166 trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); 3167 if (trans) 3168 panic("%s leaked btree_trans\n", trans->fn); 3169 3170 if (c->btree_trans_bufs) 3171 for_each_possible_cpu(cpu) 3172 kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); 3173 free_percpu(c->btree_trans_bufs); 3174 3175 for (s = c->btree_transaction_stats; 3176 s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); 3177 s++) { 3178 kfree(s->max_paths_text); 3179 bch2_time_stats_exit(&s->lock_hold_times); 3180 } 3181 3182 if (c->btree_trans_barrier_initialized) 3183 cleanup_srcu_struct(&c->btree_trans_barrier); 3184 mempool_exit(&c->btree_trans_mem_pool); 3185 mempool_exit(&c->btree_trans_pool); 3186 } 3187 3188 int bch2_fs_btree_iter_init(struct bch_fs *c) 3189 { 3190 struct btree_transaction_stats *s; 3191 int 
ret; 3192 3193 for (s = c->btree_transaction_stats; 3194 s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); 3195 s++) { 3196 bch2_time_stats_init(&s->lock_hold_times); 3197 mutex_init(&s->lock); 3198 } 3199 3200 INIT_LIST_HEAD(&c->btree_trans_list); 3201 seqmutex_init(&c->btree_trans_lock); 3202 3203 c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); 3204 if (!c->btree_trans_bufs) 3205 return -ENOMEM; 3206 3207 ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1, 3208 sizeof(struct btree_trans)) ?: 3209 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, 3210 BTREE_TRANS_MEM_MAX) ?: 3211 init_srcu_struct(&c->btree_trans_barrier); 3212 if (!ret) 3213 c->btree_trans_barrier_initialized = true; 3214 return ret; 3215 } 3216
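
/*
 * Illustrative lifecycle sketch (assumptions noted inline): a caller pairs a
 * transaction get/put around one or more iterators, and exits every iterator
 * before dropping the transaction so check_btree_paths_leaked() stays quiet.
 * The bch2_trans_get() wrapper and bch2_trans_iter_init() used below live in
 * the headers and are assumptions of this example, not definitions from this
 * file; restart handling is omitted here - see the sketch above
 * bch2_trans_begin():
 *
 *	struct btree_trans *trans = bch2_trans_get(c);
 *	struct btree_iter iter;
 *	struct bkey_s_c k;
 *	int ret;
 *
 *	bch2_trans_iter_init(trans, &iter, btree_id, pos, 0);
 *	k = bch2_btree_iter_peek_slot(&iter);
 *	ret = bkey_err(k);
 *	bch2_trans_iter_exit(trans, &iter);
 *	bch2_trans_put(trans);
 */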