1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "bkey_methods.h" 5 #include "bkey_buf.h" 6 #include "btree_cache.h" 7 #include "btree_iter.h" 8 #include "btree_journal_iter.h" 9 #include "btree_key_cache.h" 10 #include "btree_locking.h" 11 #include "btree_update.h" 12 #include "debug.h" 13 #include "error.h" 14 #include "extents.h" 15 #include "journal.h" 16 #include "replicas.h" 17 #include "snapshot.h" 18 #include "trace.h" 19 20 #include <linux/random.h> 21 #include <linux/prefetch.h> 22 23 static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); 24 static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, 25 struct btree_path *); 26 27 static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) 28 { 29 #ifdef TRACK_PATH_ALLOCATED 30 return iter->ip_allocated; 31 #else 32 return 0; 33 #endif 34 } 35 36 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); 37 38 static inline int __btree_path_cmp(const struct btree_path *l, 39 enum btree_id r_btree_id, 40 bool r_cached, 41 struct bpos r_pos, 42 unsigned r_level) 43 { 44 /* 45 * Must match lock ordering as defined by __bch2_btree_node_lock: 46 */ 47 return cmp_int(l->btree_id, r_btree_id) ?: 48 cmp_int((int) l->cached, (int) r_cached) ?: 49 bpos_cmp(l->pos, r_pos) ?: 50 -cmp_int(l->level, r_level); 51 } 52 53 static inline int btree_path_cmp(const struct btree_path *l, 54 const struct btree_path *r) 55 { 56 return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); 57 } 58 59 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) 60 { 61 /* Are we iterating over keys in all snapshots? */ 62 if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { 63 p = bpos_successor(p); 64 } else { 65 p = bpos_nosnap_successor(p); 66 p.snapshot = iter->snapshot; 67 } 68 69 return p; 70 } 71 72 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) 73 { 74 /* Are we iterating over keys in all snapshots? */ 75 if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { 76 p = bpos_predecessor(p); 77 } else { 78 p = bpos_nosnap_predecessor(p); 79 p.snapshot = iter->snapshot; 80 } 81 82 return p; 83 } 84 85 static inline struct bpos btree_iter_search_key(struct btree_iter *iter) 86 { 87 struct bpos pos = iter->pos; 88 89 if ((iter->flags & BTREE_ITER_IS_EXTENTS) && 90 !bkey_eq(pos, POS_MAX)) 91 pos = bkey_successor(iter, pos); 92 return pos; 93 } 94 95 static inline bool btree_path_pos_before_node(struct btree_path *path, 96 struct btree *b) 97 { 98 return bpos_lt(path->pos, b->data->min_key); 99 } 100 101 static inline bool btree_path_pos_after_node(struct btree_path *path, 102 struct btree *b) 103 { 104 return bpos_gt(path->pos, b->key.k.p); 105 } 106 107 static inline bool btree_path_pos_in_node(struct btree_path *path, 108 struct btree *b) 109 { 110 return path->btree_id == b->c.btree_id && 111 !btree_path_pos_before_node(path, b) && 112 !btree_path_pos_after_node(path, b); 113 } 114 115 /* Btree iterator: */ 116 117 #ifdef CONFIG_BCACHEFS_DEBUG 118 119 static void bch2_btree_path_verify_cached(struct btree_trans *trans, 120 struct btree_path *path) 121 { 122 struct bkey_cached *ck; 123 bool locked = btree_node_locked(path, 0); 124 125 if (!bch2_btree_node_relock(trans, path, 0)) 126 return; 127 128 ck = (void *) path->l[0].b; 129 BUG_ON(ck->key.btree_id != path->btree_id || 130 !bkey_eq(ck->key.pos, path->pos)); 131 132 if (!locked) 133 btree_node_unlock(trans, path, 0); 134 } 135 136 static void bch2_btree_path_verify_level(struct btree_trans *trans, 137 struct btree_path *path, unsigned level) 138 { 139 struct btree_path_level *l; 140 struct btree_node_iter tmp; 141 bool locked; 142 struct bkey_packed *p, *k; 143 struct printbuf buf1 = PRINTBUF; 144 struct printbuf buf2 = PRINTBUF; 145 struct printbuf buf3 = PRINTBUF; 146 const char *msg; 147 148 if (!bch2_debug_check_iterators) 149 return; 150 151 l = &path->l[level]; 152 tmp = l->iter; 153 locked = btree_node_locked(path, level); 154 155 if (path->cached) { 156 if (!level) 157 bch2_btree_path_verify_cached(trans, path); 158 return; 159 } 160 161 if (!btree_path_node(path, level)) 162 return; 163 164 if (!bch2_btree_node_relock_notrace(trans, path, level)) 165 return; 166 167 BUG_ON(!btree_path_pos_in_node(path, l->b)); 168 169 bch2_btree_node_iter_verify(&l->iter, l->b); 170 171 /* 172 * For interior nodes, the iterator will have skipped past deleted keys: 173 */ 174 p = level 175 ? bch2_btree_node_iter_prev(&tmp, l->b) 176 : bch2_btree_node_iter_prev_all(&tmp, l->b); 177 k = bch2_btree_node_iter_peek_all(&l->iter, l->b); 178 179 if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { 180 msg = "before"; 181 goto err; 182 } 183 184 if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { 185 msg = "after"; 186 goto err; 187 } 188 189 if (!locked) 190 btree_node_unlock(trans, path, level); 191 return; 192 err: 193 bch2_bpos_to_text(&buf1, path->pos); 194 195 if (p) { 196 struct bkey uk = bkey_unpack_key(l->b, p); 197 198 bch2_bkey_to_text(&buf2, &uk); 199 } else { 200 prt_printf(&buf2, "(none)"); 201 } 202 203 if (k) { 204 struct bkey uk = bkey_unpack_key(l->b, k); 205 206 bch2_bkey_to_text(&buf3, &uk); 207 } else { 208 prt_printf(&buf3, "(none)"); 209 } 210 211 panic("path should be %s key at level %u:\n" 212 "path pos %s\n" 213 "prev key %s\n" 214 "cur key %s\n", 215 msg, level, buf1.buf, buf2.buf, buf3.buf); 216 } 217 218 static void bch2_btree_path_verify(struct btree_trans *trans, 219 struct btree_path *path) 220 { 221 struct bch_fs *c = trans->c; 222 unsigned i; 223 224 EBUG_ON(path->btree_id >= BTREE_ID_NR); 225 226 for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { 227 if (!path->l[i].b) { 228 BUG_ON(!path->cached && 229 bch2_btree_id_root(c, path->btree_id)->b->c.level > i); 230 break; 231 } 232 233 bch2_btree_path_verify_level(trans, path, i); 234 } 235 236 bch2_btree_path_verify_locks(path); 237 } 238 239 void bch2_trans_verify_paths(struct btree_trans *trans) 240 { 241 struct btree_path *path; 242 243 trans_for_each_path(trans, path) 244 bch2_btree_path_verify(trans, path); 245 } 246 247 static void bch2_btree_iter_verify(struct btree_iter *iter) 248 { 249 struct btree_trans *trans = iter->trans; 250 251 BUG_ON(iter->btree_id >= BTREE_ID_NR); 252 253 BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); 254 255 BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && 256 (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); 257 258 BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && 259 (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && 260 !btree_type_has_snapshot_field(iter->btree_id)); 261 262 if (iter->update_path) 263 bch2_btree_path_verify(trans, iter->update_path); 264 bch2_btree_path_verify(trans, iter->path); 265 } 266 267 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) 268 { 269 BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && 270 !iter->pos.snapshot); 271 272 BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && 273 iter->pos.snapshot != iter->snapshot); 274 275 BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || 276 bkey_gt(iter->pos, iter->k.p)); 277 } 278 279 static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) 280 { 281 struct btree_trans *trans = iter->trans; 282 struct btree_iter copy; 283 struct bkey_s_c prev; 284 int ret = 0; 285 286 if (!bch2_debug_check_iterators) 287 return 0; 288 289 if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) 290 return 0; 291 292 if (bkey_err(k) || !k.k) 293 return 0; 294 295 BUG_ON(!bch2_snapshot_is_ancestor(trans->c, 296 iter->snapshot, 297 k.k->p.snapshot)); 298 299 bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, 300 BTREE_ITER_NOPRESERVE| 301 BTREE_ITER_ALL_SNAPSHOTS); 302 prev = bch2_btree_iter_prev(©); 303 if (!prev.k) 304 goto out; 305 306 ret = bkey_err(prev); 307 if (ret) 308 goto out; 309 310 if (bkey_eq(prev.k->p, k.k->p) && 311 bch2_snapshot_is_ancestor(trans->c, iter->snapshot, 312 prev.k->p.snapshot) > 0) { 313 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 314 315 bch2_bkey_to_text(&buf1, k.k); 316 bch2_bkey_to_text(&buf2, prev.k); 317 318 panic("iter snap %u\n" 319 "k %s\n" 320 "prev %s\n", 321 iter->snapshot, 322 buf1.buf, buf2.buf); 323 } 324 out: 325 bch2_trans_iter_exit(trans, ©); 326 return ret; 327 } 328 329 void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, 330 struct bpos pos, bool key_cache) 331 { 332 struct btree_path *path; 333 unsigned idx; 334 struct printbuf buf = PRINTBUF; 335 336 btree_trans_sort_paths(trans); 337 338 trans_for_each_path_inorder(trans, path, idx) { 339 int cmp = cmp_int(path->btree_id, id) ?: 340 cmp_int(path->cached, key_cache); 341 342 if (cmp > 0) 343 break; 344 if (cmp < 0) 345 continue; 346 347 if (!btree_node_locked(path, 0) || 348 !path->should_be_locked) 349 continue; 350 351 if (!key_cache) { 352 if (bkey_ge(pos, path->l[0].b->data->min_key) && 353 bkey_le(pos, path->l[0].b->key.k.p)) 354 return; 355 } else { 356 if (bkey_eq(pos, path->pos)) 357 return; 358 } 359 } 360 361 bch2_dump_trans_paths_updates(trans); 362 bch2_bpos_to_text(&buf, pos); 363 364 panic("not locked: %s %s%s\n", 365 bch2_btree_id_str(id), buf.buf, 366 key_cache ? " cached" : ""); 367 } 368 369 #else 370 371 static inline void bch2_btree_path_verify_level(struct btree_trans *trans, 372 struct btree_path *path, unsigned l) {} 373 static inline void bch2_btree_path_verify(struct btree_trans *trans, 374 struct btree_path *path) {} 375 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} 376 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} 377 static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } 378 379 #endif 380 381 /* Btree path: fixups after btree updates */ 382 383 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, 384 struct btree *b, 385 struct bset_tree *t, 386 struct bkey_packed *k) 387 { 388 struct btree_node_iter_set *set; 389 390 btree_node_iter_for_each(iter, set) 391 if (set->end == t->end_offset) { 392 set->k = __btree_node_key_to_offset(b, k); 393 bch2_btree_node_iter_sort(iter, b); 394 return; 395 } 396 397 bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); 398 } 399 400 static void __bch2_btree_path_fix_key_modified(struct btree_path *path, 401 struct btree *b, 402 struct bkey_packed *where) 403 { 404 struct btree_path_level *l = &path->l[b->c.level]; 405 406 if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) 407 return; 408 409 if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) 410 bch2_btree_node_iter_advance(&l->iter, l->b); 411 } 412 413 void bch2_btree_path_fix_key_modified(struct btree_trans *trans, 414 struct btree *b, 415 struct bkey_packed *where) 416 { 417 struct btree_path *path; 418 419 trans_for_each_path_with_node(trans, b, path) { 420 __bch2_btree_path_fix_key_modified(path, b, where); 421 bch2_btree_path_verify_level(trans, path, b->c.level); 422 } 423 } 424 425 static void __bch2_btree_node_iter_fix(struct btree_path *path, 426 struct btree *b, 427 struct btree_node_iter *node_iter, 428 struct bset_tree *t, 429 struct bkey_packed *where, 430 unsigned clobber_u64s, 431 unsigned new_u64s) 432 { 433 const struct bkey_packed *end = btree_bkey_last(b, t); 434 struct btree_node_iter_set *set; 435 unsigned offset = __btree_node_key_to_offset(b, where); 436 int shift = new_u64s - clobber_u64s; 437 unsigned old_end = t->end_offset - shift; 438 unsigned orig_iter_pos = node_iter->data[0].k; 439 bool iter_current_key_modified = 440 orig_iter_pos >= offset && 441 orig_iter_pos <= offset + clobber_u64s; 442 443 btree_node_iter_for_each(node_iter, set) 444 if (set->end == old_end) 445 goto found; 446 447 /* didn't find the bset in the iterator - might have to readd it: */ 448 if (new_u64s && 449 bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { 450 bch2_btree_node_iter_push(node_iter, b, where, end); 451 goto fixup_done; 452 } else { 453 /* Iterator is after key that changed */ 454 return; 455 } 456 found: 457 set->end = t->end_offset; 458 459 /* Iterator hasn't gotten to the key that changed yet: */ 460 if (set->k < offset) 461 return; 462 463 if (new_u64s && 464 bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { 465 set->k = offset; 466 } else if (set->k < offset + clobber_u64s) { 467 set->k = offset + new_u64s; 468 if (set->k == set->end) 469 bch2_btree_node_iter_set_drop(node_iter, set); 470 } else { 471 /* Iterator is after key that changed */ 472 set->k = (int) set->k + shift; 473 return; 474 } 475 476 bch2_btree_node_iter_sort(node_iter, b); 477 fixup_done: 478 if (node_iter->data[0].k != orig_iter_pos) 479 iter_current_key_modified = true; 480 481 /* 482 * When a new key is added, and the node iterator now points to that 483 * key, the iterator might have skipped past deleted keys that should 484 * come after the key the iterator now points to. We have to rewind to 485 * before those deleted keys - otherwise 486 * bch2_btree_node_iter_prev_all() breaks: 487 */ 488 if (!bch2_btree_node_iter_end(node_iter) && 489 iter_current_key_modified && 490 b->c.level) { 491 struct bkey_packed *k, *k2, *p; 492 493 k = bch2_btree_node_iter_peek_all(node_iter, b); 494 495 for_each_bset(b, t) { 496 bool set_pos = false; 497 498 if (node_iter->data[0].end == t->end_offset) 499 continue; 500 501 k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); 502 503 while ((p = bch2_bkey_prev_all(b, t, k2)) && 504 bkey_iter_cmp(b, k, p) < 0) { 505 k2 = p; 506 set_pos = true; 507 } 508 509 if (set_pos) 510 btree_node_iter_set_set_pos(node_iter, 511 b, t, k2); 512 } 513 } 514 } 515 516 void bch2_btree_node_iter_fix(struct btree_trans *trans, 517 struct btree_path *path, 518 struct btree *b, 519 struct btree_node_iter *node_iter, 520 struct bkey_packed *where, 521 unsigned clobber_u64s, 522 unsigned new_u64s) 523 { 524 struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); 525 struct btree_path *linked; 526 527 if (node_iter != &path->l[b->c.level].iter) { 528 __bch2_btree_node_iter_fix(path, b, node_iter, t, 529 where, clobber_u64s, new_u64s); 530 531 if (bch2_debug_check_iterators) 532 bch2_btree_node_iter_verify(node_iter, b); 533 } 534 535 trans_for_each_path_with_node(trans, b, linked) { 536 __bch2_btree_node_iter_fix(linked, b, 537 &linked->l[b->c.level].iter, t, 538 where, clobber_u64s, new_u64s); 539 bch2_btree_path_verify_level(trans, linked, b->c.level); 540 } 541 } 542 543 /* Btree path level: pointer to a particular btree node and node iter */ 544 545 static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, 546 struct btree_path_level *l, 547 struct bkey *u, 548 struct bkey_packed *k) 549 { 550 if (unlikely(!k)) { 551 /* 552 * signal to bch2_btree_iter_peek_slot() that we're currently at 553 * a hole 554 */ 555 u->type = KEY_TYPE_deleted; 556 return bkey_s_c_null; 557 } 558 559 return bkey_disassemble(l->b, k, u); 560 } 561 562 static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, 563 struct btree_path_level *l, 564 struct bkey *u) 565 { 566 return __btree_iter_unpack(c, l, u, 567 bch2_btree_node_iter_peek_all(&l->iter, l->b)); 568 } 569 570 static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, 571 struct btree_path *path, 572 struct btree_path_level *l, 573 struct bkey *u) 574 { 575 struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, 576 bch2_btree_node_iter_peek(&l->iter, l->b)); 577 578 path->pos = k.k ? k.k->p : l->b->key.k.p; 579 trans->paths_sorted = false; 580 bch2_btree_path_verify_level(trans, path, l - path->l); 581 return k; 582 } 583 584 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, 585 struct btree_path *path, 586 struct btree_path_level *l, 587 struct bkey *u) 588 { 589 struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, 590 bch2_btree_node_iter_prev(&l->iter, l->b)); 591 592 path->pos = k.k ? k.k->p : l->b->data->min_key; 593 trans->paths_sorted = false; 594 bch2_btree_path_verify_level(trans, path, l - path->l); 595 return k; 596 } 597 598 static inline bool btree_path_advance_to_pos(struct btree_path *path, 599 struct btree_path_level *l, 600 int max_advance) 601 { 602 struct bkey_packed *k; 603 int nr_advanced = 0; 604 605 while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && 606 bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { 607 if (max_advance > 0 && nr_advanced >= max_advance) 608 return false; 609 610 bch2_btree_node_iter_advance(&l->iter, l->b); 611 nr_advanced++; 612 } 613 614 return true; 615 } 616 617 static inline void __btree_path_level_init(struct btree_path *path, 618 unsigned level) 619 { 620 struct btree_path_level *l = &path->l[level]; 621 622 bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); 623 624 /* 625 * Iterators to interior nodes should always be pointed at the first non 626 * whiteout: 627 */ 628 if (level) 629 bch2_btree_node_iter_peek(&l->iter, l->b); 630 } 631 632 void bch2_btree_path_level_init(struct btree_trans *trans, 633 struct btree_path *path, 634 struct btree *b) 635 { 636 BUG_ON(path->cached); 637 638 EBUG_ON(!btree_path_pos_in_node(path, b)); 639 640 path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); 641 path->l[b->c.level].b = b; 642 __btree_path_level_init(path, b->c.level); 643 } 644 645 /* Btree path: fixups after btree node updates: */ 646 647 static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) 648 { 649 struct bch_fs *c = trans->c; 650 struct btree_insert_entry *i; 651 652 trans_for_each_update(trans, i) 653 if (!i->cached && 654 i->level == b->c.level && 655 i->btree_id == b->c.btree_id && 656 bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && 657 bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { 658 i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; 659 660 if (unlikely(trans->journal_replay_not_finished)) { 661 struct bkey_i *j_k = 662 bch2_journal_keys_peek_slot(c, i->btree_id, i->level, 663 i->k->k.p); 664 665 if (j_k) { 666 i->old_k = j_k->k; 667 i->old_v = &j_k->v; 668 } 669 } 670 } 671 } 672 673 /* 674 * A btree node is being replaced - update the iterator to point to the new 675 * node: 676 */ 677 void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) 678 { 679 struct btree_path *path; 680 681 trans_for_each_path(trans, path) 682 if (path->uptodate == BTREE_ITER_UPTODATE && 683 !path->cached && 684 btree_path_pos_in_node(path, b)) { 685 enum btree_node_locked_type t = 686 btree_lock_want(path, b->c.level); 687 688 if (t != BTREE_NODE_UNLOCKED) { 689 btree_node_unlock(trans, path, b->c.level); 690 six_lock_increment(&b->c.lock, (enum six_lock_type) t); 691 mark_btree_node_locked(trans, path, b->c.level, t); 692 } 693 694 bch2_btree_path_level_init(trans, path, b); 695 } 696 697 bch2_trans_revalidate_updates_in_node(trans, b); 698 } 699 700 /* 701 * A btree node has been modified in such a way as to invalidate iterators - fix 702 * them: 703 */ 704 void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) 705 { 706 struct btree_path *path; 707 708 trans_for_each_path_with_node(trans, b, path) 709 __btree_path_level_init(path, b->c.level); 710 711 bch2_trans_revalidate_updates_in_node(trans, b); 712 } 713 714 /* Btree path: traverse, set_pos: */ 715 716 static inline int btree_path_lock_root(struct btree_trans *trans, 717 struct btree_path *path, 718 unsigned depth_want, 719 unsigned long trace_ip) 720 { 721 struct bch_fs *c = trans->c; 722 struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; 723 enum six_lock_type lock_type; 724 unsigned i; 725 int ret; 726 727 EBUG_ON(path->nodes_locked); 728 729 while (1) { 730 b = READ_ONCE(*rootp); 731 path->level = READ_ONCE(b->c.level); 732 733 if (unlikely(path->level < depth_want)) { 734 /* 735 * the root is at a lower depth than the depth we want: 736 * got to the end of the btree, or we're walking nodes 737 * greater than some depth and there are no nodes >= 738 * that depth 739 */ 740 path->level = depth_want; 741 for (i = path->level; i < BTREE_MAX_DEPTH; i++) 742 path->l[i].b = NULL; 743 return 1; 744 } 745 746 lock_type = __btree_lock_want(path, path->level); 747 ret = btree_node_lock(trans, path, &b->c, 748 path->level, lock_type, trace_ip); 749 if (unlikely(ret)) { 750 if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) 751 continue; 752 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 753 return ret; 754 BUG(); 755 } 756 757 if (likely(b == READ_ONCE(*rootp) && 758 b->c.level == path->level && 759 !race_fault())) { 760 for (i = 0; i < path->level; i++) 761 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); 762 path->l[path->level].b = b; 763 for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) 764 path->l[i].b = NULL; 765 766 mark_btree_node_locked(trans, path, path->level, 767 (enum btree_node_locked_type) lock_type); 768 bch2_btree_path_level_init(trans, path, b); 769 return 0; 770 } 771 772 six_unlock_type(&b->c.lock, lock_type); 773 } 774 } 775 776 noinline 777 static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) 778 { 779 struct bch_fs *c = trans->c; 780 struct btree_path_level *l = path_l(path); 781 struct btree_node_iter node_iter = l->iter; 782 struct bkey_packed *k; 783 struct bkey_buf tmp; 784 unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) 785 ? (path->level > 1 ? 0 : 2) 786 : (path->level > 1 ? 1 : 16); 787 bool was_locked = btree_node_locked(path, path->level); 788 int ret = 0; 789 790 bch2_bkey_buf_init(&tmp); 791 792 while (nr-- && !ret) { 793 if (!bch2_btree_node_relock(trans, path, path->level)) 794 break; 795 796 bch2_btree_node_iter_advance(&node_iter, l->b); 797 k = bch2_btree_node_iter_peek(&node_iter, l->b); 798 if (!k) 799 break; 800 801 bch2_bkey_buf_unpack(&tmp, c, l->b, k); 802 ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, 803 path->level - 1); 804 } 805 806 if (!was_locked) 807 btree_node_unlock(trans, path, path->level); 808 809 bch2_bkey_buf_exit(&tmp, c); 810 return ret; 811 } 812 813 static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, 814 struct btree_and_journal_iter *jiter) 815 { 816 struct bch_fs *c = trans->c; 817 struct bkey_s_c k; 818 struct bkey_buf tmp; 819 unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) 820 ? (path->level > 1 ? 0 : 2) 821 : (path->level > 1 ? 1 : 16); 822 bool was_locked = btree_node_locked(path, path->level); 823 int ret = 0; 824 825 bch2_bkey_buf_init(&tmp); 826 827 while (nr-- && !ret) { 828 if (!bch2_btree_node_relock(trans, path, path->level)) 829 break; 830 831 bch2_btree_and_journal_iter_advance(jiter); 832 k = bch2_btree_and_journal_iter_peek(jiter); 833 if (!k.k) 834 break; 835 836 bch2_bkey_buf_reassemble(&tmp, c, k); 837 ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, 838 path->level - 1); 839 } 840 841 if (!was_locked) 842 btree_node_unlock(trans, path, path->level); 843 844 bch2_bkey_buf_exit(&tmp, c); 845 return ret; 846 } 847 848 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, 849 struct btree_path *path, 850 unsigned plevel, struct btree *b) 851 { 852 struct btree_path_level *l = &path->l[plevel]; 853 bool locked = btree_node_locked(path, plevel); 854 struct bkey_packed *k; 855 struct bch_btree_ptr_v2 *bp; 856 857 if (!bch2_btree_node_relock(trans, path, plevel)) 858 return; 859 860 k = bch2_btree_node_iter_peek_all(&l->iter, l->b); 861 BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); 862 863 bp = (void *) bkeyp_val(&l->b->format, k); 864 bp->mem_ptr = (unsigned long)b; 865 866 if (!locked) 867 btree_node_unlock(trans, path, plevel); 868 } 869 870 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, 871 struct btree_path *path, 872 unsigned flags, 873 struct bkey_buf *out) 874 { 875 struct bch_fs *c = trans->c; 876 struct btree_path_level *l = path_l(path); 877 struct btree_and_journal_iter jiter; 878 struct bkey_s_c k; 879 int ret = 0; 880 881 __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); 882 883 k = bch2_btree_and_journal_iter_peek(&jiter); 884 885 bch2_bkey_buf_reassemble(out, c, k); 886 887 if (flags & BTREE_ITER_PREFETCH) 888 ret = btree_path_prefetch_j(trans, path, &jiter); 889 890 bch2_btree_and_journal_iter_exit(&jiter); 891 return ret; 892 } 893 894 static __always_inline int btree_path_down(struct btree_trans *trans, 895 struct btree_path *path, 896 unsigned flags, 897 unsigned long trace_ip) 898 { 899 struct bch_fs *c = trans->c; 900 struct btree_path_level *l = path_l(path); 901 struct btree *b; 902 unsigned level = path->level - 1; 903 enum six_lock_type lock_type = __btree_lock_want(path, level); 904 struct bkey_buf tmp; 905 int ret; 906 907 EBUG_ON(!btree_node_locked(path, path->level)); 908 909 bch2_bkey_buf_init(&tmp); 910 911 if (unlikely(trans->journal_replay_not_finished)) { 912 ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); 913 if (ret) 914 goto err; 915 } else { 916 bch2_bkey_buf_unpack(&tmp, c, l->b, 917 bch2_btree_node_iter_peek(&l->iter, l->b)); 918 919 if (flags & BTREE_ITER_PREFETCH) { 920 ret = btree_path_prefetch(trans, path); 921 if (ret) 922 goto err; 923 } 924 } 925 926 b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); 927 ret = PTR_ERR_OR_ZERO(b); 928 if (unlikely(ret)) 929 goto err; 930 931 if (likely(!trans->journal_replay_not_finished && 932 tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && 933 unlikely(b != btree_node_mem_ptr(tmp.k))) 934 btree_node_mem_ptr_set(trans, path, level + 1, b); 935 936 if (btree_node_read_locked(path, level + 1)) 937 btree_node_unlock(trans, path, level + 1); 938 939 mark_btree_node_locked(trans, path, level, 940 (enum btree_node_locked_type) lock_type); 941 path->level = level; 942 bch2_btree_path_level_init(trans, path, b); 943 944 bch2_btree_path_verify_locks(path); 945 err: 946 bch2_bkey_buf_exit(&tmp, c); 947 return ret; 948 } 949 950 951 static int bch2_btree_path_traverse_all(struct btree_trans *trans) 952 { 953 struct bch_fs *c = trans->c; 954 struct btree_path *path; 955 unsigned long trace_ip = _RET_IP_; 956 int i, ret = 0; 957 958 if (trans->in_traverse_all) 959 return -BCH_ERR_transaction_restart_in_traverse_all; 960 961 trans->in_traverse_all = true; 962 retry_all: 963 trans->restarted = 0; 964 trans->last_restarted_ip = 0; 965 966 trans_for_each_path(trans, path) 967 path->should_be_locked = false; 968 969 btree_trans_sort_paths(trans); 970 971 bch2_trans_unlock(trans); 972 cond_resched(); 973 974 if (unlikely(trans->memory_allocation_failure)) { 975 struct closure cl; 976 977 closure_init_stack(&cl); 978 979 do { 980 ret = bch2_btree_cache_cannibalize_lock(c, &cl); 981 closure_sync(&cl); 982 } while (ret); 983 } 984 985 /* Now, redo traversals in correct order: */ 986 i = 0; 987 while (i < trans->nr_sorted) { 988 path = trans->paths + trans->sorted[i]; 989 990 /* 991 * Traversing a path can cause another path to be added at about 992 * the same position: 993 */ 994 if (path->uptodate) { 995 __btree_path_get(path, false); 996 ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); 997 __btree_path_put(path, false); 998 999 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || 1000 bch2_err_matches(ret, ENOMEM)) 1001 goto retry_all; 1002 if (ret) 1003 goto err; 1004 } else { 1005 i++; 1006 } 1007 } 1008 1009 /* 1010 * We used to assert that all paths had been traversed here 1011 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since 1012 * path->should_be_locked is not set yet, we might have unlocked and 1013 * then failed to relock a path - that's fine. 1014 */ 1015 err: 1016 bch2_btree_cache_cannibalize_unlock(c); 1017 1018 trans->in_traverse_all = false; 1019 1020 trace_and_count(c, trans_traverse_all, trans, trace_ip); 1021 return ret; 1022 } 1023 1024 static inline bool btree_path_check_pos_in_node(struct btree_path *path, 1025 unsigned l, int check_pos) 1026 { 1027 if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) 1028 return false; 1029 if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) 1030 return false; 1031 return true; 1032 } 1033 1034 static inline bool btree_path_good_node(struct btree_trans *trans, 1035 struct btree_path *path, 1036 unsigned l, int check_pos) 1037 { 1038 return is_btree_node(path, l) && 1039 bch2_btree_node_relock(trans, path, l) && 1040 btree_path_check_pos_in_node(path, l, check_pos); 1041 } 1042 1043 static void btree_path_set_level_down(struct btree_trans *trans, 1044 struct btree_path *path, 1045 unsigned new_level) 1046 { 1047 unsigned l; 1048 1049 path->level = new_level; 1050 1051 for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) 1052 if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) 1053 btree_node_unlock(trans, path, l); 1054 1055 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); 1056 bch2_btree_path_verify(trans, path); 1057 } 1058 1059 static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, 1060 struct btree_path *path, 1061 int check_pos) 1062 { 1063 unsigned i, l = path->level; 1064 again: 1065 while (btree_path_node(path, l) && 1066 !btree_path_good_node(trans, path, l, check_pos)) 1067 __btree_path_set_level_up(trans, path, l++); 1068 1069 /* If we need intent locks, take them too: */ 1070 for (i = l + 1; 1071 i < path->locks_want && btree_path_node(path, i); 1072 i++) 1073 if (!bch2_btree_node_relock(trans, path, i)) { 1074 while (l <= i) 1075 __btree_path_set_level_up(trans, path, l++); 1076 goto again; 1077 } 1078 1079 return l; 1080 } 1081 1082 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, 1083 struct btree_path *path, 1084 int check_pos) 1085 { 1086 return likely(btree_node_locked(path, path->level) && 1087 btree_path_check_pos_in_node(path, path->level, check_pos)) 1088 ? path->level 1089 : __btree_path_up_until_good_node(trans, path, check_pos); 1090 } 1091 1092 /* 1093 * This is the main state machine for walking down the btree - walks down to a 1094 * specified depth 1095 * 1096 * Returns 0 on success, -EIO on error (error reading in a btree node). 1097 * 1098 * On error, caller (peek_node()/peek_key()) must return NULL; the error is 1099 * stashed in the iterator and returned from bch2_trans_exit(). 1100 */ 1101 int bch2_btree_path_traverse_one(struct btree_trans *trans, 1102 struct btree_path *path, 1103 unsigned flags, 1104 unsigned long trace_ip) 1105 { 1106 unsigned depth_want = path->level; 1107 int ret = -((int) trans->restarted); 1108 1109 if (unlikely(ret)) 1110 goto out; 1111 1112 if (unlikely(!trans->srcu_held)) 1113 bch2_trans_srcu_lock(trans); 1114 1115 /* 1116 * Ensure we obey path->should_be_locked: if it's set, we can't unlock 1117 * and re-traverse the path without a transaction restart: 1118 */ 1119 if (path->should_be_locked) { 1120 ret = bch2_btree_path_relock(trans, path, trace_ip); 1121 goto out; 1122 } 1123 1124 if (path->cached) { 1125 ret = bch2_btree_path_traverse_cached(trans, path, flags); 1126 goto out; 1127 } 1128 1129 if (unlikely(path->level >= BTREE_MAX_DEPTH)) 1130 goto out; 1131 1132 path->level = btree_path_up_until_good_node(trans, path, 0); 1133 1134 EBUG_ON(btree_path_node(path, path->level) && 1135 !btree_node_locked(path, path->level)); 1136 1137 /* 1138 * Note: path->nodes[path->level] may be temporarily NULL here - that 1139 * would indicate to other code that we got to the end of the btree, 1140 * here it indicates that relocking the root failed - it's critical that 1141 * btree_path_lock_root() comes next and that it can't fail 1142 */ 1143 while (path->level > depth_want) { 1144 ret = btree_path_node(path, path->level) 1145 ? btree_path_down(trans, path, flags, trace_ip) 1146 : btree_path_lock_root(trans, path, depth_want, trace_ip); 1147 if (unlikely(ret)) { 1148 if (ret == 1) { 1149 /* 1150 * No nodes at this level - got to the end of 1151 * the btree: 1152 */ 1153 ret = 0; 1154 goto out; 1155 } 1156 1157 __bch2_btree_path_unlock(trans, path); 1158 path->level = depth_want; 1159 path->l[path->level].b = ERR_PTR(ret); 1160 goto out; 1161 } 1162 } 1163 1164 path->uptodate = BTREE_ITER_UPTODATE; 1165 out: 1166 if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) 1167 panic("ret %s (%i) trans->restarted %s (%i)\n", 1168 bch2_err_str(ret), ret, 1169 bch2_err_str(trans->restarted), trans->restarted); 1170 bch2_btree_path_verify(trans, path); 1171 return ret; 1172 } 1173 1174 static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, 1175 struct btree_path *src) 1176 { 1177 unsigned i, offset = offsetof(struct btree_path, pos); 1178 1179 memcpy((void *) dst + offset, 1180 (void *) src + offset, 1181 sizeof(struct btree_path) - offset); 1182 1183 for (i = 0; i < BTREE_MAX_DEPTH; i++) { 1184 unsigned t = btree_node_locked_type(dst, i); 1185 1186 if (t != BTREE_NODE_UNLOCKED) 1187 six_lock_increment(&dst->l[i].b->c.lock, t); 1188 } 1189 } 1190 1191 static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, 1192 bool intent) 1193 { 1194 struct btree_path *new = btree_path_alloc(trans, src); 1195 1196 btree_path_copy(trans, new, src); 1197 __btree_path_get(new, intent); 1198 return new; 1199 } 1200 1201 __flatten 1202 struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, 1203 struct btree_path *path, bool intent, 1204 unsigned long ip) 1205 { 1206 __btree_path_put(path, intent); 1207 path = btree_path_clone(trans, path, intent); 1208 path->preserve = false; 1209 return path; 1210 } 1211 1212 struct btree_path * __must_check 1213 __bch2_btree_path_set_pos(struct btree_trans *trans, 1214 struct btree_path *path, struct bpos new_pos, 1215 bool intent, unsigned long ip, int cmp) 1216 { 1217 unsigned level = path->level; 1218 1219 bch2_trans_verify_not_in_restart(trans); 1220 EBUG_ON(!path->ref); 1221 1222 path = bch2_btree_path_make_mut(trans, path, intent, ip); 1223 1224 path->pos = new_pos; 1225 trans->paths_sorted = false; 1226 1227 if (unlikely(path->cached)) { 1228 btree_node_unlock(trans, path, 0); 1229 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); 1230 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); 1231 goto out; 1232 } 1233 1234 level = btree_path_up_until_good_node(trans, path, cmp); 1235 1236 if (btree_path_node(path, level)) { 1237 struct btree_path_level *l = &path->l[level]; 1238 1239 BUG_ON(!btree_node_locked(path, level)); 1240 /* 1241 * We might have to skip over many keys, or just a few: try 1242 * advancing the node iterator, and if we have to skip over too 1243 * many keys just reinit it (or if we're rewinding, since that 1244 * is expensive). 1245 */ 1246 if (cmp < 0 || 1247 !btree_path_advance_to_pos(path, l, 8)) 1248 bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); 1249 1250 /* 1251 * Iterators to interior nodes should always be pointed at the first non 1252 * whiteout: 1253 */ 1254 if (unlikely(level)) 1255 bch2_btree_node_iter_peek(&l->iter, l->b); 1256 } 1257 1258 if (unlikely(level != path->level)) { 1259 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); 1260 __bch2_btree_path_unlock(trans, path); 1261 } 1262 out: 1263 bch2_btree_path_verify(trans, path); 1264 return path; 1265 } 1266 1267 /* Btree path: main interface: */ 1268 1269 static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) 1270 { 1271 struct btree_path *sib; 1272 1273 sib = prev_btree_path(trans, path); 1274 if (sib && !btree_path_cmp(sib, path)) 1275 return sib; 1276 1277 sib = next_btree_path(trans, path); 1278 if (sib && !btree_path_cmp(sib, path)) 1279 return sib; 1280 1281 return NULL; 1282 } 1283 1284 static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) 1285 { 1286 struct btree_path *sib; 1287 1288 sib = prev_btree_path(trans, path); 1289 if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) 1290 return sib; 1291 1292 sib = next_btree_path(trans, path); 1293 if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) 1294 return sib; 1295 1296 return NULL; 1297 } 1298 1299 static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) 1300 { 1301 __bch2_btree_path_unlock(trans, path); 1302 btree_path_list_remove(trans, path); 1303 trans->paths_allocated &= ~(1ULL << path->idx); 1304 } 1305 1306 void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) 1307 { 1308 struct btree_path *dup; 1309 1310 EBUG_ON(trans->paths + path->idx != path); 1311 EBUG_ON(!path->ref); 1312 1313 if (!__btree_path_put(path, intent)) 1314 return; 1315 1316 dup = path->preserve 1317 ? have_path_at_pos(trans, path) 1318 : have_node_at_pos(trans, path); 1319 1320 if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) 1321 return; 1322 1323 if (path->should_be_locked && 1324 !trans->restarted && 1325 (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) 1326 return; 1327 1328 if (dup) { 1329 dup->preserve |= path->preserve; 1330 dup->should_be_locked |= path->should_be_locked; 1331 } 1332 1333 __bch2_path_free(trans, path); 1334 } 1335 1336 static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, 1337 bool intent) 1338 { 1339 EBUG_ON(trans->paths + path->idx != path); 1340 EBUG_ON(!path->ref); 1341 1342 if (!__btree_path_put(path, intent)) 1343 return; 1344 1345 __bch2_path_free(trans, path); 1346 } 1347 1348 void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) 1349 { 1350 panic("trans->restart_count %u, should be %u, last restarted by %pS\n", 1351 trans->restart_count, restart_count, 1352 (void *) trans->last_begin_ip); 1353 } 1354 1355 void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) 1356 { 1357 panic("in transaction restart: %s, last restarted by %pS\n", 1358 bch2_err_str(trans->restarted), 1359 (void *) trans->last_restarted_ip); 1360 } 1361 1362 noinline __cold 1363 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) 1364 { 1365 struct btree_insert_entry *i; 1366 struct btree_write_buffered_key *wb; 1367 1368 prt_printf(buf, "transaction updates for %s journal seq %llu", 1369 trans->fn, trans->journal_res.seq); 1370 prt_newline(buf); 1371 printbuf_indent_add(buf, 2); 1372 1373 trans_for_each_update(trans, i) { 1374 struct bkey_s_c old = { &i->old_k, i->old_v }; 1375 1376 prt_printf(buf, "update: btree=%s cached=%u %pS", 1377 bch2_btree_id_str(i->btree_id), 1378 i->cached, 1379 (void *) i->ip_allocated); 1380 prt_newline(buf); 1381 1382 prt_printf(buf, " old "); 1383 bch2_bkey_val_to_text(buf, trans->c, old); 1384 prt_newline(buf); 1385 1386 prt_printf(buf, " new "); 1387 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); 1388 prt_newline(buf); 1389 } 1390 1391 trans_for_each_wb_update(trans, wb) { 1392 prt_printf(buf, "update: btree=%s wb=1 %pS", 1393 bch2_btree_id_str(wb->btree), 1394 (void *) i->ip_allocated); 1395 prt_newline(buf); 1396 1397 prt_printf(buf, " new "); 1398 bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); 1399 prt_newline(buf); 1400 } 1401 1402 printbuf_indent_sub(buf, 2); 1403 } 1404 1405 noinline __cold 1406 void bch2_dump_trans_updates(struct btree_trans *trans) 1407 { 1408 struct printbuf buf = PRINTBUF; 1409 1410 bch2_trans_updates_to_text(&buf, trans); 1411 bch2_print_string_as_lines(KERN_ERR, buf.buf); 1412 printbuf_exit(&buf); 1413 } 1414 1415 noinline __cold 1416 void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) 1417 { 1418 prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", 1419 path->idx, path->ref, path->intent_ref, 1420 path->preserve ? 'P' : ' ', 1421 path->should_be_locked ? 'S' : ' ', 1422 bch2_btree_id_str(path->btree_id), 1423 path->level); 1424 bch2_bpos_to_text(out, path->pos); 1425 1426 prt_printf(out, " locks %u", path->nodes_locked); 1427 #ifdef TRACK_PATH_ALLOCATED 1428 prt_printf(out, " %pS", (void *) path->ip_allocated); 1429 #endif 1430 prt_newline(out); 1431 } 1432 1433 static noinline __cold 1434 void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, 1435 bool nosort) 1436 { 1437 struct btree_path *path; 1438 unsigned idx; 1439 1440 if (!nosort) 1441 btree_trans_sort_paths(trans); 1442 1443 trans_for_each_path_inorder(trans, path, idx) 1444 bch2_btree_path_to_text(out, path); 1445 } 1446 1447 noinline __cold 1448 void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) 1449 { 1450 __bch2_trans_paths_to_text(out, trans, false); 1451 } 1452 1453 static noinline __cold 1454 void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) 1455 { 1456 struct printbuf buf = PRINTBUF; 1457 1458 __bch2_trans_paths_to_text(&buf, trans, nosort); 1459 bch2_trans_updates_to_text(&buf, trans); 1460 1461 bch2_print_string_as_lines(KERN_ERR, buf.buf); 1462 printbuf_exit(&buf); 1463 } 1464 1465 noinline __cold 1466 void bch2_dump_trans_paths_updates(struct btree_trans *trans) 1467 { 1468 __bch2_dump_trans_paths_updates(trans, false); 1469 } 1470 1471 noinline __cold 1472 static void bch2_trans_update_max_paths(struct btree_trans *trans) 1473 { 1474 struct btree_transaction_stats *s = btree_trans_stats(trans); 1475 struct printbuf buf = PRINTBUF; 1476 1477 if (!s) 1478 return; 1479 1480 bch2_trans_paths_to_text(&buf, trans); 1481 1482 if (!buf.allocation_failure) { 1483 mutex_lock(&s->lock); 1484 if (s->nr_max_paths < hweight64(trans->paths_allocated)) { 1485 s->nr_max_paths = trans->nr_max_paths = 1486 hweight64(trans->paths_allocated); 1487 swap(s->max_paths_text, buf.buf); 1488 } 1489 mutex_unlock(&s->lock); 1490 } 1491 1492 printbuf_exit(&buf); 1493 1494 trans->nr_max_paths = hweight64(trans->paths_allocated); 1495 } 1496 1497 static noinline void btree_path_overflow(struct btree_trans *trans) 1498 { 1499 bch2_dump_trans_paths_updates(trans); 1500 panic("trans path overflow\n"); 1501 } 1502 1503 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, 1504 struct btree_path *pos) 1505 { 1506 struct btree_path *path; 1507 unsigned idx; 1508 1509 if (unlikely(trans->paths_allocated == 1510 ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) 1511 btree_path_overflow(trans); 1512 1513 idx = __ffs64(~trans->paths_allocated); 1514 1515 /* 1516 * Do this before marking the new path as allocated, since it won't be 1517 * initialized yet: 1518 */ 1519 if (unlikely(idx > trans->nr_max_paths)) 1520 bch2_trans_update_max_paths(trans); 1521 1522 trans->paths_allocated |= 1ULL << idx; 1523 1524 path = &trans->paths[idx]; 1525 path->idx = idx; 1526 path->ref = 0; 1527 path->intent_ref = 0; 1528 path->nodes_locked = 0; 1529 path->alloc_seq++; 1530 1531 btree_path_list_add(trans, pos, path); 1532 trans->paths_sorted = false; 1533 return path; 1534 } 1535 1536 struct btree_path *bch2_path_get(struct btree_trans *trans, 1537 enum btree_id btree_id, struct bpos pos, 1538 unsigned locks_want, unsigned level, 1539 unsigned flags, unsigned long ip) 1540 { 1541 struct btree_path *path, *path_pos = NULL; 1542 bool cached = flags & BTREE_ITER_CACHED; 1543 bool intent = flags & BTREE_ITER_INTENT; 1544 int i; 1545 1546 bch2_trans_verify_not_in_restart(trans); 1547 bch2_trans_verify_locks(trans); 1548 1549 btree_trans_sort_paths(trans); 1550 1551 trans_for_each_path_inorder(trans, path, i) { 1552 if (__btree_path_cmp(path, 1553 btree_id, 1554 cached, 1555 pos, 1556 level) > 0) 1557 break; 1558 1559 path_pos = path; 1560 } 1561 1562 if (path_pos && 1563 path_pos->cached == cached && 1564 path_pos->btree_id == btree_id && 1565 path_pos->level == level) { 1566 __btree_path_get(path_pos, intent); 1567 path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); 1568 } else { 1569 path = btree_path_alloc(trans, path_pos); 1570 path_pos = NULL; 1571 1572 __btree_path_get(path, intent); 1573 path->pos = pos; 1574 path->btree_id = btree_id; 1575 path->cached = cached; 1576 path->uptodate = BTREE_ITER_NEED_TRAVERSE; 1577 path->should_be_locked = false; 1578 path->level = level; 1579 path->locks_want = locks_want; 1580 path->nodes_locked = 0; 1581 for (i = 0; i < ARRAY_SIZE(path->l); i++) 1582 path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); 1583 #ifdef TRACK_PATH_ALLOCATED 1584 path->ip_allocated = ip; 1585 #endif 1586 trans->paths_sorted = false; 1587 } 1588 1589 if (!(flags & BTREE_ITER_NOPRESERVE)) 1590 path->preserve = true; 1591 1592 if (path->intent_ref) 1593 locks_want = max(locks_want, level + 1); 1594 1595 /* 1596 * If the path has locks_want greater than requested, we don't downgrade 1597 * it here - on transaction restart because btree node split needs to 1598 * upgrade locks, we might be putting/getting the iterator again. 1599 * Downgrading iterators only happens via bch2_trans_downgrade(), after 1600 * a successful transaction commit. 1601 */ 1602 1603 locks_want = min(locks_want, BTREE_MAX_DEPTH); 1604 if (locks_want > path->locks_want) 1605 bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); 1606 1607 return path; 1608 } 1609 1610 struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) 1611 { 1612 1613 struct btree_path_level *l = path_l(path); 1614 struct bkey_packed *_k; 1615 struct bkey_s_c k; 1616 1617 if (unlikely(!l->b)) 1618 return bkey_s_c_null; 1619 1620 EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); 1621 EBUG_ON(!btree_node_locked(path, path->level)); 1622 1623 if (!path->cached) { 1624 _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); 1625 k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; 1626 1627 EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos)); 1628 1629 if (!k.k || !bpos_eq(path->pos, k.k->p)) 1630 goto hole; 1631 } else { 1632 struct bkey_cached *ck = (void *) path->l[0].b; 1633 1634 EBUG_ON(ck && 1635 (path->btree_id != ck->key.btree_id || 1636 !bkey_eq(path->pos, ck->key.pos))); 1637 if (!ck || !ck->valid) 1638 return bkey_s_c_null; 1639 1640 *u = ck->k->k; 1641 k = bkey_i_to_s_c(ck->k); 1642 } 1643 1644 return k; 1645 hole: 1646 bkey_init(u); 1647 u->p = path->pos; 1648 return (struct bkey_s_c) { u, NULL }; 1649 } 1650 1651 /* Btree iterators: */ 1652 1653 int __must_check 1654 __bch2_btree_iter_traverse(struct btree_iter *iter) 1655 { 1656 return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); 1657 } 1658 1659 int __must_check 1660 bch2_btree_iter_traverse(struct btree_iter *iter) 1661 { 1662 int ret; 1663 1664 iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, 1665 btree_iter_search_key(iter), 1666 iter->flags & BTREE_ITER_INTENT, 1667 btree_iter_ip_allocated(iter)); 1668 1669 ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); 1670 if (ret) 1671 return ret; 1672 1673 btree_path_set_should_be_locked(iter->path); 1674 return 0; 1675 } 1676 1677 /* Iterate across nodes (leaf and interior nodes) */ 1678 1679 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) 1680 { 1681 struct btree_trans *trans = iter->trans; 1682 struct btree *b = NULL; 1683 int ret; 1684 1685 EBUG_ON(iter->path->cached); 1686 bch2_btree_iter_verify(iter); 1687 1688 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); 1689 if (ret) 1690 goto err; 1691 1692 b = btree_path_node(iter->path, iter->path->level); 1693 if (!b) 1694 goto out; 1695 1696 BUG_ON(bpos_lt(b->key.k.p, iter->pos)); 1697 1698 bkey_init(&iter->k); 1699 iter->k.p = iter->pos = b->key.k.p; 1700 1701 iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, 1702 iter->flags & BTREE_ITER_INTENT, 1703 btree_iter_ip_allocated(iter)); 1704 btree_path_set_should_be_locked(iter->path); 1705 out: 1706 bch2_btree_iter_verify_entry_exit(iter); 1707 bch2_btree_iter_verify(iter); 1708 1709 return b; 1710 err: 1711 b = ERR_PTR(ret); 1712 goto out; 1713 } 1714 1715 struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) 1716 { 1717 struct btree *b; 1718 1719 while (b = bch2_btree_iter_peek_node(iter), 1720 bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) 1721 bch2_trans_begin(iter->trans); 1722 1723 return b; 1724 } 1725 1726 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) 1727 { 1728 struct btree_trans *trans = iter->trans; 1729 struct btree_path *path = iter->path; 1730 struct btree *b = NULL; 1731 int ret; 1732 1733 bch2_trans_verify_not_in_restart(trans); 1734 EBUG_ON(iter->path->cached); 1735 bch2_btree_iter_verify(iter); 1736 1737 /* already at end? */ 1738 if (!btree_path_node(path, path->level)) 1739 return NULL; 1740 1741 /* got to end? */ 1742 if (!btree_path_node(path, path->level + 1)) { 1743 btree_path_set_level_up(trans, path); 1744 return NULL; 1745 } 1746 1747 if (!bch2_btree_node_relock(trans, path, path->level + 1)) { 1748 __bch2_btree_path_unlock(trans, path); 1749 path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); 1750 path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); 1751 btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); 1752 trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); 1753 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); 1754 goto err; 1755 } 1756 1757 b = btree_path_node(path, path->level + 1); 1758 1759 if (bpos_eq(iter->pos, b->key.k.p)) { 1760 __btree_path_set_level_up(trans, path, path->level++); 1761 } else { 1762 /* 1763 * Haven't gotten to the end of the parent node: go back down to 1764 * the next child node 1765 */ 1766 path = iter->path = 1767 bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), 1768 iter->flags & BTREE_ITER_INTENT, 1769 btree_iter_ip_allocated(iter)); 1770 1771 btree_path_set_level_down(trans, path, iter->min_depth); 1772 1773 ret = bch2_btree_path_traverse(trans, path, iter->flags); 1774 if (ret) 1775 goto err; 1776 1777 b = path->l[path->level].b; 1778 } 1779 1780 bkey_init(&iter->k); 1781 iter->k.p = iter->pos = b->key.k.p; 1782 1783 iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, 1784 iter->flags & BTREE_ITER_INTENT, 1785 btree_iter_ip_allocated(iter)); 1786 btree_path_set_should_be_locked(iter->path); 1787 BUG_ON(iter->path->uptodate); 1788 out: 1789 bch2_btree_iter_verify_entry_exit(iter); 1790 bch2_btree_iter_verify(iter); 1791 1792 return b; 1793 err: 1794 b = ERR_PTR(ret); 1795 goto out; 1796 } 1797 1798 /* Iterate across keys (in leaf nodes only) */ 1799 1800 inline bool bch2_btree_iter_advance(struct btree_iter *iter) 1801 { 1802 if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { 1803 struct bpos pos = iter->k.p; 1804 bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS 1805 ? bpos_eq(pos, SPOS_MAX) 1806 : bkey_eq(pos, SPOS_MAX)); 1807 1808 if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) 1809 pos = bkey_successor(iter, pos); 1810 bch2_btree_iter_set_pos(iter, pos); 1811 return ret; 1812 } else { 1813 if (!btree_path_node(iter->path, iter->path->level)) 1814 return true; 1815 1816 iter->advanced = true; 1817 return false; 1818 } 1819 } 1820 1821 inline bool bch2_btree_iter_rewind(struct btree_iter *iter) 1822 { 1823 struct bpos pos = bkey_start_pos(&iter->k); 1824 bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS 1825 ? bpos_eq(pos, POS_MIN) 1826 : bkey_eq(pos, POS_MIN)); 1827 1828 if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) 1829 pos = bkey_predecessor(iter, pos); 1830 bch2_btree_iter_set_pos(iter, pos); 1831 return ret; 1832 } 1833 1834 static noinline 1835 struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) 1836 { 1837 struct btree_insert_entry *i; 1838 struct bkey_i *ret = NULL; 1839 1840 trans_for_each_update(iter->trans, i) { 1841 if (i->btree_id < iter->btree_id) 1842 continue; 1843 if (i->btree_id > iter->btree_id) 1844 break; 1845 if (bpos_lt(i->k->k.p, iter->path->pos)) 1846 continue; 1847 if (i->key_cache_already_flushed) 1848 continue; 1849 if (!ret || bpos_lt(i->k->k.p, ret->k.p)) 1850 ret = i->k; 1851 } 1852 1853 return ret; 1854 } 1855 1856 static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) 1857 { 1858 return iter->flags & BTREE_ITER_WITH_UPDATES 1859 ? __bch2_btree_trans_peek_updates(iter) 1860 : NULL; 1861 } 1862 1863 static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, 1864 struct btree_iter *iter, 1865 struct bpos end_pos) 1866 { 1867 struct bkey_i *k; 1868 1869 if (bpos_lt(iter->path->pos, iter->journal_pos)) 1870 iter->journal_idx = 0; 1871 1872 k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 1873 iter->path->level, 1874 iter->path->pos, 1875 end_pos, 1876 &iter->journal_idx); 1877 1878 iter->journal_pos = k ? k->k.p : end_pos; 1879 return k; 1880 } 1881 1882 static noinline 1883 struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, 1884 struct btree_iter *iter) 1885 { 1886 struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); 1887 1888 if (k) { 1889 iter->k = k->k; 1890 return bkey_i_to_s_c(k); 1891 } else { 1892 return bkey_s_c_null; 1893 } 1894 } 1895 1896 static noinline 1897 struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, 1898 struct btree_iter *iter, 1899 struct bkey_s_c k) 1900 { 1901 struct bkey_i *next_journal = 1902 bch2_btree_journal_peek(trans, iter, 1903 k.k ? k.k->p : path_l(iter->path)->b->key.k.p); 1904 1905 if (next_journal) { 1906 iter->k = next_journal->k; 1907 k = bkey_i_to_s_c(next_journal); 1908 } 1909 1910 return k; 1911 } 1912 1913 /* 1914 * Checks btree key cache for key at iter->pos and returns it if present, or 1915 * bkey_s_c_null: 1916 */ 1917 static noinline 1918 struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) 1919 { 1920 struct btree_trans *trans = iter->trans; 1921 struct bch_fs *c = trans->c; 1922 struct bkey u; 1923 struct bkey_s_c k; 1924 int ret; 1925 1926 if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && 1927 bpos_eq(iter->pos, pos)) 1928 return bkey_s_c_null; 1929 1930 if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) 1931 return bkey_s_c_null; 1932 1933 if (!iter->key_cache_path) 1934 iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, 1935 iter->flags & BTREE_ITER_INTENT, 0, 1936 iter->flags|BTREE_ITER_CACHED| 1937 BTREE_ITER_CACHED_NOFILL, 1938 _THIS_IP_); 1939 1940 iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, 1941 iter->flags & BTREE_ITER_INTENT, 1942 btree_iter_ip_allocated(iter)); 1943 1944 ret = bch2_btree_path_traverse(trans, iter->key_cache_path, 1945 iter->flags|BTREE_ITER_CACHED) ?: 1946 bch2_btree_path_relock(trans, iter->path, _THIS_IP_); 1947 if (unlikely(ret)) 1948 return bkey_s_c_err(ret); 1949 1950 btree_path_set_should_be_locked(iter->key_cache_path); 1951 1952 k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); 1953 if (k.k && !bkey_err(k)) { 1954 iter->k = u; 1955 k.k = &iter->k; 1956 } 1957 return k; 1958 } 1959 1960 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) 1961 { 1962 struct btree_trans *trans = iter->trans; 1963 struct bkey_i *next_update; 1964 struct bkey_s_c k, k2; 1965 int ret; 1966 1967 EBUG_ON(iter->path->cached); 1968 bch2_btree_iter_verify(iter); 1969 1970 while (1) { 1971 struct btree_path_level *l; 1972 1973 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, 1974 iter->flags & BTREE_ITER_INTENT, 1975 btree_iter_ip_allocated(iter)); 1976 1977 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); 1978 if (unlikely(ret)) { 1979 /* ensure that iter->k is consistent with iter->pos: */ 1980 bch2_btree_iter_set_pos(iter, iter->pos); 1981 k = bkey_s_c_err(ret); 1982 goto out; 1983 } 1984 1985 l = path_l(iter->path); 1986 1987 if (unlikely(!l->b)) { 1988 /* No btree nodes at requested level: */ 1989 bch2_btree_iter_set_pos(iter, SPOS_MAX); 1990 k = bkey_s_c_null; 1991 goto out; 1992 } 1993 1994 btree_path_set_should_be_locked(iter->path); 1995 1996 k = btree_path_level_peek_all(trans->c, l, &iter->k); 1997 1998 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && 1999 k.k && 2000 (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { 2001 k = k2; 2002 ret = bkey_err(k); 2003 if (ret) { 2004 bch2_btree_iter_set_pos(iter, iter->pos); 2005 goto out; 2006 } 2007 } 2008 2009 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) 2010 k = btree_trans_peek_journal(trans, iter, k); 2011 2012 next_update = btree_trans_peek_updates(iter); 2013 2014 if (next_update && 2015 bpos_le(next_update->k.p, 2016 k.k ? k.k->p : l->b->key.k.p)) { 2017 iter->k = next_update->k; 2018 k = bkey_i_to_s_c(next_update); 2019 } 2020 2021 if (k.k && bkey_deleted(k.k)) { 2022 /* 2023 * If we've got a whiteout, and it's after the search 2024 * key, advance the search key to the whiteout instead 2025 * of just after the whiteout - it might be a btree 2026 * whiteout, with a real key at the same position, since 2027 * in the btree deleted keys sort before non deleted. 2028 */ 2029 search_key = !bpos_eq(search_key, k.k->p) 2030 ? k.k->p 2031 : bpos_successor(k.k->p); 2032 continue; 2033 } 2034 2035 if (likely(k.k)) { 2036 break; 2037 } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) { 2038 /* Advance to next leaf node: */ 2039 search_key = bpos_successor(l->b->key.k.p); 2040 } else { 2041 /* End of btree: */ 2042 bch2_btree_iter_set_pos(iter, SPOS_MAX); 2043 k = bkey_s_c_null; 2044 goto out; 2045 } 2046 } 2047 out: 2048 bch2_btree_iter_verify(iter); 2049 2050 return k; 2051 } 2052 2053 /** 2054 * bch2_btree_iter_peek_upto() - returns first key greater than or equal to 2055 * iterator's current position 2056 * @iter: iterator to peek from 2057 * @end: search limit: returns keys less than or equal to @end 2058 * 2059 * Returns: key if found, or an error extractable with bkey_err(). 2060 */ 2061 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) 2062 { 2063 struct btree_trans *trans = iter->trans; 2064 struct bpos search_key = btree_iter_search_key(iter); 2065 struct bkey_s_c k; 2066 struct bpos iter_pos; 2067 int ret; 2068 2069 EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); 2070 EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); 2071 2072 if (iter->update_path) { 2073 bch2_path_put_nokeep(trans, iter->update_path, 2074 iter->flags & BTREE_ITER_INTENT); 2075 iter->update_path = NULL; 2076 } 2077 2078 bch2_btree_iter_verify_entry_exit(iter); 2079 2080 while (1) { 2081 k = __bch2_btree_iter_peek(iter, search_key); 2082 if (unlikely(!k.k)) 2083 goto end; 2084 if (unlikely(bkey_err(k))) 2085 goto out_no_locked; 2086 2087 /* 2088 * iter->pos should be mononotically increasing, and always be 2089 * equal to the key we just returned - except extents can 2090 * straddle iter->pos: 2091 */ 2092 if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) 2093 iter_pos = k.k->p; 2094 else 2095 iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); 2096 2097 if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) 2098 ? bkey_gt(iter_pos, end) 2099 : bkey_ge(iter_pos, end))) 2100 goto end; 2101 2102 if (iter->update_path && 2103 !bkey_eq(iter->update_path->pos, k.k->p)) { 2104 bch2_path_put_nokeep(trans, iter->update_path, 2105 iter->flags & BTREE_ITER_INTENT); 2106 iter->update_path = NULL; 2107 } 2108 2109 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && 2110 (iter->flags & BTREE_ITER_INTENT) && 2111 !(iter->flags & BTREE_ITER_IS_EXTENTS) && 2112 !iter->update_path) { 2113 struct bpos pos = k.k->p; 2114 2115 if (pos.snapshot < iter->snapshot) { 2116 search_key = bpos_successor(k.k->p); 2117 continue; 2118 } 2119 2120 pos.snapshot = iter->snapshot; 2121 2122 /* 2123 * advance, same as on exit for iter->path, but only up 2124 * to snapshot 2125 */ 2126 __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); 2127 iter->update_path = iter->path; 2128 2129 iter->update_path = bch2_btree_path_set_pos(trans, 2130 iter->update_path, pos, 2131 iter->flags & BTREE_ITER_INTENT, 2132 _THIS_IP_); 2133 ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); 2134 if (unlikely(ret)) { 2135 k = bkey_s_c_err(ret); 2136 goto out_no_locked; 2137 } 2138 } 2139 2140 /* 2141 * We can never have a key in a leaf node at POS_MAX, so 2142 * we don't have to check these successor() calls: 2143 */ 2144 if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && 2145 !bch2_snapshot_is_ancestor(trans->c, 2146 iter->snapshot, 2147 k.k->p.snapshot)) { 2148 search_key = bpos_successor(k.k->p); 2149 continue; 2150 } 2151 2152 if (bkey_whiteout(k.k) && 2153 !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { 2154 search_key = bkey_successor(iter, k.k->p); 2155 continue; 2156 } 2157 2158 break; 2159 } 2160 2161 iter->pos = iter_pos; 2162 2163 iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, 2164 iter->flags & BTREE_ITER_INTENT, 2165 btree_iter_ip_allocated(iter)); 2166 2167 btree_path_set_should_be_locked(iter->path); 2168 out_no_locked: 2169 if (iter->update_path) { 2170 ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); 2171 if (unlikely(ret)) 2172 k = bkey_s_c_err(ret); 2173 else 2174 btree_path_set_should_be_locked(iter->update_path); 2175 } 2176 2177 if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) 2178 iter->pos.snapshot = iter->snapshot; 2179 2180 ret = bch2_btree_iter_verify_ret(iter, k); 2181 if (unlikely(ret)) { 2182 bch2_btree_iter_set_pos(iter, iter->pos); 2183 k = bkey_s_c_err(ret); 2184 } 2185 2186 bch2_btree_iter_verify_entry_exit(iter); 2187 2188 return k; 2189 end: 2190 bch2_btree_iter_set_pos(iter, end); 2191 k = bkey_s_c_null; 2192 goto out_no_locked; 2193 } 2194 2195 /** 2196 * bch2_btree_iter_peek_all_levels() - returns the first key greater than or 2197 * equal to iterator's current position, returning keys from every level of the 2198 * btree. For keys at different levels of the btree that compare equal, the key 2199 * from the lower level (leaf) is returned first. 2200 * @iter: iterator to peek from 2201 * 2202 * Returns: key if found, or an error extractable with bkey_err(). 2203 */ 2204 struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) 2205 { 2206 struct btree_trans *trans = iter->trans; 2207 struct bkey_s_c k; 2208 int ret; 2209 2210 EBUG_ON(iter->path->cached); 2211 bch2_btree_iter_verify(iter); 2212 BUG_ON(iter->path->level < iter->min_depth); 2213 BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); 2214 EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); 2215 2216 while (1) { 2217 iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, 2218 iter->flags & BTREE_ITER_INTENT, 2219 btree_iter_ip_allocated(iter)); 2220 2221 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); 2222 if (unlikely(ret)) { 2223 /* ensure that iter->k is consistent with iter->pos: */ 2224 bch2_btree_iter_set_pos(iter, iter->pos); 2225 k = bkey_s_c_err(ret); 2226 goto out_no_locked; 2227 } 2228 2229 /* Already at end? */ 2230 if (!btree_path_node(iter->path, iter->path->level)) { 2231 k = bkey_s_c_null; 2232 goto out_no_locked; 2233 } 2234 2235 k = btree_path_level_peek_all(trans->c, 2236 &iter->path->l[iter->path->level], &iter->k); 2237 2238 /* Check if we should go up to the parent node: */ 2239 if (!k.k || 2240 (iter->advanced && 2241 bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { 2242 iter->pos = path_l(iter->path)->b->key.k.p; 2243 btree_path_set_level_up(trans, iter->path); 2244 iter->advanced = false; 2245 continue; 2246 } 2247 2248 /* 2249 * Check if we should go back down to a leaf: 2250 * If we're not in a leaf node, we only return the current key 2251 * if it exactly matches iter->pos - otherwise we first have to 2252 * go back to the leaf: 2253 */ 2254 if (iter->path->level != iter->min_depth && 2255 (iter->advanced || 2256 !k.k || 2257 !bpos_eq(iter->pos, k.k->p))) { 2258 btree_path_set_level_down(trans, iter->path, iter->min_depth); 2259 iter->pos = bpos_successor(iter->pos); 2260 iter->advanced = false; 2261 continue; 2262 } 2263 2264 /* Check if we should go to the next key: */ 2265 if (iter->path->level == iter->min_depth && 2266 iter->advanced && 2267 k.k && 2268 bpos_eq(iter->pos, k.k->p)) { 2269 iter->pos = bpos_successor(iter->pos); 2270 iter->advanced = false; 2271 continue; 2272 } 2273 2274 if (iter->advanced && 2275 iter->path->level == iter->min_depth && 2276 !bpos_eq(k.k->p, iter->pos)) 2277 iter->advanced = false; 2278 2279 BUG_ON(iter->advanced); 2280 BUG_ON(!k.k); 2281 break; 2282 } 2283 2284 iter->pos = k.k->p; 2285 btree_path_set_should_be_locked(iter->path); 2286 out_no_locked: 2287 bch2_btree_iter_verify(iter); 2288 2289 return k; 2290 } 2291 2292 /** 2293 * bch2_btree_iter_next() - returns first key greater than iterator's current 2294 * position 2295 * @iter: iterator to peek from 2296 * 2297 * Returns: key if found, or an error extractable with bkey_err(). 2298 */ 2299 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) 2300 { 2301 if (!bch2_btree_iter_advance(iter)) 2302 return bkey_s_c_null; 2303 2304 return bch2_btree_iter_peek(iter); 2305 } 2306 2307 /** 2308 * bch2_btree_iter_peek_prev() - returns first key less than or equal to 2309 * iterator's current position 2310 * @iter: iterator to peek from 2311 * 2312 * Returns: key if found, or an error extractable with bkey_err(). 2313 */ 2314 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) 2315 { 2316 struct btree_trans *trans = iter->trans; 2317 struct bpos search_key = iter->pos; 2318 struct btree_path *saved_path = NULL; 2319 struct bkey_s_c k; 2320 struct bkey saved_k; 2321 const struct bch_val *saved_v; 2322 int ret; 2323 2324 EBUG_ON(iter->path->cached || iter->path->level); 2325 EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); 2326 2327 if (iter->flags & BTREE_ITER_WITH_JOURNAL) 2328 return bkey_s_c_err(-EIO); 2329 2330 bch2_btree_iter_verify(iter); 2331 bch2_btree_iter_verify_entry_exit(iter); 2332 2333 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) 2334 search_key.snapshot = U32_MAX; 2335 2336 while (1) { 2337 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, 2338 iter->flags & BTREE_ITER_INTENT, 2339 btree_iter_ip_allocated(iter)); 2340 2341 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); 2342 if (unlikely(ret)) { 2343 /* ensure that iter->k is consistent with iter->pos: */ 2344 bch2_btree_iter_set_pos(iter, iter->pos); 2345 k = bkey_s_c_err(ret); 2346 goto out_no_locked; 2347 } 2348 2349 k = btree_path_level_peek(trans, iter->path, 2350 &iter->path->l[0], &iter->k); 2351 if (!k.k || 2352 ((iter->flags & BTREE_ITER_IS_EXTENTS) 2353 ? bpos_ge(bkey_start_pos(k.k), search_key) 2354 : bpos_gt(k.k->p, search_key))) 2355 k = btree_path_level_prev(trans, iter->path, 2356 &iter->path->l[0], &iter->k); 2357 2358 if (likely(k.k)) { 2359 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { 2360 if (k.k->p.snapshot == iter->snapshot) 2361 goto got_key; 2362 2363 /* 2364 * If we have a saved candidate, and we're no 2365 * longer at the same _key_ (not pos), return 2366 * that candidate 2367 */ 2368 if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { 2369 bch2_path_put_nokeep(trans, iter->path, 2370 iter->flags & BTREE_ITER_INTENT); 2371 iter->path = saved_path; 2372 saved_path = NULL; 2373 iter->k = saved_k; 2374 k.v = saved_v; 2375 goto got_key; 2376 } 2377 2378 if (bch2_snapshot_is_ancestor(iter->trans->c, 2379 iter->snapshot, 2380 k.k->p.snapshot)) { 2381 if (saved_path) 2382 bch2_path_put_nokeep(trans, saved_path, 2383 iter->flags & BTREE_ITER_INTENT); 2384 saved_path = btree_path_clone(trans, iter->path, 2385 iter->flags & BTREE_ITER_INTENT); 2386 saved_k = *k.k; 2387 saved_v = k.v; 2388 } 2389 2390 search_key = bpos_predecessor(k.k->p); 2391 continue; 2392 } 2393 got_key: 2394 if (bkey_whiteout(k.k) && 2395 !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { 2396 search_key = bkey_predecessor(iter, k.k->p); 2397 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) 2398 search_key.snapshot = U32_MAX; 2399 continue; 2400 } 2401 2402 break; 2403 } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { 2404 /* Advance to previous leaf node: */ 2405 search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); 2406 } else { 2407 /* Start of btree: */ 2408 bch2_btree_iter_set_pos(iter, POS_MIN); 2409 k = bkey_s_c_null; 2410 goto out_no_locked; 2411 } 2412 } 2413 2414 EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); 2415 2416 /* Extents can straddle iter->pos: */ 2417 if (bkey_lt(k.k->p, iter->pos)) 2418 iter->pos = k.k->p; 2419 2420 if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) 2421 iter->pos.snapshot = iter->snapshot; 2422 2423 btree_path_set_should_be_locked(iter->path); 2424 out_no_locked: 2425 if (saved_path) 2426 bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); 2427 2428 bch2_btree_iter_verify_entry_exit(iter); 2429 bch2_btree_iter_verify(iter); 2430 2431 return k; 2432 } 2433 2434 /** 2435 * bch2_btree_iter_prev() - returns first key less than iterator's current 2436 * position 2437 * @iter: iterator to peek from 2438 * 2439 * Returns: key if found, or an error extractable with bkey_err(). 2440 */ 2441 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) 2442 { 2443 if (!bch2_btree_iter_rewind(iter)) 2444 return bkey_s_c_null; 2445 2446 return bch2_btree_iter_peek_prev(iter); 2447 } 2448 2449 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) 2450 { 2451 struct btree_trans *trans = iter->trans; 2452 struct bpos search_key; 2453 struct bkey_s_c k; 2454 int ret; 2455 2456 bch2_btree_iter_verify(iter); 2457 bch2_btree_iter_verify_entry_exit(iter); 2458 EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); 2459 EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); 2460 2461 /* extents can't span inode numbers: */ 2462 if ((iter->flags & BTREE_ITER_IS_EXTENTS) && 2463 unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { 2464 if (iter->pos.inode == KEY_INODE_MAX) 2465 return bkey_s_c_null; 2466 2467 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); 2468 } 2469 2470 search_key = btree_iter_search_key(iter); 2471 iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, 2472 iter->flags & BTREE_ITER_INTENT, 2473 btree_iter_ip_allocated(iter)); 2474 2475 ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); 2476 if (unlikely(ret)) { 2477 k = bkey_s_c_err(ret); 2478 goto out_no_locked; 2479 } 2480 2481 if ((iter->flags & BTREE_ITER_CACHED) || 2482 !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { 2483 struct bkey_i *next_update; 2484 2485 if ((next_update = btree_trans_peek_updates(iter)) && 2486 bpos_eq(next_update->k.p, iter->pos)) { 2487 iter->k = next_update->k; 2488 k = bkey_i_to_s_c(next_update); 2489 goto out; 2490 } 2491 2492 if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && 2493 (k = btree_trans_peek_slot_journal(trans, iter)).k) 2494 goto out; 2495 2496 if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && 2497 (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { 2498 if (!bkey_err(k)) 2499 iter->k = *k.k; 2500 /* We're not returning a key from iter->path: */ 2501 goto out_no_locked; 2502 } 2503 2504 k = bch2_btree_path_peek_slot(iter->path, &iter->k); 2505 if (unlikely(!k.k)) 2506 goto out_no_locked; 2507 } else { 2508 struct bpos next; 2509 struct bpos end = iter->pos; 2510 2511 if (iter->flags & BTREE_ITER_IS_EXTENTS) 2512 end.offset = U64_MAX; 2513 2514 EBUG_ON(iter->path->level); 2515 2516 if (iter->flags & BTREE_ITER_INTENT) { 2517 struct btree_iter iter2; 2518 2519 bch2_trans_copy_iter(&iter2, iter); 2520 k = bch2_btree_iter_peek_upto(&iter2, end); 2521 2522 if (k.k && !bkey_err(k)) { 2523 iter->k = iter2.k; 2524 k.k = &iter->k; 2525 } 2526 bch2_trans_iter_exit(trans, &iter2); 2527 } else { 2528 struct bpos pos = iter->pos; 2529 2530 k = bch2_btree_iter_peek_upto(iter, end); 2531 if (unlikely(bkey_err(k))) 2532 bch2_btree_iter_set_pos(iter, pos); 2533 else 2534 iter->pos = pos; 2535 } 2536 2537 if (unlikely(bkey_err(k))) 2538 goto out_no_locked; 2539 2540 next = k.k ? bkey_start_pos(k.k) : POS_MAX; 2541 2542 if (bkey_lt(iter->pos, next)) { 2543 bkey_init(&iter->k); 2544 iter->k.p = iter->pos; 2545 2546 if (iter->flags & BTREE_ITER_IS_EXTENTS) { 2547 bch2_key_resize(&iter->k, 2548 min_t(u64, KEY_SIZE_MAX, 2549 (next.inode == iter->pos.inode 2550 ? next.offset 2551 : KEY_OFFSET_MAX) - 2552 iter->pos.offset)); 2553 EBUG_ON(!iter->k.size); 2554 } 2555 2556 k = (struct bkey_s_c) { &iter->k, NULL }; 2557 } 2558 } 2559 out: 2560 btree_path_set_should_be_locked(iter->path); 2561 out_no_locked: 2562 bch2_btree_iter_verify_entry_exit(iter); 2563 bch2_btree_iter_verify(iter); 2564 ret = bch2_btree_iter_verify_ret(iter, k); 2565 if (unlikely(ret)) 2566 return bkey_s_c_err(ret); 2567 2568 return k; 2569 } 2570 2571 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) 2572 { 2573 if (!bch2_btree_iter_advance(iter)) 2574 return bkey_s_c_null; 2575 2576 return bch2_btree_iter_peek_slot(iter); 2577 } 2578 2579 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) 2580 { 2581 if (!bch2_btree_iter_rewind(iter)) 2582 return bkey_s_c_null; 2583 2584 return bch2_btree_iter_peek_slot(iter); 2585 } 2586 2587 struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) 2588 { 2589 struct bkey_s_c k; 2590 2591 while (btree_trans_too_many_iters(iter->trans) || 2592 (k = bch2_btree_iter_peek_type(iter, iter->flags), 2593 bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) 2594 bch2_trans_begin(iter->trans); 2595 2596 return k; 2597 } 2598 2599 /* new transactional stuff: */ 2600 2601 #ifdef CONFIG_BCACHEFS_DEBUG 2602 static void btree_trans_verify_sorted_refs(struct btree_trans *trans) 2603 { 2604 struct btree_path *path; 2605 unsigned i; 2606 2607 BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); 2608 2609 trans_for_each_path(trans, path) { 2610 BUG_ON(path->sorted_idx >= trans->nr_sorted); 2611 BUG_ON(trans->sorted[path->sorted_idx] != path->idx); 2612 } 2613 2614 for (i = 0; i < trans->nr_sorted; i++) { 2615 unsigned idx = trans->sorted[i]; 2616 2617 EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); 2618 BUG_ON(trans->paths[idx].sorted_idx != i); 2619 } 2620 } 2621 2622 static void btree_trans_verify_sorted(struct btree_trans *trans) 2623 { 2624 struct btree_path *path, *prev = NULL; 2625 unsigned i; 2626 2627 if (!bch2_debug_check_iterators) 2628 return; 2629 2630 trans_for_each_path_inorder(trans, path, i) { 2631 if (prev && btree_path_cmp(prev, path) > 0) { 2632 __bch2_dump_trans_paths_updates(trans, true); 2633 panic("trans paths out of order!\n"); 2634 } 2635 prev = path; 2636 } 2637 } 2638 #else 2639 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} 2640 static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} 2641 #endif 2642 2643 void __bch2_btree_trans_sort_paths(struct btree_trans *trans) 2644 { 2645 int i, l = 0, r = trans->nr_sorted, inc = 1; 2646 bool swapped; 2647 2648 btree_trans_verify_sorted_refs(trans); 2649 2650 if (trans->paths_sorted) 2651 goto out; 2652 2653 /* 2654 * Cocktail shaker sort: this is efficient because iterators will be 2655 * mostly sorted. 2656 */ 2657 do { 2658 swapped = false; 2659 2660 for (i = inc > 0 ? l : r - 2; 2661 i + 1 < r && i >= l; 2662 i += inc) { 2663 if (btree_path_cmp(trans->paths + trans->sorted[i], 2664 trans->paths + trans->sorted[i + 1]) > 0) { 2665 swap(trans->sorted[i], trans->sorted[i + 1]); 2666 trans->paths[trans->sorted[i]].sorted_idx = i; 2667 trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; 2668 swapped = true; 2669 } 2670 } 2671 2672 if (inc > 0) 2673 --r; 2674 else 2675 l++; 2676 inc = -inc; 2677 } while (swapped); 2678 2679 trans->paths_sorted = true; 2680 out: 2681 btree_trans_verify_sorted(trans); 2682 } 2683 2684 static inline void btree_path_list_remove(struct btree_trans *trans, 2685 struct btree_path *path) 2686 { 2687 unsigned i; 2688 2689 EBUG_ON(path->sorted_idx >= trans->nr_sorted); 2690 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 2691 trans->nr_sorted--; 2692 memmove_u64s_down_small(trans->sorted + path->sorted_idx, 2693 trans->sorted + path->sorted_idx + 1, 2694 DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); 2695 #else 2696 array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); 2697 #endif 2698 for (i = path->sorted_idx; i < trans->nr_sorted; i++) 2699 trans->paths[trans->sorted[i]].sorted_idx = i; 2700 2701 path->sorted_idx = U8_MAX; 2702 } 2703 2704 static inline void btree_path_list_add(struct btree_trans *trans, 2705 struct btree_path *pos, 2706 struct btree_path *path) 2707 { 2708 unsigned i; 2709 2710 path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; 2711 2712 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 2713 memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, 2714 trans->sorted + path->sorted_idx, 2715 DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); 2716 trans->nr_sorted++; 2717 trans->sorted[path->sorted_idx] = path->idx; 2718 #else 2719 array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); 2720 #endif 2721 2722 for (i = path->sorted_idx; i < trans->nr_sorted; i++) 2723 trans->paths[trans->sorted[i]].sorted_idx = i; 2724 2725 btree_trans_verify_sorted_refs(trans); 2726 } 2727 2728 void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) 2729 { 2730 if (iter->update_path) 2731 bch2_path_put_nokeep(trans, iter->update_path, 2732 iter->flags & BTREE_ITER_INTENT); 2733 if (iter->path) 2734 bch2_path_put(trans, iter->path, 2735 iter->flags & BTREE_ITER_INTENT); 2736 if (iter->key_cache_path) 2737 bch2_path_put(trans, iter->key_cache_path, 2738 iter->flags & BTREE_ITER_INTENT); 2739 iter->path = NULL; 2740 iter->update_path = NULL; 2741 iter->key_cache_path = NULL; 2742 } 2743 2744 void bch2_trans_iter_init_outlined(struct btree_trans *trans, 2745 struct btree_iter *iter, 2746 enum btree_id btree_id, struct bpos pos, 2747 unsigned flags) 2748 { 2749 bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, 2750 bch2_btree_iter_flags(trans, btree_id, flags), 2751 _RET_IP_); 2752 } 2753 2754 void bch2_trans_node_iter_init(struct btree_trans *trans, 2755 struct btree_iter *iter, 2756 enum btree_id btree_id, 2757 struct bpos pos, 2758 unsigned locks_want, 2759 unsigned depth, 2760 unsigned flags) 2761 { 2762 flags |= BTREE_ITER_NOT_EXTENTS; 2763 flags |= __BTREE_ITER_ALL_SNAPSHOTS; 2764 flags |= BTREE_ITER_ALL_SNAPSHOTS; 2765 2766 bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, 2767 __bch2_btree_iter_flags(trans, btree_id, flags), 2768 _RET_IP_); 2769 2770 iter->min_depth = depth; 2771 2772 BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); 2773 BUG_ON(iter->path->level != depth); 2774 BUG_ON(iter->min_depth != depth); 2775 } 2776 2777 void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) 2778 { 2779 *dst = *src; 2780 if (src->path) 2781 __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); 2782 if (src->update_path) 2783 __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); 2784 dst->key_cache_path = NULL; 2785 } 2786 2787 void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) 2788 { 2789 unsigned new_top = trans->mem_top + size; 2790 size_t old_bytes = trans->mem_bytes; 2791 size_t new_bytes = roundup_pow_of_two(new_top); 2792 int ret; 2793 void *new_mem; 2794 void *p; 2795 2796 trans->mem_max = max(trans->mem_max, new_top); 2797 2798 WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); 2799 2800 new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); 2801 if (unlikely(!new_mem)) { 2802 bch2_trans_unlock(trans); 2803 2804 new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); 2805 if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { 2806 new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); 2807 new_bytes = BTREE_TRANS_MEM_MAX; 2808 kfree(trans->mem); 2809 } 2810 2811 if (!new_mem) 2812 return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); 2813 2814 trans->mem = new_mem; 2815 trans->mem_bytes = new_bytes; 2816 2817 ret = bch2_trans_relock(trans); 2818 if (ret) 2819 return ERR_PTR(ret); 2820 } 2821 2822 trans->mem = new_mem; 2823 trans->mem_bytes = new_bytes; 2824 2825 if (old_bytes) { 2826 trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); 2827 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); 2828 } 2829 2830 p = trans->mem + trans->mem_top; 2831 trans->mem_top += size; 2832 memset(p, 0, size); 2833 return p; 2834 } 2835 2836 static inline void check_srcu_held_too_long(struct btree_trans *trans) 2837 { 2838 WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10), 2839 "btree trans held srcu lock (delaying memory reclaim) for %lu seconds", 2840 (jiffies - trans->srcu_lock_time) / HZ); 2841 } 2842 2843 void bch2_trans_srcu_unlock(struct btree_trans *trans) 2844 { 2845 if (trans->srcu_held) { 2846 struct bch_fs *c = trans->c; 2847 struct btree_path *path; 2848 2849 trans_for_each_path(trans, path) 2850 if (path->cached && !btree_node_locked(path, 0)) 2851 path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); 2852 2853 check_srcu_held_too_long(trans); 2854 srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); 2855 trans->srcu_held = false; 2856 } 2857 } 2858 2859 void bch2_trans_srcu_lock(struct btree_trans *trans) 2860 { 2861 if (!trans->srcu_held) { 2862 trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); 2863 trans->srcu_lock_time = jiffies; 2864 trans->srcu_held = true; 2865 } 2866 } 2867 2868 /** 2869 * bch2_trans_begin() - reset a transaction after a interrupted attempt 2870 * @trans: transaction to reset 2871 * 2872 * Returns: current restart counter, to be used with trans_was_restarted() 2873 * 2874 * While iterating over nodes or updating nodes a attempt to lock a btree node 2875 * may return BCH_ERR_transaction_restart when the trylock fails. When this 2876 * occurs bch2_trans_begin() should be called and the transaction retried. 2877 */ 2878 u32 bch2_trans_begin(struct btree_trans *trans) 2879 { 2880 struct btree_path *path; 2881 u64 now; 2882 2883 bch2_trans_reset_updates(trans); 2884 2885 trans->restart_count++; 2886 trans->mem_top = 0; 2887 2888 trans_for_each_path(trans, path) { 2889 path->should_be_locked = false; 2890 2891 /* 2892 * If the transaction wasn't restarted, we're presuming to be 2893 * doing something new: dont keep iterators excpt the ones that 2894 * are in use - except for the subvolumes btree: 2895 */ 2896 if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) 2897 path->preserve = false; 2898 2899 /* 2900 * XXX: we probably shouldn't be doing this if the transaction 2901 * was restarted, but currently we still overflow transaction 2902 * iterators if we do that 2903 */ 2904 if (!path->ref && !path->preserve) 2905 __bch2_path_free(trans, path); 2906 else 2907 path->preserve = false; 2908 } 2909 2910 now = local_clock(); 2911 if (!trans->restarted && 2912 (need_resched() || 2913 now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { 2914 drop_locks_do(trans, (cond_resched(), 0)); 2915 now = local_clock(); 2916 } 2917 trans->last_begin_time = now; 2918 2919 if (unlikely(trans->srcu_held && 2920 time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) 2921 bch2_trans_srcu_unlock(trans); 2922 2923 trans->last_begin_ip = _RET_IP_; 2924 if (trans->restarted) { 2925 bch2_btree_path_traverse_all(trans); 2926 trans->notrace_relock_fail = false; 2927 } 2928 2929 return trans->restart_count; 2930 } 2931 2932 static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) 2933 { 2934 struct btree_trans *trans; 2935 2936 if (IS_ENABLED(__KERNEL__)) { 2937 trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); 2938 if (trans) 2939 return trans; 2940 } 2941 2942 trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); 2943 /* 2944 * paths need to be zeroed, bch2_check_for_deadlock looks at 2945 * paths in other threads 2946 */ 2947 memset(&trans->paths, 0, sizeof(trans->paths)); 2948 return trans; 2949 } 2950 2951 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; 2952 2953 unsigned bch2_trans_get_fn_idx(const char *fn) 2954 { 2955 unsigned i; 2956 2957 for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) 2958 if (!bch2_btree_transaction_fns[i] || 2959 bch2_btree_transaction_fns[i] == fn) { 2960 bch2_btree_transaction_fns[i] = fn; 2961 return i; 2962 } 2963 2964 pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); 2965 return i; 2966 } 2967 2968 struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) 2969 __acquires(&c->btree_trans_barrier) 2970 { 2971 struct btree_trans *trans; 2972 struct btree_transaction_stats *s; 2973 2974 trans = bch2_trans_alloc(c); 2975 2976 memset(trans, 0, sizeof(*trans)); 2977 trans->c = c; 2978 trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) 2979 ? bch2_btree_transaction_fns[fn_idx] : NULL; 2980 trans->last_begin_time = local_clock(); 2981 trans->fn_idx = fn_idx; 2982 trans->locking_wait.task = current; 2983 trans->journal_replay_not_finished = 2984 !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); 2985 closure_init_stack(&trans->ref); 2986 2987 s = btree_trans_stats(trans); 2988 if (s && s->max_mem) { 2989 unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); 2990 2991 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); 2992 2993 if (!unlikely(trans->mem)) { 2994 trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); 2995 trans->mem_bytes = BTREE_TRANS_MEM_MAX; 2996 } else { 2997 trans->mem_bytes = expected_mem_bytes; 2998 } 2999 } 3000 3001 if (s) { 3002 trans->nr_max_paths = s->nr_max_paths; 3003 trans->wb_updates_size = s->wb_updates_size; 3004 } 3005 3006 trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 3007 trans->srcu_lock_time = jiffies; 3008 trans->srcu_held = true; 3009 3010 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { 3011 struct btree_trans *pos; 3012 3013 seqmutex_lock(&c->btree_trans_lock); 3014 list_for_each_entry(pos, &c->btree_trans_list, list) { 3015 /* 3016 * We'd much prefer to be stricter here and completely 3017 * disallow multiple btree_trans in the same thread - 3018 * but the data move path calls bch2_write when we 3019 * already have a btree_trans initialized. 3020 */ 3021 BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && 3022 bch2_trans_locked(pos)); 3023 3024 if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { 3025 list_add_tail(&trans->list, &pos->list); 3026 goto list_add_done; 3027 } 3028 } 3029 list_add_tail(&trans->list, &c->btree_trans_list); 3030 list_add_done: 3031 seqmutex_unlock(&c->btree_trans_lock); 3032 } 3033 3034 return trans; 3035 } 3036 3037 static void check_btree_paths_leaked(struct btree_trans *trans) 3038 { 3039 #ifdef CONFIG_BCACHEFS_DEBUG 3040 struct bch_fs *c = trans->c; 3041 struct btree_path *path; 3042 3043 trans_for_each_path(trans, path) 3044 if (path->ref) 3045 goto leaked; 3046 return; 3047 leaked: 3048 bch_err(c, "btree paths leaked from %s!", trans->fn); 3049 trans_for_each_path(trans, path) 3050 if (path->ref) 3051 printk(KERN_ERR " btree %s %pS\n", 3052 bch2_btree_id_str(path->btree_id), 3053 (void *) path->ip_allocated); 3054 /* Be noisy about this: */ 3055 bch2_fatal_error(c); 3056 #endif 3057 } 3058 3059 void bch2_trans_put(struct btree_trans *trans) 3060 __releases(&c->btree_trans_barrier) 3061 { 3062 struct btree_insert_entry *i; 3063 struct bch_fs *c = trans->c; 3064 struct btree_transaction_stats *s = btree_trans_stats(trans); 3065 3066 bch2_trans_unlock(trans); 3067 3068 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { 3069 seqmutex_lock(&c->btree_trans_lock); 3070 list_del(&trans->list); 3071 seqmutex_unlock(&c->btree_trans_lock); 3072 } 3073 3074 closure_sync(&trans->ref); 3075 3076 if (s) 3077 s->max_mem = max(s->max_mem, trans->mem_max); 3078 3079 trans_for_each_update(trans, i) 3080 __btree_path_put(i->path, true); 3081 trans->nr_updates = 0; 3082 3083 check_btree_paths_leaked(trans); 3084 3085 if (trans->srcu_held) { 3086 check_srcu_held_too_long(trans); 3087 srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); 3088 } 3089 3090 bch2_journal_preres_put(&c->journal, &trans->journal_preres); 3091 3092 kfree(trans->extra_journal_entries.data); 3093 3094 if (trans->fs_usage_deltas) { 3095 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == 3096 REPLICAS_DELTA_LIST_MAX) 3097 mempool_free(trans->fs_usage_deltas, 3098 &c->replicas_delta_pool); 3099 else 3100 kfree(trans->fs_usage_deltas); 3101 } 3102 3103 if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) 3104 mempool_free(trans->mem, &c->btree_trans_mem_pool); 3105 else 3106 kfree(trans->mem); 3107 3108 /* Userspace doesn't have a real percpu implementation: */ 3109 if (IS_ENABLED(__KERNEL__)) 3110 trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); 3111 if (trans) 3112 mempool_free(trans, &c->btree_trans_pool); 3113 } 3114 3115 static void __maybe_unused 3116 bch2_btree_bkey_cached_common_to_text(struct printbuf *out, 3117 struct btree_bkey_cached_common *b) 3118 { 3119 struct six_lock_count c = six_lock_counts(&b->lock); 3120 struct task_struct *owner; 3121 pid_t pid; 3122 3123 rcu_read_lock(); 3124 owner = READ_ONCE(b->lock.owner); 3125 pid = owner ? owner->pid : 0; 3126 rcu_read_unlock(); 3127 3128 prt_tab(out); 3129 prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', 3130 b->level, bch2_btree_id_str(b->btree_id)); 3131 bch2_bpos_to_text(out, btree_node_pos(b)); 3132 3133 prt_tab(out); 3134 prt_printf(out, " locks %u:%u:%u held by pid %u", 3135 c.n[0], c.n[1], c.n[2], pid); 3136 } 3137 3138 void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) 3139 { 3140 struct btree_path *path; 3141 struct btree_bkey_cached_common *b; 3142 static char lock_types[] = { 'r', 'i', 'w' }; 3143 unsigned l, idx; 3144 3145 if (!out->nr_tabstops) { 3146 printbuf_tabstop_push(out, 16); 3147 printbuf_tabstop_push(out, 32); 3148 } 3149 3150 prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); 3151 3152 trans_for_each_path_safe(trans, path, idx) { 3153 if (!path->nodes_locked) 3154 continue; 3155 3156 prt_printf(out, " path %u %c l=%u %s:", 3157 path->idx, 3158 path->cached ? 'c' : 'b', 3159 path->level, 3160 bch2_btree_id_str(path->btree_id)); 3161 bch2_bpos_to_text(out, path->pos); 3162 prt_newline(out); 3163 3164 for (l = 0; l < BTREE_MAX_DEPTH; l++) { 3165 if (btree_node_locked(path, l) && 3166 !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { 3167 prt_printf(out, " %c l=%u ", 3168 lock_types[btree_node_locked_type(path, l)], l); 3169 bch2_btree_bkey_cached_common_to_text(out, b); 3170 prt_newline(out); 3171 } 3172 } 3173 } 3174 3175 b = READ_ONCE(trans->locking); 3176 if (b) { 3177 prt_printf(out, " blocked for %lluus on", 3178 div_u64(local_clock() - trans->locking_wait.start_time, 3179 1000)); 3180 prt_newline(out); 3181 prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); 3182 bch2_btree_bkey_cached_common_to_text(out, b); 3183 prt_newline(out); 3184 } 3185 } 3186 3187 void bch2_fs_btree_iter_exit(struct bch_fs *c) 3188 { 3189 struct btree_transaction_stats *s; 3190 struct btree_trans *trans; 3191 int cpu; 3192 3193 trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); 3194 if (trans) 3195 panic("%s leaked btree_trans\n", trans->fn); 3196 3197 if (c->btree_trans_bufs) 3198 for_each_possible_cpu(cpu) 3199 kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); 3200 free_percpu(c->btree_trans_bufs); 3201 3202 for (s = c->btree_transaction_stats; 3203 s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); 3204 s++) { 3205 kfree(s->max_paths_text); 3206 bch2_time_stats_exit(&s->lock_hold_times); 3207 } 3208 3209 if (c->btree_trans_barrier_initialized) 3210 cleanup_srcu_struct(&c->btree_trans_barrier); 3211 mempool_exit(&c->btree_trans_mem_pool); 3212 mempool_exit(&c->btree_trans_pool); 3213 } 3214 3215 int bch2_fs_btree_iter_init(struct bch_fs *c) 3216 { 3217 struct btree_transaction_stats *s; 3218 int ret; 3219 3220 for (s = c->btree_transaction_stats; 3221 s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); 3222 s++) { 3223 bch2_time_stats_init(&s->lock_hold_times); 3224 mutex_init(&s->lock); 3225 } 3226 3227 INIT_LIST_HEAD(&c->btree_trans_list); 3228 seqmutex_init(&c->btree_trans_lock); 3229 3230 c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); 3231 if (!c->btree_trans_bufs) 3232 return -ENOMEM; 3233 3234 ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1, 3235 sizeof(struct btree_trans)) ?: 3236 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, 3237 BTREE_TRANS_MEM_MAX) ?: 3238 init_srcu_struct(&c->btree_trans_barrier); 3239 if (!ret) 3240 c->btree_trans_barrier_initialized = true; 3241 return ret; 3242 } 3243