1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "alloc_foreground.h" 5 #include "bkey_buf.h" 6 #include "bkey_methods.h" 7 #include "btree_cache.h" 8 #include "btree_gc.h" 9 #include "btree_journal_iter.h" 10 #include "btree_update.h" 11 #include "btree_update_interior.h" 12 #include "btree_io.h" 13 #include "btree_iter.h" 14 #include "btree_locking.h" 15 #include "buckets.h" 16 #include "clock.h" 17 #include "error.h" 18 #include "extents.h" 19 #include "io_write.h" 20 #include "journal.h" 21 #include "journal_reclaim.h" 22 #include "keylist.h" 23 #include "recovery_passes.h" 24 #include "replicas.h" 25 #include "sb-members.h" 26 #include "super-io.h" 27 #include "trace.h" 28 29 #include <linux/random.h> 30 31 static const char * const bch2_btree_update_modes[] = { 32 #define x(t) #t, 33 BTREE_UPDATE_MODES() 34 #undef x 35 NULL 36 }; 37 38 static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, 39 btree_path_idx_t, struct btree *, struct keylist *); 40 static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); 41 42 /* 43 * Verify that child nodes correctly span parent node's range: 44 */ 45 int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) 46 { 47 struct bch_fs *c = trans->c; 48 struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2 49 ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key 50 : b->data->min_key; 51 struct btree_and_journal_iter iter; 52 struct bkey_s_c k; 53 struct printbuf buf = PRINTBUF; 54 struct bkey_buf prev; 55 int ret = 0; 56 57 BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && 58 !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, 59 b->data->min_key)); 60 61 bch2_bkey_buf_init(&prev); 62 bkey_init(&prev.k->k); 63 bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); 64 65 if (b == btree_node_root(c, b)) { 66 if (!bpos_eq(b->data->min_key, POS_MIN)) { 67 printbuf_reset(&buf); 68 bch2_bpos_to_text(&buf, b->data->min_key); 69 log_fsck_err(trans, btree_root_bad_min_key, 70 "btree root with incorrect min_key: %s", buf.buf); 71 goto topology_repair; 72 } 73 74 if (!bpos_eq(b->data->max_key, SPOS_MAX)) { 75 printbuf_reset(&buf); 76 bch2_bpos_to_text(&buf, b->data->max_key); 77 log_fsck_err(trans, btree_root_bad_max_key, 78 "btree root with incorrect max_key: %s", buf.buf); 79 goto topology_repair; 80 } 81 } 82 83 if (!b->c.level) 84 goto out; 85 86 while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { 87 if (k.k->type != KEY_TYPE_btree_ptr_v2) 88 goto out; 89 90 struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); 91 92 struct bpos expected_min = bkey_deleted(&prev.k->k) 93 ? 
node_min 94 : bpos_successor(prev.k->k.p); 95 96 if (!bpos_eq(expected_min, bp.v->min_key)) { 97 bch2_topology_error(c); 98 99 printbuf_reset(&buf); 100 prt_str(&buf, "end of prev node doesn't match start of next node\n in "); 101 bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); 102 prt_str(&buf, " node "); 103 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 104 prt_str(&buf, "\n prev "); 105 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); 106 prt_str(&buf, "\n next "); 107 bch2_bkey_val_to_text(&buf, c, k); 108 109 log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); 110 goto topology_repair; 111 } 112 113 bch2_bkey_buf_reassemble(&prev, c, k); 114 bch2_btree_and_journal_iter_advance(&iter); 115 } 116 117 if (bkey_deleted(&prev.k->k)) { 118 bch2_topology_error(c); 119 120 printbuf_reset(&buf); 121 prt_str(&buf, "empty interior node\n in "); 122 bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); 123 prt_str(&buf, " node "); 124 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 125 126 log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); 127 goto topology_repair; 128 } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { 129 bch2_topology_error(c); 130 131 printbuf_reset(&buf); 132 prt_str(&buf, "last child node doesn't end at end of parent node\n in "); 133 bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); 134 prt_str(&buf, " node "); 135 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); 136 prt_str(&buf, "\n last key "); 137 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); 138 139 log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); 140 goto topology_repair; 141 } 142 out: 143 fsck_err: 144 bch2_btree_and_journal_iter_exit(&iter); 145 bch2_bkey_buf_exit(&prev, c); 146 printbuf_exit(&buf); 147 return ret; 148 topology_repair: 149 ret = bch2_topology_error(c); 150 goto out; 151 } 152 153 /* Calculate ideal packed bkey format for new btree nodes: */ 154 155 static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) 156 { 157 struct bkey_packed *k; 158 struct bkey uk; 159 160 for_each_bset(b, t) 161 bset_tree_for_each_key(b, t, k) 162 if (!bkey_deleted(k)) { 163 uk = bkey_unpack_key(b, k); 164 bch2_bkey_format_add_key(s, &uk); 165 } 166 } 167 168 static struct bkey_format bch2_btree_calc_format(struct btree *b) 169 { 170 struct bkey_format_state s; 171 172 bch2_bkey_format_init(&s); 173 bch2_bkey_format_add_pos(&s, b->data->min_key); 174 bch2_bkey_format_add_pos(&s, b->data->max_key); 175 __bch2_btree_calc_format(&s, b); 176 177 return bch2_bkey_format_done(&s); 178 } 179 180 static size_t btree_node_u64s_with_format(struct btree_nr_keys nr, 181 struct bkey_format *old_f, 182 struct bkey_format *new_f) 183 { 184 /* stupid integer promotion rules */ 185 ssize_t delta = 186 (((int) new_f->key_u64s - old_f->key_u64s) * 187 (int) nr.packed_keys) + 188 (((int) new_f->key_u64s - BKEY_U64s) * 189 (int) nr.unpacked_keys); 190 191 BUG_ON(delta + nr.live_u64s < 0); 192 193 return nr.live_u64s + delta; 194 } 195 196 /** 197 * bch2_btree_node_format_fits - check if we could rewrite node with a new format 198 * 199 * @c: filesystem handle 200 * @b: btree node to rewrite 201 * @nr: number of keys for new node (i.e. b->nr) 202 * @new_f: bkey format to translate keys to 203 * 204 * Returns: true if all re-packed keys will be able to fit in a new node. 205 * 206 * Assumes all keys will successfully pack with the new format. 
207 */ 208 static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, 209 struct btree_nr_keys nr, 210 struct bkey_format *new_f) 211 { 212 size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); 213 214 return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b); 215 } 216 217 /* Btree node freeing/allocation: */ 218 219 static void __btree_node_free(struct btree_trans *trans, struct btree *b) 220 { 221 struct bch_fs *c = trans->c; 222 223 trace_and_count(c, btree_node_free, trans, b); 224 225 BUG_ON(btree_node_write_blocked(b)); 226 BUG_ON(btree_node_dirty(b)); 227 BUG_ON(btree_node_need_write(b)); 228 BUG_ON(b == btree_node_root(c, b)); 229 BUG_ON(b->ob.nr); 230 BUG_ON(!list_empty(&b->write_blocked)); 231 BUG_ON(b->will_make_reachable); 232 233 clear_btree_node_noevict(b); 234 } 235 236 static void bch2_btree_node_free_inmem(struct btree_trans *trans, 237 struct btree_path *path, 238 struct btree *b) 239 { 240 struct bch_fs *c = trans->c; 241 242 bch2_btree_node_lock_write_nofail(trans, path, &b->c); 243 244 __btree_node_free(trans, b); 245 246 mutex_lock(&c->btree_cache.lock); 247 bch2_btree_node_hash_remove(&c->btree_cache, b); 248 mutex_unlock(&c->btree_cache.lock); 249 250 six_unlock_write(&b->c.lock); 251 mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); 252 253 bch2_trans_node_drop(trans, b); 254 } 255 256 static void bch2_btree_node_free_never_used(struct btree_update *as, 257 struct btree_trans *trans, 258 struct btree *b) 259 { 260 struct bch_fs *c = as->c; 261 struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; 262 263 BUG_ON(!list_empty(&b->write_blocked)); 264 BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); 265 266 b->will_make_reachable = 0; 267 closure_put(&as->cl); 268 269 clear_btree_node_will_make_reachable(b); 270 clear_btree_node_accessed(b); 271 clear_btree_node_dirty_acct(c, b); 272 clear_btree_node_need_write(b); 273 274 mutex_lock(&c->btree_cache.lock); 275 __bch2_btree_node_hash_remove(&c->btree_cache, b); 276 mutex_unlock(&c->btree_cache.lock); 277 278 BUG_ON(p->nr >= ARRAY_SIZE(p->b)); 279 p->b[p->nr++] = b; 280 281 six_unlock_intent(&b->c.lock); 282 283 bch2_trans_node_drop(trans, b); 284 } 285 286 static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, 287 struct disk_reservation *res, 288 struct closure *cl, 289 bool interior_node, 290 unsigned flags) 291 { 292 struct bch_fs *c = trans->c; 293 struct write_point *wp; 294 struct btree *b; 295 BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; 296 struct open_buckets obs = { .nr = 0 }; 297 struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; 298 enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; 299 unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim 300 ? 
BTREE_NODE_RESERVE 301 : 0; 302 int ret; 303 304 b = bch2_btree_node_mem_alloc(trans, interior_node); 305 if (IS_ERR(b)) 306 return b; 307 308 BUG_ON(b->ob.nr); 309 310 mutex_lock(&c->btree_reserve_cache_lock); 311 if (c->btree_reserve_cache_nr > nr_reserve) { 312 struct btree_alloc *a = 313 &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; 314 315 obs = a->ob; 316 bkey_copy(&tmp.k, &a->k); 317 mutex_unlock(&c->btree_reserve_cache_lock); 318 goto out; 319 } 320 mutex_unlock(&c->btree_reserve_cache_lock); 321 retry: 322 ret = bch2_alloc_sectors_start_trans(trans, 323 c->opts.metadata_target ?: 324 c->opts.foreground_target, 325 0, 326 writepoint_ptr(&c->btree_write_point), 327 &devs_have, 328 res->nr_replicas, 329 min(res->nr_replicas, 330 c->opts.metadata_replicas_required), 331 watermark, 0, cl, &wp); 332 if (unlikely(ret)) 333 goto err; 334 335 if (wp->sectors_free < btree_sectors(c)) { 336 struct open_bucket *ob; 337 unsigned i; 338 339 open_bucket_for_each(c, &wp->ptrs, ob, i) 340 if (ob->sectors_free < btree_sectors(c)) 341 ob->sectors_free = 0; 342 343 bch2_alloc_sectors_done(c, wp); 344 goto retry; 345 } 346 347 bkey_btree_ptr_v2_init(&tmp.k); 348 bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); 349 350 bch2_open_bucket_get(c, wp, &obs); 351 bch2_alloc_sectors_done(c, wp); 352 out: 353 bkey_copy(&b->key, &tmp.k); 354 b->ob = obs; 355 six_unlock_write(&b->c.lock); 356 six_unlock_intent(&b->c.lock); 357 358 return b; 359 err: 360 bch2_btree_node_to_freelist(c, b); 361 return ERR_PTR(ret); 362 } 363 364 static struct btree *bch2_btree_node_alloc(struct btree_update *as, 365 struct btree_trans *trans, 366 unsigned level) 367 { 368 struct bch_fs *c = as->c; 369 struct btree *b; 370 struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; 371 int ret; 372 373 BUG_ON(level >= BTREE_MAX_DEPTH); 374 BUG_ON(!p->nr); 375 376 b = p->b[--p->nr]; 377 378 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); 379 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); 380 381 set_btree_node_accessed(b); 382 set_btree_node_dirty_acct(c, b); 383 set_btree_node_need_write(b); 384 385 bch2_bset_init_first(b, &b->data->keys); 386 b->c.level = level; 387 b->c.btree_id = as->btree_id; 388 b->version_ondisk = c->sb.version; 389 390 memset(&b->nr, 0, sizeof(b->nr)); 391 b->data->magic = cpu_to_le64(bset_magic(c)); 392 memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); 393 b->data->flags = 0; 394 SET_BTREE_NODE_ID(b->data, as->btree_id); 395 SET_BTREE_NODE_LEVEL(b->data, level); 396 397 if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { 398 struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); 399 400 bp->v.mem_ptr = 0; 401 bp->v.seq = b->data->keys.seq; 402 bp->v.sectors_written = 0; 403 } 404 405 SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); 406 407 bch2_btree_build_aux_trees(b); 408 409 ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); 410 BUG_ON(ret); 411 412 trace_and_count(c, btree_node_alloc, trans, b); 413 bch2_increment_clock(c, btree_sectors(c), WRITE); 414 return b; 415 } 416 417 static void btree_set_min(struct btree *b, struct bpos pos) 418 { 419 if (b->key.k.type == KEY_TYPE_btree_ptr_v2) 420 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; 421 b->data->min_key = pos; 422 } 423 424 static void btree_set_max(struct btree *b, struct bpos pos) 425 { 426 b->key.k.p = pos; 427 b->data->max_key = pos; 428 } 429 430 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, 431 struct btree_trans 
*trans, 432 struct btree *b) 433 { 434 struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level); 435 struct bkey_format format = bch2_btree_calc_format(b); 436 437 /* 438 * The keys might expand with the new format - if they wouldn't fit in 439 * the btree node anymore, use the old format for now: 440 */ 441 if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format)) 442 format = b->format; 443 444 SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); 445 446 btree_set_min(n, b->data->min_key); 447 btree_set_max(n, b->data->max_key); 448 449 n->data->format = format; 450 btree_node_set_format(n, format); 451 452 bch2_btree_sort_into(as->c, n, b); 453 454 btree_node_reset_sib_u64s(n); 455 return n; 456 } 457 458 static struct btree *__btree_root_alloc(struct btree_update *as, 459 struct btree_trans *trans, unsigned level) 460 { 461 struct btree *b = bch2_btree_node_alloc(as, trans, level); 462 463 btree_set_min(b, POS_MIN); 464 btree_set_max(b, SPOS_MAX); 465 b->data->format = bch2_btree_calc_format(b); 466 467 btree_node_set_format(b, b->data->format); 468 bch2_btree_build_aux_trees(b); 469 470 return b; 471 } 472 473 static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) 474 { 475 struct bch_fs *c = as->c; 476 struct prealloc_nodes *p; 477 478 for (p = as->prealloc_nodes; 479 p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); 480 p++) { 481 while (p->nr) { 482 struct btree *b = p->b[--p->nr]; 483 484 mutex_lock(&c->btree_reserve_cache_lock); 485 486 if (c->btree_reserve_cache_nr < 487 ARRAY_SIZE(c->btree_reserve_cache)) { 488 struct btree_alloc *a = 489 &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; 490 491 a->ob = b->ob; 492 b->ob.nr = 0; 493 bkey_copy(&a->k, &b->key); 494 } else { 495 bch2_open_buckets_put(c, &b->ob); 496 } 497 498 mutex_unlock(&c->btree_reserve_cache_lock); 499 500 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); 501 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); 502 __btree_node_free(trans, b); 503 bch2_btree_node_to_freelist(c, b); 504 } 505 } 506 } 507 508 static int bch2_btree_reserve_get(struct btree_trans *trans, 509 struct btree_update *as, 510 unsigned nr_nodes[2], 511 unsigned flags, 512 struct closure *cl) 513 { 514 struct btree *b; 515 unsigned interior; 516 int ret = 0; 517 518 BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); 519 520 /* 521 * Protects reaping from the btree node cache and using the btree node 522 * open bucket reserve: 523 */ 524 ret = bch2_btree_cache_cannibalize_lock(trans, cl); 525 if (ret) 526 return ret; 527 528 for (interior = 0; interior < 2; interior++) { 529 struct prealloc_nodes *p = as->prealloc_nodes + interior; 530 531 while (p->nr < nr_nodes[interior]) { 532 b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, 533 interior, flags); 534 if (IS_ERR(b)) { 535 ret = PTR_ERR(b); 536 goto err; 537 } 538 539 p->b[p->nr++] = b; 540 } 541 } 542 err: 543 bch2_btree_cache_cannibalize_unlock(trans); 544 return ret; 545 } 546 547 /* Asynchronous interior node update machinery */ 548 549 static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) 550 { 551 struct bch_fs *c = as->c; 552 553 if (as->took_gc_lock) 554 up_read(&c->gc_lock); 555 as->took_gc_lock = false; 556 557 bch2_journal_pin_drop(&c->journal, &as->journal); 558 bch2_journal_pin_flush(&c->journal, &as->journal); 559 bch2_disk_reservation_put(c, &as->disk_res); 560 bch2_btree_reserve_put(as, trans); 561 562 
bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], 563 as->start_time); 564 565 mutex_lock(&c->btree_interior_update_lock); 566 list_del(&as->unwritten_list); 567 list_del(&as->list); 568 569 closure_debug_destroy(&as->cl); 570 mempool_free(as, &c->btree_interior_update_pool); 571 572 /* 573 * Have to do the wakeup with btree_interior_update_lock still held, 574 * since being on btree_interior_update_list is our ref on @c: 575 */ 576 closure_wake_up(&c->btree_interior_update_wait); 577 578 mutex_unlock(&c->btree_interior_update_lock); 579 } 580 581 static void btree_update_add_key(struct btree_update *as, 582 struct keylist *keys, struct btree *b) 583 { 584 struct bkey_i *k = &b->key; 585 586 BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > 587 ARRAY_SIZE(as->_old_keys)); 588 589 bkey_copy(keys->top, k); 590 bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; 591 592 bch2_keylist_push(keys); 593 } 594 595 static bool btree_update_new_nodes_marked_sb(struct btree_update *as) 596 { 597 for_each_keylist_key(&as->new_keys, k) 598 if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k))) 599 return false; 600 return true; 601 } 602 603 static void btree_update_new_nodes_mark_sb(struct btree_update *as) 604 { 605 struct bch_fs *c = as->c; 606 607 mutex_lock(&c->sb_lock); 608 for_each_keylist_key(&as->new_keys, k) 609 bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k)); 610 611 bch2_write_super(c); 612 mutex_unlock(&c->sb_lock); 613 } 614 615 /* 616 * The transactional part of an interior btree node update, where we journal the 617 * update we did to the interior node and update alloc info: 618 */ 619 static int btree_update_nodes_written_trans(struct btree_trans *trans, 620 struct btree_update *as) 621 { 622 struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s); 623 int ret = PTR_ERR_OR_ZERO(e); 624 if (ret) 625 return ret; 626 627 memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64)); 628 629 trans->journal_pin = &as->journal; 630 631 for_each_keylist_key(&as->old_keys, k) { 632 unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; 633 634 ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 635 BTREE_TRIGGER_transactional); 636 if (ret) 637 return ret; 638 } 639 640 for_each_keylist_key(&as->new_keys, k) { 641 unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; 642 643 ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k), 644 BTREE_TRIGGER_transactional); 645 if (ret) 646 return ret; 647 } 648 649 return 0; 650 } 651 652 static void btree_update_nodes_written(struct btree_update *as) 653 { 654 struct bch_fs *c = as->c; 655 struct btree *b; 656 struct btree_trans *trans = bch2_trans_get(c); 657 u64 journal_seq = 0; 658 unsigned i; 659 int ret; 660 661 /* 662 * If we're already in an error state, it might be because a btree node 663 * was never written, and we might be trying to free that same btree 664 * node here, but it won't have been marked as allocated and we'll see 665 * spurious disk usage inconsistencies in the transactional part below 666 * if we don't skip it: 667 */ 668 ret = bch2_journal_error(&c->journal); 669 if (ret) 670 goto err; 671 672 if (!btree_update_new_nodes_marked_sb(as)) 673 btree_update_new_nodes_mark_sb(as); 674 675 /* 676 * Wait for any in flight writes to finish before we free the old nodes 677 * on disk: 678 */ 679 for (i = 0; i < as->nr_old_nodes; i++) { 680 __le64 seq; 681 682 b = as->old_nodes[i]; 683 684 bch2_trans_begin(trans); 685 
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); 686 seq = b->data ? b->data->keys.seq : 0; 687 six_unlock_read(&b->c.lock); 688 bch2_trans_unlock_long(trans); 689 690 if (seq == as->old_nodes_seq[i]) 691 wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, 692 TASK_UNINTERRUPTIBLE); 693 } 694 695 /* 696 * We did an update to a parent node where the pointers we added pointed 697 * to child nodes that weren't written yet: now, the child nodes have 698 * been written so we can write out the update to the interior node. 699 */ 700 701 /* 702 * We can't call into journal reclaim here: we'd block on the journal 703 * reclaim lock, but we may need to release the open buckets we have 704 * pinned in order for other btree updates to make forward progress, and 705 * journal reclaim does btree updates when flushing bkey_cached entries, 706 * which may require allocations as well. 707 */ 708 ret = commit_do(trans, &as->disk_res, &journal_seq, 709 BCH_WATERMARK_interior_updates| 710 BCH_TRANS_COMMIT_no_enospc| 711 BCH_TRANS_COMMIT_no_check_rw| 712 BCH_TRANS_COMMIT_journal_reclaim, 713 btree_update_nodes_written_trans(trans, as)); 714 bch2_trans_unlock(trans); 715 716 bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, 717 "%s", bch2_err_str(ret)); 718 err: 719 /* 720 * Ensure transaction is unlocked before using btree_node_lock_nopath() 721 * (the use of which is always suspect, we need to work on removing this 722 * in the future) 723 * 724 * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() 725 * calls bch2_path_upgrade(), before we call path_make_mut(), so we may 726 * rarely end up with a locked path besides the one we have here: 727 */ 728 bch2_trans_unlock(trans); 729 bch2_trans_begin(trans); 730 731 /* 732 * We have to be careful because another thread might be getting ready 733 * to free as->b and calling btree_update_reparent() on us - we'll 734 * recheck under btree_update_lock below: 735 */ 736 b = READ_ONCE(as->b); 737 if (b) { 738 /* 739 * @b is the node we did the final insert into: 740 * 741 * On failure to get a journal reservation, we still have to 742 * unblock the write and allow most of the write path to happen 743 * so that shutdown works, but the i->journal_seq mechanism 744 * won't work to prevent the btree write from being visible (we 745 * didn't get a journal sequence number) - instead 746 * __bch2_btree_node_write() doesn't do the actual write if 747 * we're in journal error state: 748 */ 749 750 btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, 751 as->btree_id, b->c.level, b->key.k.p); 752 struct btree_path *path = trans->paths + path_idx; 753 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); 754 mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); 755 path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); 756 path->l[b->c.level].b = b; 757 758 bch2_btree_node_lock_write_nofail(trans, path, &b->c); 759 760 mutex_lock(&c->btree_interior_update_lock); 761 762 list_del(&as->write_blocked_list); 763 if (list_empty(&b->write_blocked)) 764 clear_btree_node_write_blocked(b); 765 766 /* 767 * Node might have been freed, recheck under 768 * btree_interior_update_lock: 769 */ 770 if (as->b == b) { 771 BUG_ON(!b->c.level); 772 BUG_ON(!btree_node_dirty(b)); 773 774 if (!ret) { 775 struct bset *last = btree_bset_last(b); 776 777 last->journal_seq = cpu_to_le64( 778 max(journal_seq, 779 le64_to_cpu(last->journal_seq))); 780 781 bch2_btree_add_journal_pin(c, b, journal_seq); 782 } else { 783 /* 
784 * If we didn't get a journal sequence number we 785 * can't write this btree node, because recovery 786 * won't know to ignore this write: 787 */ 788 set_btree_node_never_write(b); 789 } 790 } 791 792 mutex_unlock(&c->btree_interior_update_lock); 793 794 mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); 795 six_unlock_write(&b->c.lock); 796 797 btree_node_write_if_need(trans, b, SIX_LOCK_intent); 798 btree_node_unlock(trans, path, b->c.level); 799 bch2_path_put(trans, path_idx, true); 800 } 801 802 bch2_journal_pin_drop(&c->journal, &as->journal); 803 804 mutex_lock(&c->btree_interior_update_lock); 805 for (i = 0; i < as->nr_new_nodes; i++) { 806 b = as->new_nodes[i]; 807 808 BUG_ON(b->will_make_reachable != (unsigned long) as); 809 b->will_make_reachable = 0; 810 clear_btree_node_will_make_reachable(b); 811 } 812 mutex_unlock(&c->btree_interior_update_lock); 813 814 for (i = 0; i < as->nr_new_nodes; i++) { 815 b = as->new_nodes[i]; 816 817 btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); 818 btree_node_write_if_need(trans, b, SIX_LOCK_read); 819 six_unlock_read(&b->c.lock); 820 } 821 822 for (i = 0; i < as->nr_open_buckets; i++) 823 bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); 824 825 bch2_btree_update_free(as, trans); 826 bch2_trans_put(trans); 827 } 828 829 static void btree_interior_update_work(struct work_struct *work) 830 { 831 struct bch_fs *c = 832 container_of(work, struct bch_fs, btree_interior_update_work); 833 struct btree_update *as; 834 835 while (1) { 836 mutex_lock(&c->btree_interior_update_lock); 837 as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, 838 struct btree_update, unwritten_list); 839 if (as && !as->nodes_written) 840 as = NULL; 841 mutex_unlock(&c->btree_interior_update_lock); 842 843 if (!as) 844 break; 845 846 btree_update_nodes_written(as); 847 } 848 } 849 850 static CLOSURE_CALLBACK(btree_update_set_nodes_written) 851 { 852 closure_type(as, struct btree_update, cl); 853 struct bch_fs *c = as->c; 854 855 mutex_lock(&c->btree_interior_update_lock); 856 as->nodes_written = true; 857 mutex_unlock(&c->btree_interior_update_lock); 858 859 queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); 860 } 861 862 /* 863 * We're updating @b with pointers to nodes that haven't finished writing yet: 864 * block @b from being written until @as completes 865 */ 866 static void btree_update_updated_node(struct btree_update *as, struct btree *b) 867 { 868 struct bch_fs *c = as->c; 869 870 BUG_ON(as->mode != BTREE_UPDATE_none); 871 BUG_ON(as->update_level_end < b->c.level); 872 BUG_ON(!btree_node_dirty(b)); 873 BUG_ON(!b->c.level); 874 875 mutex_lock(&c->btree_interior_update_lock); 876 list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); 877 878 as->mode = BTREE_UPDATE_node; 879 as->b = b; 880 as->update_level_end = b->c.level; 881 882 set_btree_node_write_blocked(b); 883 list_add(&as->write_blocked_list, &b->write_blocked); 884 885 mutex_unlock(&c->btree_interior_update_lock); 886 } 887 888 static int bch2_update_reparent_journal_pin_flush(struct journal *j, 889 struct journal_entry_pin *_pin, u64 seq) 890 { 891 return 0; 892 } 893 894 static void btree_update_reparent(struct btree_update *as, 895 struct btree_update *child) 896 { 897 struct bch_fs *c = as->c; 898 899 lockdep_assert_held(&c->btree_interior_update_lock); 900 901 child->b = NULL; 902 child->mode = BTREE_UPDATE_update; 903 904 bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, 
905 bch2_update_reparent_journal_pin_flush); 906 } 907 908 static void btree_update_updated_root(struct btree_update *as, struct btree *b) 909 { 910 struct bkey_i *insert = &b->key; 911 struct bch_fs *c = as->c; 912 913 BUG_ON(as->mode != BTREE_UPDATE_none); 914 915 BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > 916 ARRAY_SIZE(as->journal_entries)); 917 918 as->journal_u64s += 919 journal_entry_set((void *) &as->journal_entries[as->journal_u64s], 920 BCH_JSET_ENTRY_btree_root, 921 b->c.btree_id, b->c.level, 922 insert, insert->k.u64s); 923 924 mutex_lock(&c->btree_interior_update_lock); 925 list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); 926 927 as->mode = BTREE_UPDATE_root; 928 mutex_unlock(&c->btree_interior_update_lock); 929 } 930 931 /* 932 * bch2_btree_update_add_new_node: 933 * 934 * This causes @as to wait on @b to be written, before it gets to 935 * bch2_btree_update_nodes_written 936 * 937 * Additionally, it sets b->will_make_reachable to prevent any additional writes 938 * to @b from happening besides the first until @b is reachable on disk 939 * 940 * And it adds @b to the list of @as's new nodes, so that we can update sector 941 * counts in bch2_btree_update_nodes_written: 942 */ 943 static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) 944 { 945 struct bch_fs *c = as->c; 946 947 closure_get(&as->cl); 948 949 mutex_lock(&c->btree_interior_update_lock); 950 BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); 951 BUG_ON(b->will_make_reachable); 952 953 as->new_nodes[as->nr_new_nodes++] = b; 954 b->will_make_reachable = 1UL|(unsigned long) as; 955 set_btree_node_will_make_reachable(b); 956 957 mutex_unlock(&c->btree_interior_update_lock); 958 959 btree_update_add_key(as, &as->new_keys, b); 960 961 if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { 962 unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; 963 unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; 964 965 bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = 966 cpu_to_le16(sectors); 967 } 968 } 969 970 /* 971 * returns true if @b was a new node 972 */ 973 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) 974 { 975 struct btree_update *as; 976 unsigned long v; 977 unsigned i; 978 979 mutex_lock(&c->btree_interior_update_lock); 980 /* 981 * When b->will_make_reachable != 0, it owns a ref on as->cl that's 982 * dropped when it gets written by bch2_btree_complete_write - the 983 * xchg() is for synchronization with bch2_btree_complete_write: 984 */ 985 v = xchg(&b->will_make_reachable, 0); 986 clear_btree_node_will_make_reachable(b); 987 as = (struct btree_update *) (v & ~1UL); 988 989 if (!as) { 990 mutex_unlock(&c->btree_interior_update_lock); 991 return; 992 } 993 994 for (i = 0; i < as->nr_new_nodes; i++) 995 if (as->new_nodes[i] == b) 996 goto found; 997 998 BUG(); 999 found: 1000 array_remove_item(as->new_nodes, as->nr_new_nodes, i); 1001 mutex_unlock(&c->btree_interior_update_lock); 1002 1003 if (v & 1) 1004 closure_put(&as->cl); 1005 } 1006 1007 static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) 1008 { 1009 while (b->ob.nr) 1010 as->open_buckets[as->nr_open_buckets++] = 1011 b->ob.v[--b->ob.nr]; 1012 } 1013 1014 static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j, 1015 struct journal_entry_pin *_pin, u64 seq) 1016 { 1017 return 0; 1018 } 1019 1020 /* 1021 * @b is being split/rewritten: it may have pointers to not-yet-written btree 1022 * nodes and 
thus outstanding btree_updates - redirect @b's 1023 * btree_updates to point to this btree_update: 1024 */ 1025 static void bch2_btree_interior_update_will_free_node(struct btree_update *as, 1026 struct btree *b) 1027 { 1028 struct bch_fs *c = as->c; 1029 struct btree_update *p, *n; 1030 struct btree_write *w; 1031 1032 set_btree_node_dying(b); 1033 1034 if (btree_node_fake(b)) 1035 return; 1036 1037 mutex_lock(&c->btree_interior_update_lock); 1038 1039 /* 1040 * Does this node have any btree_update operations preventing 1041 * it from being written? 1042 * 1043 * If so, redirect them to point to this btree_update: we can 1044 * write out our new nodes, but we won't make them visible until those 1045 * operations complete 1046 */ 1047 list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { 1048 list_del_init(&p->write_blocked_list); 1049 btree_update_reparent(as, p); 1050 1051 /* 1052 * for flush_held_btree_writes() waiting on updates to flush or 1053 * nodes to be writeable: 1054 */ 1055 closure_wake_up(&c->btree_interior_update_wait); 1056 } 1057 1058 clear_btree_node_dirty_acct(c, b); 1059 clear_btree_node_need_write(b); 1060 clear_btree_node_write_blocked(b); 1061 1062 /* 1063 * Does this node have unwritten data that has a pin on the journal? 1064 * 1065 * If so, transfer that pin to the btree_update operation - 1066 * note that if we're freeing multiple nodes, we only need to keep the 1067 * oldest pin of any of the nodes we're freeing. We'll release the pin 1068 * when the new nodes are persistent and reachable on disk: 1069 */ 1070 w = btree_current_write(b); 1071 bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, 1072 bch2_btree_update_will_free_node_journal_pin_flush); 1073 bch2_journal_pin_drop(&c->journal, &w->journal); 1074 1075 w = btree_prev_write(b); 1076 bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, 1077 bch2_btree_update_will_free_node_journal_pin_flush); 1078 bch2_journal_pin_drop(&c->journal, &w->journal); 1079 1080 mutex_unlock(&c->btree_interior_update_lock); 1081 1082 /* 1083 * Is this a node that isn't reachable on disk yet? 
1084 * 1085 * Nodes that aren't reachable yet have writes blocked until they're 1086 * reachable - now that we've cancelled any pending writes and moved 1087 * things waiting on that write to wait on this update, we can drop this 1088 * node from the list of nodes that the other update is making 1089 * reachable, prior to freeing it: 1090 */ 1091 btree_update_drop_new_node(c, b); 1092 1093 btree_update_add_key(as, &as->old_keys, b); 1094 1095 as->old_nodes[as->nr_old_nodes] = b; 1096 as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; 1097 as->nr_old_nodes++; 1098 } 1099 1100 static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) 1101 { 1102 struct bch_fs *c = as->c; 1103 u64 start_time = as->start_time; 1104 1105 BUG_ON(as->mode == BTREE_UPDATE_none); 1106 1107 if (as->took_gc_lock) 1108 up_read(&as->c->gc_lock); 1109 as->took_gc_lock = false; 1110 1111 bch2_btree_reserve_put(as, trans); 1112 1113 continue_at(&as->cl, btree_update_set_nodes_written, 1114 as->c->btree_interior_update_worker); 1115 1116 bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], 1117 start_time); 1118 } 1119 1120 static struct btree_update * 1121 bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, 1122 unsigned level_start, bool split, unsigned flags) 1123 { 1124 struct bch_fs *c = trans->c; 1125 struct btree_update *as; 1126 u64 start_time = local_clock(); 1127 int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc) 1128 ? BCH_DISK_RESERVATION_NOFAIL : 0; 1129 unsigned nr_nodes[2] = { 0, 0 }; 1130 unsigned level_end = level_start; 1131 enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; 1132 int ret = 0; 1133 u32 restart_count = trans->restart_count; 1134 1135 BUG_ON(!path->should_be_locked); 1136 1137 if (watermark == BCH_WATERMARK_copygc) 1138 watermark = BCH_WATERMARK_btree_copygc; 1139 if (watermark < BCH_WATERMARK_btree) 1140 watermark = BCH_WATERMARK_btree; 1141 1142 flags &= ~BCH_WATERMARK_MASK; 1143 flags |= watermark; 1144 1145 if (watermark < BCH_WATERMARK_reclaim && 1146 test_bit(JOURNAL_space_low, &c->journal.flags)) { 1147 if (flags & BCH_TRANS_COMMIT_journal_reclaim) 1148 return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock); 1149 1150 ret = drop_locks_do(trans, 1151 ({ wait_event(c->journal.wait, !test_bit(JOURNAL_space_low, &c->journal.flags)); 0; })); 1152 if (ret) 1153 return ERR_PTR(ret); 1154 } 1155 1156 while (1) { 1157 nr_nodes[!!level_end] += 1 + split; 1158 level_end++; 1159 1160 ret = bch2_btree_path_upgrade(trans, path, level_end + 1); 1161 if (ret) 1162 return ERR_PTR(ret); 1163 1164 if (!btree_path_node(path, level_end)) { 1165 /* Allocating new root? 
*/ 1166 nr_nodes[1] += split; 1167 level_end = BTREE_MAX_DEPTH; 1168 break; 1169 } 1170 1171 /* 1172 * Always check for space for two keys, even if we won't have to 1173 * split at prior level - it might have been a merge instead: 1174 */ 1175 if (bch2_btree_node_insert_fits(path->l[level_end].b, 1176 BKEY_BTREE_PTR_U64s_MAX * 2)) 1177 break; 1178 1179 split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); 1180 } 1181 1182 if (!down_read_trylock(&c->gc_lock)) { 1183 ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); 1184 if (ret) { 1185 up_read(&c->gc_lock); 1186 return ERR_PTR(ret); 1187 } 1188 } 1189 1190 as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); 1191 memset(as, 0, sizeof(*as)); 1192 closure_init(&as->cl, NULL); 1193 as->c = c; 1194 as->start_time = start_time; 1195 as->ip_started = _RET_IP_; 1196 as->mode = BTREE_UPDATE_none; 1197 as->flags = flags; 1198 as->took_gc_lock = true; 1199 as->btree_id = path->btree_id; 1200 as->update_level_start = level_start; 1201 as->update_level_end = level_end; 1202 INIT_LIST_HEAD(&as->list); 1203 INIT_LIST_HEAD(&as->unwritten_list); 1204 INIT_LIST_HEAD(&as->write_blocked_list); 1205 bch2_keylist_init(&as->old_keys, as->_old_keys); 1206 bch2_keylist_init(&as->new_keys, as->_new_keys); 1207 bch2_keylist_init(&as->parent_keys, as->inline_keys); 1208 1209 mutex_lock(&c->btree_interior_update_lock); 1210 list_add_tail(&as->list, &c->btree_interior_update_list); 1211 mutex_unlock(&c->btree_interior_update_lock); 1212 1213 /* 1214 * We don't want to allocate if we're in an error state, that can cause 1215 * deadlock on emergency shutdown due to open buckets getting stuck in 1216 * the btree_reserve_cache after allocator shutdown has cleared it out. 1217 * This check needs to come after adding us to the btree_interior_update 1218 * list but before calling bch2_btree_reserve_get, to synchronize with 1219 * __bch2_fs_read_only(). 
1220 */ 1221 ret = bch2_journal_error(&c->journal); 1222 if (ret) 1223 goto err; 1224 1225 ret = bch2_disk_reservation_get(c, &as->disk_res, 1226 (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), 1227 c->opts.metadata_replicas, 1228 disk_res_flags); 1229 if (ret) 1230 goto err; 1231 1232 ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); 1233 if (bch2_err_matches(ret, ENOSPC) || 1234 bch2_err_matches(ret, ENOMEM)) { 1235 struct closure cl; 1236 1237 /* 1238 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK 1239 * flag 1240 */ 1241 if (bch2_err_matches(ret, ENOSPC) && 1242 (flags & BCH_TRANS_COMMIT_journal_reclaim) && 1243 watermark < BCH_WATERMARK_reclaim) { 1244 ret = -BCH_ERR_journal_reclaim_would_deadlock; 1245 goto err; 1246 } 1247 1248 closure_init_stack(&cl); 1249 1250 do { 1251 ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); 1252 1253 bch2_trans_unlock(trans); 1254 bch2_wait_on_allocator(c, &cl); 1255 } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); 1256 } 1257 1258 if (ret) { 1259 trace_and_count(c, btree_reserve_get_fail, trans->fn, 1260 _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); 1261 goto err; 1262 } 1263 1264 ret = bch2_trans_relock(trans); 1265 if (ret) 1266 goto err; 1267 1268 bch2_trans_verify_not_restarted(trans, restart_count); 1269 return as; 1270 err: 1271 bch2_btree_update_free(as, trans); 1272 if (!bch2_err_matches(ret, ENOSPC) && 1273 !bch2_err_matches(ret, EROFS) && 1274 ret != -BCH_ERR_journal_reclaim_would_deadlock) 1275 bch_err_fn_ratelimited(c, ret); 1276 return ERR_PTR(ret); 1277 } 1278 1279 /* Btree root updates: */ 1280 1281 static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) 1282 { 1283 /* Root nodes cannot be reaped */ 1284 mutex_lock(&c->btree_cache.lock); 1285 list_del_init(&b->list); 1286 mutex_unlock(&c->btree_cache.lock); 1287 1288 mutex_lock(&c->btree_root_lock); 1289 bch2_btree_id_root(c, b->c.btree_id)->b = b; 1290 mutex_unlock(&c->btree_root_lock); 1291 1292 bch2_recalc_btree_reserve(c); 1293 } 1294 1295 static int bch2_btree_set_root(struct btree_update *as, 1296 struct btree_trans *trans, 1297 struct btree_path *path, 1298 struct btree *b, 1299 bool nofail) 1300 { 1301 struct bch_fs *c = as->c; 1302 1303 trace_and_count(c, btree_node_set_root, trans, b); 1304 1305 struct btree *old = btree_node_root(c, b); 1306 1307 /* 1308 * Ensure no one is using the old root while we switch to the 1309 * new root: 1310 */ 1311 if (nofail) { 1312 bch2_btree_node_lock_write_nofail(trans, path, &old->c); 1313 } else { 1314 int ret = bch2_btree_node_lock_write(trans, path, &old->c); 1315 if (ret) 1316 return ret; 1317 } 1318 1319 bch2_btree_set_root_inmem(c, b); 1320 1321 btree_update_updated_root(as, b); 1322 1323 /* 1324 * Unlock old root after new root is visible: 1325 * 1326 * The new root isn't persistent, but that's ok: we still have 1327 * an intent lock on the new root, and any updates that would 1328 * depend on the new root would have to update the new root. 
1329 */ 1330 bch2_btree_node_unlock_write(trans, path, old); 1331 return 0; 1332 } 1333 1334 /* Interior node updates: */ 1335 1336 static void bch2_insert_fixup_btree_ptr(struct btree_update *as, 1337 struct btree_trans *trans, 1338 struct btree_path *path, 1339 struct btree *b, 1340 struct btree_node_iter *node_iter, 1341 struct bkey_i *insert) 1342 { 1343 struct bch_fs *c = as->c; 1344 struct bkey_packed *k; 1345 struct printbuf buf = PRINTBUF; 1346 unsigned long old, new; 1347 1348 BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && 1349 !btree_ptr_sectors_written(bkey_i_to_s_c(insert))); 1350 1351 if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) 1352 bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); 1353 1354 struct bkey_validate_context from = (struct bkey_validate_context) { 1355 .from = BKEY_VALIDATE_btree_node, 1356 .level = b->c.level, 1357 .btree = b->c.btree_id, 1358 .flags = BCH_VALIDATE_commit, 1359 }; 1360 if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?: 1361 bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) { 1362 bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); 1363 dump_stack(); 1364 } 1365 1366 BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > 1367 ARRAY_SIZE(as->journal_entries)); 1368 1369 as->journal_u64s += 1370 journal_entry_set((void *) &as->journal_entries[as->journal_u64s], 1371 BCH_JSET_ENTRY_btree_keys, 1372 b->c.btree_id, b->c.level, 1373 insert, insert->k.u64s); 1374 1375 while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && 1376 bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) 1377 bch2_btree_node_iter_advance(node_iter, b); 1378 1379 bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); 1380 set_btree_node_dirty_acct(c, b); 1381 1382 old = READ_ONCE(b->flags); 1383 do { 1384 new = old; 1385 1386 new &= ~BTREE_WRITE_TYPE_MASK; 1387 new |= BTREE_WRITE_interior; 1388 new |= 1 << BTREE_NODE_need_write; 1389 } while (!try_cmpxchg(&b->flags, &old, new)); 1390 1391 printbuf_exit(&buf); 1392 } 1393 1394 static void 1395 bch2_btree_insert_keys_interior(struct btree_update *as, 1396 struct btree_trans *trans, 1397 struct btree_path *path, 1398 struct btree *b, 1399 struct btree_node_iter node_iter, 1400 struct keylist *keys) 1401 { 1402 struct bkey_i *insert = bch2_keylist_front(keys); 1403 struct bkey_packed *k; 1404 1405 BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); 1406 1407 while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && 1408 (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) 1409 ; 1410 1411 for (; 1412 insert != keys->top && bpos_le(insert->k.p, b->key.k.p); 1413 insert = bkey_next(insert)) 1414 bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); 1415 1416 if (bch2_btree_node_check_topology(trans, b)) { 1417 struct printbuf buf = PRINTBUF; 1418 1419 for (struct bkey_i *k = keys->keys; 1420 k != insert; 1421 k = bkey_next(k)) { 1422 bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); 1423 prt_newline(&buf); 1424 } 1425 1426 panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf); 1427 } 1428 1429 memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); 1430 keys->top_p -= insert->_data - keys->keys_p; 1431 } 1432 1433 static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) 1434 { 1435 if (insert_keys) 1436 for_each_keylist_key(insert_keys, k) 1437 if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos)) 1438 return true; 1439 return false; 1440 } 1441 1442 /* 1443 * Move keys from n1 (original 
replacement node, now lower node) to n2 (higher 1444 * node) 1445 */ 1446 static void __btree_split_node(struct btree_update *as, 1447 struct btree_trans *trans, 1448 struct btree *b, 1449 struct btree *n[2], 1450 struct keylist *insert_keys) 1451 { 1452 struct bkey_packed *k; 1453 struct bpos n1_pos = POS_MIN; 1454 struct btree_node_iter iter; 1455 struct bset *bsets[2]; 1456 struct bkey_format_state format[2]; 1457 struct bkey_packed *out[2]; 1458 struct bkey uk; 1459 unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5; 1460 struct { unsigned nr_keys, val_u64s; } nr_keys[2]; 1461 int i; 1462 1463 memset(&nr_keys, 0, sizeof(nr_keys)); 1464 1465 for (i = 0; i < 2; i++) { 1466 BUG_ON(n[i]->nsets != 1); 1467 1468 bsets[i] = btree_bset_first(n[i]); 1469 out[i] = bsets[i]->start; 1470 1471 SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1); 1472 bch2_bkey_format_init(&format[i]); 1473 } 1474 1475 u64s = 0; 1476 for_each_btree_node_key(b, k, &iter) { 1477 if (bkey_deleted(k)) 1478 continue; 1479 1480 uk = bkey_unpack_key(b, k); 1481 1482 if (b->c.level && 1483 u64s < n1_u64s && 1484 u64s + k->u64s >= n1_u64s && 1485 (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) || 1486 key_deleted_in_insert(insert_keys, uk.p))) 1487 n1_u64s += k->u64s; 1488 1489 i = u64s >= n1_u64s; 1490 u64s += k->u64s; 1491 if (!i) 1492 n1_pos = uk.p; 1493 bch2_bkey_format_add_key(&format[i], &uk); 1494 1495 nr_keys[i].nr_keys++; 1496 nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k); 1497 } 1498 1499 btree_set_min(n[0], b->data->min_key); 1500 btree_set_max(n[0], n1_pos); 1501 btree_set_min(n[1], bpos_successor(n1_pos)); 1502 btree_set_max(n[1], b->data->max_key); 1503 1504 for (i = 0; i < 2; i++) { 1505 bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key); 1506 bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key); 1507 1508 n[i]->data->format = bch2_bkey_format_done(&format[i]); 1509 1510 unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + 1511 nr_keys[i].val_u64s; 1512 if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b)) 1513 n[i]->data->format = b->format; 1514 1515 btree_node_set_format(n[i], n[i]->data->format); 1516 } 1517 1518 u64s = 0; 1519 for_each_btree_node_key(b, k, &iter) { 1520 if (bkey_deleted(k)) 1521 continue; 1522 1523 i = u64s >= n1_u64s; 1524 u64s += k->u64s; 1525 1526 if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k) 1527 ? &b->format: &bch2_bkey_format_current, k)) 1528 out[i]->format = KEY_FORMAT_LOCAL_BTREE; 1529 else 1530 bch2_bkey_unpack(b, (void *) out[i], k); 1531 1532 out[i]->needs_whiteout = false; 1533 1534 btree_keys_account_key_add(&n[i]->nr, 0, out[i]); 1535 out[i] = bkey_p_next(out[i]); 1536 } 1537 1538 for (i = 0; i < 2; i++) { 1539 bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data); 1540 1541 BUG_ON(!bsets[i]->u64s); 1542 1543 set_btree_bset_end(n[i], n[i]->set); 1544 1545 btree_node_reset_sib_u64s(n[i]); 1546 1547 bch2_verify_btree_nr_keys(n[i]); 1548 1549 BUG_ON(bch2_btree_node_check_topology(trans, n[i])); 1550 } 1551 } 1552 1553 /* 1554 * For updates to interior nodes, we've got to do the insert before we split 1555 * because the stuff we're inserting has to be inserted atomically. Post split, 1556 * the keys might have to go in different nodes and the split would no longer be 1557 * atomic. 
1558 * 1559 * Worse, if the insert is from btree node coalescing, if we do the insert after 1560 * we do the split (and pick the pivot) - the pivot we pick might be between 1561 * nodes that were coalesced, and thus in the middle of a child node post 1562 * coalescing: 1563 */ 1564 static void btree_split_insert_keys(struct btree_update *as, 1565 struct btree_trans *trans, 1566 btree_path_idx_t path_idx, 1567 struct btree *b, 1568 struct keylist *keys) 1569 { 1570 struct btree_path *path = trans->paths + path_idx; 1571 1572 if (!bch2_keylist_empty(keys) && 1573 bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { 1574 struct btree_node_iter node_iter; 1575 1576 bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); 1577 1578 bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); 1579 } 1580 } 1581 1582 static int btree_split(struct btree_update *as, struct btree_trans *trans, 1583 btree_path_idx_t path, struct btree *b, 1584 struct keylist *keys) 1585 { 1586 struct bch_fs *c = as->c; 1587 struct btree *parent = btree_node_parent(trans->paths + path, b); 1588 struct btree *n1, *n2 = NULL, *n3 = NULL; 1589 btree_path_idx_t path1 = 0, path2 = 0; 1590 u64 start_time = local_clock(); 1591 int ret = 0; 1592 1593 bch2_verify_btree_nr_keys(b); 1594 BUG_ON(!parent && (b != btree_node_root(c, b))); 1595 BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); 1596 1597 ret = bch2_btree_node_check_topology(trans, b); 1598 if (ret) 1599 return ret; 1600 1601 if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { 1602 struct btree *n[2]; 1603 1604 trace_and_count(c, btree_node_split, trans, b); 1605 1606 n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); 1607 n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); 1608 1609 __btree_split_node(as, trans, b, n, keys); 1610 1611 if (keys) { 1612 btree_split_insert_keys(as, trans, path, n1, keys); 1613 btree_split_insert_keys(as, trans, path, n2, keys); 1614 BUG_ON(!bch2_keylist_empty(keys)); 1615 } 1616 1617 bch2_btree_build_aux_trees(n2); 1618 bch2_btree_build_aux_trees(n1); 1619 1620 bch2_btree_update_add_new_node(as, n1); 1621 bch2_btree_update_add_new_node(as, n2); 1622 six_unlock_write(&n2->c.lock); 1623 six_unlock_write(&n1->c.lock); 1624 1625 path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); 1626 six_lock_increment(&n1->c.lock, SIX_LOCK_intent); 1627 mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); 1628 bch2_btree_path_level_init(trans, trans->paths + path1, n1); 1629 1630 path2 = bch2_path_get_unlocked_mut(trans, as->btree_id, n2->c.level, n2->key.k.p); 1631 six_lock_increment(&n2->c.lock, SIX_LOCK_intent); 1632 mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); 1633 bch2_btree_path_level_init(trans, trans->paths + path2, n2); 1634 1635 /* 1636 * Note that on recursive parent_keys == keys, so we 1637 * can't start adding new keys to parent_keys before emptying it 1638 * out (which we did with btree_split_insert_keys() above) 1639 */ 1640 bch2_keylist_add(&as->parent_keys, &n1->key); 1641 bch2_keylist_add(&as->parent_keys, &n2->key); 1642 1643 if (!parent) { 1644 /* Depth increases, make a new root */ 1645 n3 = __btree_root_alloc(as, trans, b->c.level + 1); 1646 1647 bch2_btree_update_add_new_node(as, n3); 1648 six_unlock_write(&n3->c.lock); 1649 1650 trans->paths[path2].locks_want++; 1651 BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level)); 1652 
six_lock_increment(&n3->c.lock, SIX_LOCK_intent); 1653 mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); 1654 bch2_btree_path_level_init(trans, trans->paths + path2, n3); 1655 1656 n3->sib_u64s[0] = U16_MAX; 1657 n3->sib_u64s[1] = U16_MAX; 1658 1659 btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); 1660 } 1661 } else { 1662 trace_and_count(c, btree_node_compact, trans, b); 1663 1664 n1 = bch2_btree_node_alloc_replacement(as, trans, b); 1665 1666 if (keys) { 1667 btree_split_insert_keys(as, trans, path, n1, keys); 1668 BUG_ON(!bch2_keylist_empty(keys)); 1669 } 1670 1671 bch2_btree_build_aux_trees(n1); 1672 bch2_btree_update_add_new_node(as, n1); 1673 six_unlock_write(&n1->c.lock); 1674 1675 path1 = bch2_path_get_unlocked_mut(trans, as->btree_id, n1->c.level, n1->key.k.p); 1676 six_lock_increment(&n1->c.lock, SIX_LOCK_intent); 1677 mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); 1678 bch2_btree_path_level_init(trans, trans->paths + path1, n1); 1679 1680 if (parent) 1681 bch2_keylist_add(&as->parent_keys, &n1->key); 1682 } 1683 1684 /* New nodes all written, now make them visible: */ 1685 1686 if (parent) { 1687 /* Split a non root node */ 1688 ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); 1689 } else if (n3) { 1690 ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false); 1691 } else { 1692 /* Root filled up but didn't need to be split */ 1693 ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false); 1694 } 1695 1696 if (ret) 1697 goto err; 1698 1699 bch2_btree_interior_update_will_free_node(as, b); 1700 1701 if (n3) { 1702 bch2_btree_update_get_open_buckets(as, n3); 1703 bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0); 1704 } 1705 if (n2) { 1706 bch2_btree_update_get_open_buckets(as, n2); 1707 bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0); 1708 } 1709 bch2_btree_update_get_open_buckets(as, n1); 1710 bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0); 1711 1712 /* 1713 * The old node must be freed (in memory) _before_ unlocking the new 1714 * nodes - else another thread could re-acquire a read lock on the old 1715 * node after another thread has locked and updated the new node, thus 1716 * seeing stale data: 1717 */ 1718 bch2_btree_node_free_inmem(trans, trans->paths + path, b); 1719 1720 if (n3) 1721 bch2_trans_node_add(trans, trans->paths + path, n3); 1722 if (n2) 1723 bch2_trans_node_add(trans, trans->paths + path2, n2); 1724 bch2_trans_node_add(trans, trans->paths + path1, n1); 1725 1726 if (n3) 1727 six_unlock_intent(&n3->c.lock); 1728 if (n2) 1729 six_unlock_intent(&n2->c.lock); 1730 six_unlock_intent(&n1->c.lock); 1731 out: 1732 if (path2) { 1733 __bch2_btree_path_unlock(trans, trans->paths + path2); 1734 bch2_path_put(trans, path2, true); 1735 } 1736 if (path1) { 1737 __bch2_btree_path_unlock(trans, trans->paths + path1); 1738 bch2_path_put(trans, path1, true); 1739 } 1740 1741 bch2_trans_verify_locks(trans); 1742 1743 bch2_time_stats_update(&c->times[n2 1744 ? 
BCH_TIME_btree_node_split 1745 : BCH_TIME_btree_node_compact], 1746 start_time); 1747 return ret; 1748 err: 1749 if (n3) 1750 bch2_btree_node_free_never_used(as, trans, n3); 1751 if (n2) 1752 bch2_btree_node_free_never_used(as, trans, n2); 1753 bch2_btree_node_free_never_used(as, trans, n1); 1754 goto out; 1755 } 1756 1757 /** 1758 * bch2_btree_insert_node - insert bkeys into a given btree node 1759 * 1760 * @as: btree_update object 1761 * @trans: btree_trans object 1762 * @path_idx: path that points to current node 1763 * @b: node to insert keys into 1764 * @keys: list of keys to insert 1765 * 1766 * Returns: 0 on success, typically transaction restart error on failure 1767 * 1768 * Inserts as many keys as it can into a given btree node, splitting it if full. 1769 * If a split occurred, this function will return early. This can only happen 1770 * for leaf nodes -- inserts into interior nodes have to be atomic. 1771 */ 1772 static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, 1773 btree_path_idx_t path_idx, struct btree *b, 1774 struct keylist *keys) 1775 { 1776 struct bch_fs *c = as->c; 1777 struct btree_path *path = trans->paths + path_idx, *linked; 1778 unsigned i; 1779 int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); 1780 int old_live_u64s = b->nr.live_u64s; 1781 int live_u64s_added, u64s_added; 1782 int ret; 1783 1784 lockdep_assert_held(&c->gc_lock); 1785 BUG_ON(!btree_node_intent_locked(path, b->c.level)); 1786 BUG_ON(!b->c.level); 1787 BUG_ON(!as || as->b); 1788 bch2_verify_keylist_sorted(keys); 1789 1790 ret = bch2_btree_node_lock_write(trans, path, &b->c); 1791 if (ret) 1792 return ret; 1793 1794 bch2_btree_node_prep_for_write(trans, path, b); 1795 1796 if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) { 1797 bch2_btree_node_unlock_write(trans, path, b); 1798 goto split; 1799 } 1800 1801 ret = bch2_btree_node_check_topology(trans, b); 1802 if (ret) { 1803 bch2_btree_node_unlock_write(trans, path, b); 1804 return ret; 1805 } 1806 1807 bch2_btree_insert_keys_interior(as, trans, path, b, 1808 path->l[b->c.level].iter, keys); 1809 1810 trans_for_each_path_with_node(trans, b, linked, i) 1811 bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); 1812 1813 bch2_trans_verify_paths(trans); 1814 1815 live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; 1816 u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; 1817 1818 if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) 1819 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); 1820 if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) 1821 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); 1822 1823 if (u64s_added > live_u64s_added && 1824 bch2_maybe_compact_whiteouts(c, b)) 1825 bch2_trans_node_reinit_iter(trans, b); 1826 1827 btree_update_updated_node(as, b); 1828 bch2_btree_node_unlock_write(trans, path, b); 1829 return 0; 1830 split: 1831 /* 1832 * We could attempt to avoid the transaction restart, by calling 1833 * bch2_btree_path_upgrade() and allocating more nodes: 1834 */ 1835 if (b->c.level >= as->update_level_end) { 1836 trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); 1837 return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); 1838 } 1839 1840 return btree_split(as, trans, path_idx, b, keys); 1841 } 1842 1843 int bch2_btree_split_leaf(struct btree_trans *trans, 1844 btree_path_idx_t path, 1845 unsigned flags) 1846 { 1847 /* btree_split & merge may both cause paths array to be reallocated */ 
int bch2_btree_split_leaf(struct btree_trans *trans,
			  btree_path_idx_t path,
			  unsigned flags)
{
	/* btree_split & merge may both cause paths array to be reallocated */
	struct btree *b = path_l(trans->paths + path)->b;
	struct btree_update *as;
	unsigned l;
	int ret = 0;

	as = bch2_btree_update_start(trans, trans->paths + path,
				     trans->paths[path].level,
				     true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	ret = btree_split(as, trans, path, b, NULL);
	if (ret) {
		bch2_btree_update_free(as, trans);
		return ret;
	}

	bch2_btree_update_done(as, trans);

	for (l = trans->paths[path].level + 1;
	     btree_node_intent_locked(&trans->paths[path], l) && !ret;
	     l++)
		ret = bch2_foreground_maybe_merge(trans, path, l, flags);

	return ret;
}

static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
				   btree_path_idx_t path_idx)
{
	struct bch_fs *c = as->c;
	struct btree_path *path = trans->paths + path_idx;
	struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;

	BUG_ON(!btree_node_locked(path, b->c.level));

	n = __btree_root_alloc(as, trans, b->c.level + 1);

	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	path->locks_want++;
	BUG_ON(btree_node_locked(path, n->c.level));
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, path, n);

	n->sib_u64s[0] = U16_MAX;
	n->sib_u64s[1] = U16_MAX;

	bch2_keylist_add(&as->parent_keys, &b->key);
	btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);

	int ret = bch2_btree_set_root(as, trans, path, n, true);
	BUG_ON(ret);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
	bch2_trans_node_add(trans, path, n);
	six_unlock_intent(&n->c.lock);

	mutex_lock(&c->btree_cache.lock);
	list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
	mutex_unlock(&c->btree_cache.lock);

	bch2_trans_verify_locks(trans);
}

int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;

	if (btree_node_fake(b))
		return bch2_btree_split_leaf(trans, path, flags);

	struct btree_update *as =
		bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	__btree_increase_depth(as, trans, path);
	bch2_btree_update_done(as, trans);
	return 0;
}
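
/*
 * Try to merge the node @path points to at @level with its sibling in the
 * direction @sib: look up the sibling, check that both nodes share a parent,
 * and estimate how many u64s the pair would occupy when repacked into a
 * single node with a combined key format. Only if that estimate (with some
 * hysteresis applied) is under the foreground merge threshold do we allocate
 * the merged node, update the parent (a whiteout for prev's key plus the new
 * node's key, which takes over next's position), and free both old nodes.
 */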
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
				  btree_path_idx_t path,
				  unsigned level,
				  unsigned flags,
				  enum btree_node_sibling sib)
{
	struct bch_fs *c = trans->c;
	struct btree_update *as;
	struct bkey_format_state new_s;
	struct bkey_format new_f;
	struct bkey_i delete;
	struct btree *b, *m, *n, *prev, *next, *parent;
	struct bpos sib_pos;
	size_t sib_u64s;
	enum btree_id btree = trans->paths[path].btree_id;
	btree_path_idx_t sib_path = 0, new_path = 0;
	u64 start_time = local_clock();
	int ret = 0;

	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	BUG_ON(!trans->paths[path].should_be_locked);
	BUG_ON(!btree_node_locked(&trans->paths[path], level));

	/*
	 * Work around a deadlock caused by the btree write buffer not doing
	 * merges and leaving tons of merges for us to do - we really don't need
	 * to be doing merges at all from the interior update path, and if the
	 * interior update path is generating too many new interior updates we
	 * deadlock:
	 */
	if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
		return 0;

	if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
		flags &= ~BCH_WATERMARK_MASK;
		flags |= BCH_WATERMARK_btree;
		flags |= BCH_TRANS_COMMIT_journal_reclaim;
	}

	b = trans->paths[path].l[level].b;

	if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
	    (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
		b->sib_u64s[sib] = U16_MAX;
		return 0;
	}

	sib_pos = sib == btree_prev_sib
		? bpos_predecessor(b->data->min_key)
		: bpos_successor(b->data->max_key);

	sib_path = bch2_path_get(trans, btree, sib_pos,
				 U8_MAX, level, BTREE_ITER_intent, _THIS_IP_);
	ret = bch2_btree_path_traverse(trans, sib_path, false);
	if (ret)
		goto err;

	btree_path_set_should_be_locked(trans, trans->paths + sib_path);

	m = trans->paths[sib_path].l[level].b;

	if (btree_node_parent(trans->paths + path, b) !=
	    btree_node_parent(trans->paths + sib_path, m)) {
		b->sib_u64s[sib] = U16_MAX;
		goto out;
	}

	if (sib == btree_prev_sib) {
		prev = m;
		next = b;
	} else {
		prev = b;
		next = m;
	}

	if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

		bch2_bpos_to_text(&buf1, prev->data->max_key);
		bch2_bpos_to_text(&buf2, next->data->min_key);
		bch_err(c,
			"%s(): btree topology error:\n"
			" prev ends at %s\n"
			" next starts at %s",
			__func__, buf1.buf, buf2.buf);
		printbuf_exit(&buf1);
		printbuf_exit(&buf2);
		ret = bch2_topology_error(c);
		goto err;
	}

	bch2_bkey_format_init(&new_s);
	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
	__bch2_btree_calc_format(&new_s, prev);
	__bch2_btree_calc_format(&new_s, next);
	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
	new_f = bch2_bkey_format_done(&new_s);

	sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
		btree_node_u64s_with_format(m->nr, &m->format, &new_f);

	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
		sib_u64s /= 2;
		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
	}

	sib_u64s = min(sib_u64s, btree_max_u64s(c));
	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
	b->sib_u64s[sib] = sib_u64s;

	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
		goto out;

	parent = btree_node_parent(trans->paths + path, b);
	as = bch2_btree_update_start(trans, trans->paths + path, level, false,
				     BCH_TRANS_COMMIT_no_enospc|flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto err;

	trace_and_count(c, btree_node_merge, trans, b);

	n = bch2_btree_node_alloc(as, trans, b->c.level);

	SET_BTREE_NODE_SEQ(n->data,
			   max(BTREE_NODE_SEQ(b->data),
			       BTREE_NODE_SEQ(m->data)) + 1);

	btree_set_min(n, prev->data->min_key);
	btree_set_max(n, next->data->max_key);

	n->data->format = new_f;
	btree_node_set_format(n, new_f);
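
	/*
	 * Copy the contents of both siblings into the new node, repacking to
	 * the combined format computed above:
	 */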
	bch2_btree_sort_into(c, n, prev);
	bch2_btree_sort_into(c, n, next);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = bch2_path_get_unlocked_mut(trans, btree, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, trans->paths + new_path, n);

	bkey_init(&delete.k);
	delete.k.p = prev->key.k.p;
	bch2_keylist_add(&as->parent_keys, &delete);
	bch2_keylist_add(&as->parent_keys, &n->key);

	bch2_trans_verify_paths(trans);

	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
	if (ret)
		goto err_free_update;

	bch2_btree_interior_update_will_free_node(as, b);
	bch2_btree_interior_update_will_free_node(as, m);

	bch2_trans_verify_paths(trans);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, trans->paths + path, b);
	bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);

	bch2_trans_node_add(trans, trans->paths + path, n);

	bch2_trans_verify_paths(trans);

	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_path_put(trans, sib_path, true);
	bch2_trans_verify_locks(trans);
	if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
		ret = 0;
	if (!ret)
		ret = bch2_trans_relock(trans);
	return ret;
err_free_update:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}
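
/*
 * Replace @b with a newly allocated node containing the same keys: the new
 * node's key is inserted into the parent (or it becomes the new root), it's
 * written out, and the old node is freed.
 */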
int bch2_btree_node_rewrite(struct btree_trans *trans,
			    struct btree_iter *iter,
			    struct btree *b,
			    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *n, *parent;
	struct btree_update *as;
	btree_path_idx_t new_path = 0;
	int ret;

	flags |= BCH_TRANS_COMMIT_no_enospc;

	struct btree_path *path = btree_iter_path(trans, iter);
	parent = btree_node_parent(path, b);
	as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto out;

	n = bch2_btree_node_alloc_replacement(as, trans, b);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = bch2_path_get_unlocked_mut(trans, iter->btree_id, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, trans->paths + new_path, n);

	trace_and_count(c, btree_node_rewrite, trans, b);

	if (parent) {
		bch2_keylist_add(&as->parent_keys, &n->key);
		ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
	} else {
		ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
	}

	if (ret)
		goto err;

	bch2_btree_interior_update_will_free_node(as, b);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);

	bch2_trans_node_add(trans, trans->paths + iter->path, n);
	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);
out:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_trans_downgrade(trans);
	return ret;
err:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}

struct async_btree_rewrite {
	struct bch_fs		*c;
	struct work_struct	work;
	struct list_head	list;
	enum btree_id		btree_id;
	unsigned		level;
	struct bkey_buf		key;
};

static int async_btree_node_rewrite_trans(struct btree_trans *trans,
					  struct async_btree_rewrite *a)
{
	struct btree_iter iter;
	bch2_trans_node_iter_init(trans, &iter,
				  a->btree_id, a->key.k->k.p,
				  BTREE_MAX_DEPTH, a->level, 0);
	struct btree *b = bch2_btree_iter_peek_node(&iter);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		goto out;

	bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
	ret = found
		? bch2_btree_node_rewrite(trans, &iter, b, 0)
		: -ENOENT;

#if 0
	/* Tracepoint... */
	if (!ret || ret == -ENOENT) {
		struct bch_fs *c = trans->c;
		struct printbuf buf = PRINTBUF;

		if (!ret) {
			prt_printf(&buf, "rewrite node:\n ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
		} else {
			prt_printf(&buf, "node to rewrite not found:\n want: ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
			prt_printf(&buf, "\n got: ");
			if (b)
				bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
			else
				prt_str(&buf, "(null)");
		}
		bch_info(c, "%s", buf.buf);
		printbuf_exit(&buf);
	}
#endif
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static void async_btree_node_rewrite_work(struct work_struct *work)
{
	struct async_btree_rewrite *a =
		container_of(work, struct async_btree_rewrite, work);
	struct bch_fs *c = a->c;

	int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
	if (ret != -ENOENT)
		bch_err_fn_ratelimited(c, ret);

	spin_lock(&c->btree_node_rewrites_lock);
	list_del(&a->list);
	spin_unlock(&c->btree_node_rewrites_lock);

	closure_wake_up(&c->btree_node_rewrites_wait);

	bch2_bkey_buf_exit(&a->key, c);
	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
	kfree(a);
}
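
/*
 * Queue an asynchronous rewrite of @b. If recovery has progressed past
 * journal replay and we can take a node_rewrite write ref, the work item is
 * queued immediately; if the filesystem isn't allowed to go read-write yet,
 * it's stashed on the pending list for bch2_do_pending_node_rewrites();
 * otherwise it's dropped.
 */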
void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
	struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a)
		return;

	a->c = c;
	a->btree_id = b->c.btree_id;
	a->level = b->c.level;
	INIT_WORK(&a->work, async_btree_node_rewrite_work);

	bch2_bkey_buf_init(&a->key);
	bch2_bkey_buf_copy(&a->key, c, &b->key);

	bool now = false, pending = false;

	spin_lock(&c->btree_node_rewrites_lock);
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
	    bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
		list_add(&a->list, &c->btree_node_rewrites);
		now = true;
	} else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
		list_add(&a->list, &c->btree_node_rewrites_pending);
		pending = true;
	}
	spin_unlock(&c->btree_node_rewrites_lock);

	if (now) {
		queue_work(c->btree_node_rewrite_worker, &a->work);
	} else if (pending) {
		/* bch2_do_pending_node_rewrites will execute */
	} else {
		bch2_bkey_buf_exit(&a->key, c);
		kfree(a);
	}
}

void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
{
	closure_wait_event(&c->btree_node_rewrites_wait,
			   list_empty(&c->btree_node_rewrites));
}

void bch2_do_pending_node_rewrites(struct bch_fs *c)
{
	while (1) {
		spin_lock(&c->btree_node_rewrites_lock);
		struct async_btree_rewrite *a =
			list_pop_entry(&c->btree_node_rewrites_pending,
				       struct async_btree_rewrite, list);
		if (a)
			list_add(&a->list, &c->btree_node_rewrites);
		spin_unlock(&c->btree_node_rewrites_lock);

		if (!a)
			break;

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
		queue_work(c->btree_node_rewrite_worker, &a->work);
	}
}

void bch2_free_pending_node_rewrites(struct bch_fs *c)
{
	while (1) {
		spin_lock(&c->btree_node_rewrites_lock);
		struct async_btree_rewrite *a =
			list_pop_entry(&c->btree_node_rewrites_pending,
				       struct async_btree_rewrite, list);
		spin_unlock(&c->btree_node_rewrites_lock);

		if (!a)
			break;

		bch2_bkey_buf_exit(&a->key, c);
		kfree(a);
	}
}

static int __bch2_btree_node_update_key(struct btree_trans *trans,
					struct btree_iter *iter,
					struct btree *b, struct btree *new_hash,
					struct bkey_i *new_key,
					unsigned commit_flags,
					bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter2 = { NULL };
	struct btree *parent;
	int ret;

	if (!skip_triggers) {
		ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
					   bkey_i_to_s_c(&b->key),
					   BTREE_TRIGGER_transactional) ?:
			bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
					     bkey_i_to_s(new_key),
					     BTREE_TRIGGER_transactional);
		if (ret)
			return ret;
	}

	if (new_hash) {
		bkey_copy(&new_hash->key, new_key);
		ret = bch2_btree_node_hash_insert(&c->btree_cache,
						  new_hash, b->c.level, b->c.btree_id);
		BUG_ON(ret);
	}

	parent = btree_node_parent(btree_iter_path(trans, iter), b);
	if (parent) {
		bch2_trans_copy_iter(&iter2, iter);

		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
						      iter2.flags & BTREE_ITER_intent,
						      _THIS_IP_);

		struct btree_path *path2 = btree_iter_path(trans, &iter2);
		BUG_ON(path2->level != b->c.level);
		BUG_ON(!bpos_eq(path2->pos, new_key->k.p));

		btree_path_set_level_up(trans, path2);

		trans->paths_sorted = false;

		ret = bch2_btree_iter_traverse(&iter2) ?:
			bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
		if (ret)
			goto err;
	} else {
		BUG_ON(btree_node_root(c, b) != b);

		struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
						jset_u64s(new_key->k.u64s));
		ret = PTR_ERR_OR_ZERO(e);
		if (ret)
			return ret;

		journal_entry_set(e,
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  new_key, new_key->k.u64s);
	}

	ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
	if (ret)
		goto err;

	bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
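
	/*
	 * The btree node cache hashes nodes by their key, so changing @b's key
	 * means removing @b (and the placeholder, if any) from the hash table
	 * and re-inserting @b under the new key:
	 */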
	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);

		__bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new_key);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);
		mutex_unlock(&c->btree_cache.lock);
	} else {
		bkey_copy(&b->key, new_key);
	}

	bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
out:
	bch2_trans_iter_exit(trans, &iter2);
	return ret;
err:
	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, b);
		mutex_unlock(&c->btree_cache.lock);
	}
	goto out;
}
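
/*
 * Update @b's key. If the new key hashes differently from the current one, we
 * take the btree cache cannibalize lock and preallocate a spare node to stand
 * in as the new hash table entry while the update is in flight; it's returned
 * to the freelist afterwards.
 */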
int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
			       struct btree *b, struct bkey_i *new_key,
			       unsigned commit_flags, bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree *new_hash = NULL;
	struct btree_path *path = btree_iter_path(trans, iter);
	struct closure cl;
	int ret = 0;

	ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
	if (ret)
		return ret;

	closure_init_stack(&cl);

	/*
	 * check btree_ptr_hash_val() after @b is locked by
	 * btree_iter_traverse():
	 */
	if (btree_ptr_hash_val(new_key) != b->hash_val) {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		if (ret) {
			ret = drop_locks_do(trans, (closure_sync(&cl), 0));
			if (ret)
				return ret;
		}

		new_hash = bch2_btree_node_mem_alloc(trans, false);
		ret = PTR_ERR_OR_ZERO(new_hash);
		if (ret)
			goto err;
	}

	path->intent_ref++;
	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
					   commit_flags, skip_triggers);
	--path->intent_ref;

	if (new_hash)
		bch2_btree_node_to_freelist(c, new_hash);
err:
	closure_sync(&cl);
	bch2_btree_cache_cannibalize_unlock(trans);
	return ret;
}

int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
					struct btree *b, struct bkey_i *new_key,
					unsigned commit_flags, bool skip_triggers)
{
	struct btree_iter iter;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
				  BTREE_MAX_DEPTH, b->c.level,
				  BTREE_ITER_intent);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	/* has node been freed? */
	if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
		/* node has been freed: */
		BUG_ON(!btree_node_dying(b));
		goto out;
	}

	BUG_ON(!btree_node_hashed(b));

	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));

	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
					 commit_flags, skip_triggers);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Init code: */

/*
 * Only for filesystem bringup, when first reading the btree roots or allocating
 * btree roots when initializing a new filesystem:
 */
void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
{
	BUG_ON(btree_node_root(c, b));

	bch2_btree_set_root_inmem(c, b);
}

int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, false);
	bch2_btree_cache_cannibalize_unlock(trans);

	ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		return ret;

	set_btree_node_fake(b);
	set_btree_node_need_rewrite(b);
	b->c.level = level;
	b->c.btree_id = id;

	bkey_btree_ptr_init(&b->key);
	b->key.k.p = SPOS_MAX;
	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;

	bch2_bset_init_first(b, &b->data->keys);
	bch2_btree_build_aux_trees(b);

	b->data->flags = 0;
	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);
	btree_node_set_format(b, b->data->format);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
					  b->c.level, b->c.btree_id);
	BUG_ON(ret);

	bch2_btree_set_root_inmem(c, b);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
	return 0;
}

void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
	bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)));
}

static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
	prt_printf(out, "%ps: ", (void *) as->ip_started);
	bch2_trans_commit_flags_to_text(out, as->flags);

	prt_str(out, " ");
	bch2_btree_id_to_text(out, as->btree_id);
	prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
		   as->update_level_start,
		   as->update_level_end,
		   bch2_btree_update_modes[as->mode],
		   as->nodes_written,
		   closure_nr_remaining(&as->cl),
		   as->journal.seq);
}

void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct btree_update *as;

	mutex_lock(&c->btree_interior_update_lock);
	list_for_each_entry(as, &c->btree_interior_update_list, list)
		bch2_btree_update_to_text(out, as);
	mutex_unlock(&c->btree_interior_update_lock);
}

static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->btree_interior_update_lock);
	ret = !list_empty(&c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	return ret;
}

bool bch2_btree_interior_updates_flush(struct bch_fs *c)
{
	bool ret = bch2_btree_interior_updates_pending(c);

	if (ret)
		closure_wait_event(&c->btree_interior_update_wait,
				   !bch2_btree_interior_updates_pending(c));
	return ret;
}

void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
{
	struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);

	mutex_lock(&c->btree_root_lock);

	r->level = entry->level;
	r->alive = true;
	bkey_copy(&r->key, (struct bkey_i *) entry->start);

	mutex_unlock(&c->btree_root_lock);
}
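
/*
 * Append a btree_root journal entry for every btree root that's alive, except
 * those whose btree id is set in the @skip bitmask; returns the new end of
 * the journal entry buffer.
 */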
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
				    struct jset_entry *end,
				    unsigned long skip)
{
	unsigned i;

	mutex_lock(&c->btree_root_lock);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->alive && !test_bit(i, &skip)) {
			journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
					  i, r->level, &r->key, r->key.k.u64s);
			end = vstruct_next(end);
		}
	}

	mutex_unlock(&c->btree_root_lock);

	return end;
}

static void bch2_btree_alloc_to_text(struct printbuf *out,
				     struct bch_fs *c,
				     struct btree_alloc *a)
{
	printbuf_indent_add(out, 2);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&a->k));
	prt_newline(out);

	struct open_bucket *ob;
	unsigned i;
	open_bucket_for_each(c, &a->ob, ob, i)
		bch2_open_bucket_to_text(out, c, ob);

	printbuf_indent_sub(out, 2);
}

void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)
{
	for (unsigned i = 0; i < c->btree_reserve_cache_nr; i++)
		bch2_btree_alloc_to_text(out, c, &c->btree_reserve_cache[i]);
}

void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
	WARN_ON(!list_empty(&c->btree_node_rewrites));
	WARN_ON(!list_empty(&c->btree_node_rewrites_pending));

	if (c->btree_node_rewrite_worker)
		destroy_workqueue(c->btree_node_rewrite_worker);
	if (c->btree_interior_update_worker)
		destroy_workqueue(c->btree_interior_update_worker);
	mempool_exit(&c->btree_interior_update_pool);
}

void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
{
	mutex_init(&c->btree_reserve_cache_lock);
	INIT_LIST_HEAD(&c->btree_interior_update_list);
	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
	mutex_init(&c->btree_interior_update_lock);
	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);

	INIT_LIST_HEAD(&c->btree_node_rewrites);
	INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
	spin_lock_init(&c->btree_node_rewrites_lock);
}

int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
	c->btree_interior_update_worker =
		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
	if (!c->btree_interior_update_worker)
		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

	c->btree_node_rewrite_worker =
		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
	if (!c->btree_node_rewrite_worker)
		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
				      sizeof(struct btree_update)))
		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;

	return 0;
}