// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "snapshot.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/ioprio.h>

static const char * const bch2_data_update_type_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DATA_UPDATE_TYPES()
#undef x
	NULL
};

static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr)
		bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
}

static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr) {
		if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
			bkey_for_each_ptr(ptrs, ptr2) {
				if (ptr2 == ptr)
					break;
				bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
			}
			return false;
		}
	}
	return true;
}

static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

		bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
	}
}

static noinline_for_stack
bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
		       const struct bch_extent_ptr *start)
{
	if (!ctxt) {
		bkey_for_each_ptr(ptrs, ptr) {
			if (ptr == start)
				break;

			struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
			struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
			bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
		}
		return false;
	}

	__bkey_for_each_ptr(start, ptrs.end, ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

		bool locked;
		move_ctxt_wait_event(ctxt,
			(locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
			list_empty(&ctxt->ios));
		if (!locked)
			bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
	}
	return true;
}

static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
{
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

		if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
			return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
	}

	return true;
}

noinline_for_stack
static void trace_io_move_finish2(struct data_update *u,
				  struct bkey_i *new,
				  struct bkey_i *insert)
{
	struct bch_fs *c = u->op.c;
	struct printbuf buf = PRINTBUF;

	prt_newline(&buf);

	bch2_data_update_to_text(&buf, u);
	prt_newline(&buf);

	prt_str_indented(&buf, "new replicas:\t");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
	prt_newline(&buf);

	prt_str_indented(&buf, "insert:\t");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
	prt_newline(&buf);

	trace_io_move_finish(c, buf.buf);
	printbuf_exit(&buf);
}

noinline_for_stack
static void trace_io_move_fail2(struct data_update *m,
				struct bkey_s_c new,
				struct bkey_s_c wrote,
				struct bkey_i *insert,
				const char *msg)
{
	struct bch_fs *c = m->op.c;
	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
	struct printbuf buf = PRINTBUF;
	unsigned rewrites_found = 0;

	if (!trace_io_move_fail_enabled())
		return;

	prt_str(&buf, msg);

	if (insert) {
		const union bch_extent_entry *entry;
		struct bch_extent_ptr *ptr;
		struct extent_ptr_decoded p;

		unsigned ptr_bit = 1;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
			if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached)
				rewrites_found |= ptr_bit;
			ptr_bit <<= 1;
		}
	}

	prt_str(&buf, "rewrites found:\t");
	bch2_prt_u64_base2(&buf, rewrites_found);
	prt_newline(&buf);

	bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);

	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);

	prt_str(&buf, "\nnew: ");
	bch2_bkey_val_to_text(&buf, c, new);

	prt_str(&buf, "\nwrote: ");
	bch2_bkey_val_to_text(&buf, c, wrote);

	if (insert) {
		prt_str(&buf, "\ninsert: ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
	}

	trace_io_move_fail(c, buf.buf);
	printbuf_exit(&buf);
}

noinline_for_stack
static void trace_data_update2(struct data_update *m,
			       struct bkey_s_c old, struct bkey_s_c k,
			       struct bkey_i *insert)
{
	struct bch_fs *c = m->op.c;
	struct printbuf buf = PRINTBUF;

	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);
	prt_str(&buf, "\nk: ");
	bch2_bkey_val_to_text(&buf, c, k);
	prt_str(&buf, "\nnew: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));

	trace_data_update(c, buf.buf);
	printbuf_exit(&buf);
}

noinline_for_stack
static void trace_io_move_created_rebalance2(struct data_update *m,
					     struct bkey_s_c old, struct bkey_s_c k,
					     struct bkey_i *insert)
{
	struct bch_fs *c = m->op.c;
	struct printbuf buf = PRINTBUF;

	bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);

	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);
	prt_str(&buf, "\nk: ");
	bch2_bkey_val_to_text(&buf, c, k);
	prt_str(&buf, "\nnew: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));

	trace_io_move_created_rebalance(c, buf.buf);
	printbuf_exit(&buf);

	this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]);
}

noinline_for_stack
static int data_update_invalid_bkey(struct data_update *m,
				    struct bkey_s_c old, struct bkey_s_c k,
				    struct bkey_i *insert)
{
	struct bch_fs *c = m->op.c;
	struct printbuf buf = PRINTBUF;
	bch2_log_msg_start(c, &buf);

	prt_str(&buf, "about to insert invalid key in data update path");
	prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);
	prt_str(&buf, "\nk: ");
	bch2_bkey_val_to_text(&buf, c, k);
	prt_str(&buf, "\nnew: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));

	bch2_fs_emergency_read_only2(c, &buf);

	bch2_print_str(c, KERN_ERR, buf.buf);
	printbuf_exit(&buf);

	return bch_err_throw(c, invalid_bkey);
}

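/*
 * Index update for a data move, called from the write path once the new
 * replicas have been written.  A summary of the steps below:
 *
 *  - re-check that the extent in the btree still matches what we read
 *  - mark the pointers we rewrote as cached in @insert
 *  - drop newly written replicas that conflict with replicas we're keeping,
 *    and existing pointers on devices we just wrote to
 *  - mark excess replicas cached, while total durability stays at or above
 *    opts.data_replicas
 *  - append the new pointers and commit, one insert_keys entry at a time
 */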
static int __bch2_data_update_index_update(struct btree_trans *trans,
					   struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_iter iter;
	struct data_update *m = container_of(op, struct data_update, op);
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, m->btree_id,
			     bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k),
			     BTREE_ITER_slots|BTREE_ITER_intent);

	while (1) {
		struct bkey_s_c k;
		struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
		struct bkey_i *insert = NULL;
		struct bkey_i_extent *new;
		const union bch_extent_entry *entry_c;
		union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		struct bch_extent_ptr *ptr;
		const struct bch_extent_ptr *ptr_c;
		struct bpos next_pos;
		bool should_check_enospc;
		s64 i_sectors_delta = 0, disk_sectors_delta = 0;
		unsigned rewrites_found = 0, durability, ptr_bit;

		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys));

		if (!bch2_extents_match(k, old)) {
			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
					    NULL, "no match:");
			goto nowork;
		}

		insert = bch2_trans_kmalloc(trans,
					    bkey_bytes(k.k) +
					    bkey_val_bytes(&new->k) +
					    sizeof(struct bch_extent_rebalance));
		ret = PTR_ERR_OR_ZERO(insert);
		if (ret)
			goto err;

		bkey_reassemble(insert, k);

		new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto err;

		bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys));
		bch2_cut_front(iter.pos, &new->k_i);

		bch2_cut_front(iter.pos, insert);
		bch2_cut_back(new->k.p, insert);
		bch2_cut_back(insert->k.p, &new->k_i);

		/*
		 * @old: extent that we read from
		 * @insert: key that we're going to update, initialized from
		 * extent currently in btree - same as @old unless we raced with
		 * other updates
		 * @new: extent with new pointers that we'll be adding to @insert
		 *
		 * First, drop rewrite_ptrs from @new:
		 */
		ptr_bit = 1;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
			if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached) {
				bch2_extent_ptr_set_cached(c, &m->op.opts,
							   bkey_i_to_s(insert), ptr);
				rewrites_found |= ptr_bit;
			}
			ptr_bit <<= 1;
		}

		if (m->data_opts.rewrite_ptrs &&
		    !rewrites_found &&
		    bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
			goto nowork;
		}

		/*
		 * A replica that we just wrote might conflict with a replica
		 * that we want to keep, due to racing with another move:
		 */
restart_drop_conflicting_replicas:
		extent_for_each_ptr(extent_i_to_s(new), ptr)
			if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
			    !ptr_c->cached) {
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
				goto restart_drop_conflicting_replicas;
			}

		if (!bkey_val_u64s(&new->k)) {
			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
			goto nowork;
		}

		/* Now, drop pointers that conflict with what we just wrote: */
		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
			if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);

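		/*
		 * Durability accounting for the replica-dropping loop below:
		 * @durability starts as the combined durability of @insert and
		 * @new; a non-cached pointer is only demoted to cached if the
		 * total stays at or above opts.data_replicas.  E.g. with
		 * data_replicas=2 and three durability=1 replicas, exactly one
		 * pointer ends up marked cached.
		 */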
		durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
			bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));

		/* Now, drop excess replicas: */
		scoped_guard(rcu) {
restart_drop_extra_replicas:
			bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
				unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);

				if (!p.ptr.cached &&
				    durability - ptr_durability >= m->op.opts.data_replicas) {
					durability -= ptr_durability;

					bch2_extent_ptr_set_cached(c, &m->op.opts,
								   bkey_i_to_s(insert), &entry->ptr);
					goto restart_drop_extra_replicas;
				}
			}
		}

		/* Finally, add the pointers we just wrote: */
		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
			bch2_extent_ptr_decoded_append(insert, &p);

		bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
		bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));

		ret = bch2_sum_sector_overwrites(trans, &iter, insert,
						 &should_check_enospc,
						 &i_sectors_delta,
						 &disk_sectors_delta);
		if (ret)
			goto err;

		if (disk_sectors_delta > (s64) op->res.sectors) {
			ret = bch2_disk_reservation_add(c, &op->res,
						disk_sectors_delta - op->res.sectors,
						!should_check_enospc
						? BCH_DISK_RESERVATION_NOFAIL : 0);
			if (ret)
				goto out;
		}

		next_pos = insert->k.p;

		/*
		 * Check for nonce offset inconsistency:
		 * This is debug code - we've been seeing this bug rarely, and
		 * it's been hard to reproduce, so this should give us some more
		 * information when it does occur:
		 */
		int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
						 (struct bkey_validate_context) {
							.btree	= m->btree_id,
							.flags	= BCH_VALIDATE_commit,
						 });
		if (unlikely(invalid)) {
			ret = data_update_invalid_bkey(m, old, k, insert);
			goto out;
		}

		ret =   bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
			bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, bkey_start_pos(&insert->k)) ?:
			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, insert->k.p) ?:
			bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
			bch2_trans_update(trans, &iter, insert,
				BTREE_UPDATE_internal_snapshot_node);
		if (ret)
			goto err;

		if (trace_data_update_enabled())
			trace_data_update2(m, old, k, insert);

		if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
		    bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
			trace_io_move_created_rebalance2(m, old, k, insert);

		ret = bch2_trans_commit(trans, &op->res,
				NULL,
				BCH_TRANS_COMMIT_no_check_rw|
				BCH_TRANS_COMMIT_no_enospc|
				m->data_opts.btree_insert_flags);
		if (ret)
			goto err;

		bch2_btree_iter_set_pos(trans, &iter, next_pos);

		this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
		if (trace_io_move_finish_enabled())
			trace_io_move_finish2(m, &new->k_i, insert);
err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
		if (ret)
			break;
next:
		while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) {
			bch2_keylist_pop_front(&op->insert_keys);
			if (bch2_keylist_empty(&op->insert_keys))
				goto out;
		}
		continue;
nowork:
		if (m->stats) {
			BUG_ON(k.k->p.offset <= iter.pos.offset);
			atomic64_inc(&m->stats->keys_raced);
			atomic64_add(k.k->p.offset - iter.pos.offset,
				     &m->stats->sectors_raced);
		}

		count_event(c, io_move_fail);

		bch2_btree_iter_advance(trans, &iter);
		goto next;
	}
out:
	bch2_trans_iter_exit(trans, &iter);
	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
	return ret;
}

int bch2_data_update_index_update(struct bch_write_op *op)
{
	return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
}

void bch2_data_update_read_done(struct data_update *m)
{
	m->read_done = true;

	/* write bio must own pages: */
	BUG_ON(!m->op.wbio.bio.bi_vcnt);

	m->op.crc = m->rbio.pick.crc;
	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;

	this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);

	closure_call(&m->op.cl, bch2_write, NULL, NULL);
}

void bch2_data_update_exit(struct data_update *update)
{
	struct bch_fs *c = update->op.c;
	struct bkey_s_c k = bkey_i_to_s_c(update->k.k);

	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
	kfree(update->bvecs);
	update->bvecs = NULL;

	if (c->opts.nocow_enabled)
		bkey_nocow_unlock(c, k);
	bkey_put_dev_refs(c, k);
	bch2_disk_reservation_put(c, &update->op.res);
	bch2_bkey_buf_exit(&update->k, c);
}

static noinline_for_stack
int bch2_update_unwritten_extent(struct btree_trans *trans,
				 struct data_update *update)
{
	struct bch_fs *c = update->op.c;
	struct bkey_i_extent *e;
	struct write_point *wp;
	struct closure cl;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	closure_init_stack(&cl);
	bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);

	while (bpos_lt(update->op.pos, update->k.k->k.p)) {
		unsigned sectors = update->k.k->k.p.offset -
			update->op.pos.offset;

		bch2_trans_begin(trans);

		bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
				     BTREE_ITER_slots);
		ret = lockrestart_do(trans, ({
			k = bch2_btree_iter_peek_slot(trans, &iter);
			bkey_err(k);
		}));
		bch2_trans_iter_exit(trans, &iter);

		if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
			break;

		e = bkey_extent_init(update->op.insert_keys.top);
		e->k.p = update->op.pos;

		ret = bch2_alloc_sectors_start_trans(trans,
				update->op.target,
				false,
				update->op.write_point,
				&update->op.devs_have,
				update->op.nr_replicas,
				update->op.nr_replicas,
				update->op.watermark,
				0, &cl, &wp);
		if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
			bch2_trans_unlock(trans);
			closure_sync(&cl);
			continue;
		}

		bch_err_fn_ratelimited(c, ret);

		if (ret)
			break;

		sectors = min(sectors, wp->sectors_free);

		bch2_key_resize(&e->k, sectors);

		bch2_open_bucket_get(c, wp, &update->op.open_buckets);
		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
		bch2_alloc_sectors_done(c, wp);

		update->op.pos.offset += sectors;

		extent_for_each_ptr(extent_i_to_s(e), ptr)
			ptr->unwritten = true;
		bch2_keylist_push(&update->op.insert_keys);

		ret = __bch2_data_update_index_update(trans, &update->op);

		bch2_open_buckets_put(c, &update->op.open_buckets);

		if (ret)
			break;
	}

	if (closure_nr_remaining(&cl) != 1) {
		bch2_trans_unlock(trans);
		closure_sync(&cl);
	}

	return ret;
}

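/*
 * Debug/introspection output: the options formatter below is shared by the
 * tracepoint helpers above and by bch2_data_update_inflight_to_text().
 */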
void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 20);

	prt_str_indented(out, "rewrite ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
	prt_newline(out);

	prt_str_indented(out, "kill ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->kill_ptrs);
	prt_newline(out);

	prt_str_indented(out, "target:\t");
	bch2_target_to_text(out, c, data_opts->target);
	prt_newline(out);

	prt_str_indented(out, "compression:\t");
	bch2_compression_opt_to_text(out, io_opts->background_compression);
	prt_newline(out);

	prt_str_indented(out, "opts.replicas:\t");
	prt_u64(out, io_opts->data_replicas);
	prt_newline(out);

	prt_str_indented(out, "extra replicas:\t");
	prt_u64(out, data_opts->extra_replicas);
	prt_newline(out);

	prt_str_indented(out, "scrub:\t");
	prt_u64(out, data_opts->scrub);
}

void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
{
	prt_str(out, bch2_data_update_type_strs[m->type]);
	prt_newline(out);

	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
	prt_newline(out);

	prt_str_indented(out, "old key:\t");
	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
}

void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
{
	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
	prt_newline(out);
	printbuf_indent_add(out, 2);
	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);

	if (!m->read_done) {
		prt_printf(out, "read:\n");
		printbuf_indent_add(out, 2);
		bch2_read_bio_to_text(out, &m->rbio);
	} else {
		prt_printf(out, "write:\n");
		printbuf_indent_add(out, 2);
		bch2_write_op_to_text(out, &m->op);
	}
	printbuf_indent_sub(out, 4);
}

int bch2_extent_drop_ptrs(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k,
			  struct bch_io_opts *io_opts,
			  struct data_update_opts *data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_i *n;
	int ret;

	n = bch2_bkey_make_mut_noupdate(trans, k);
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	while (data_opts->kill_ptrs) {
		unsigned i = 0, drop = __fls(data_opts->kill_ptrs);

		bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
		data_opts->kill_ptrs ^= 1U << drop;
	}

	/*
	 * If the new extent no longer has any pointers, bch2_extent_normalize()
	 * will do the appropriate thing with it (turning it into a
	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
	 */
	bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));

	/*
	 * Since we're not inserting through an extent iterator
	 * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
	 * we aren't using the extent overwrite path to delete, we're
	 * just using the normal key deletion path:
	 */
	if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
		n->k.size = 0;

	return bch2_trans_relock(trans) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}

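/*
 * Bio setup for a data update: the read bio and the write bio share the same
 * bvec array, but only the write bio owns pages (freed via the pages pool in
 * bch2_data_update_exit()).  @buf_bytes is the uncompressed size, since the
 * write path might have to decompress the data.
 */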
static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
					struct bch_io_opts *io_opts,
					unsigned buf_bytes)
{
	unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);

	m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
	if (!m->bvecs)
		return -ENOMEM;

	bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
	bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);

	if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
		kfree(m->bvecs);
		m->bvecs = NULL;
		return -ENOMEM;
	}

	rbio_init(&m->rbio.bio, c, *io_opts, NULL);
	m->rbio.data_update = true;
	m->rbio.bio.bi_iter.bi_size = buf_bytes;
	m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
	m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
	return 0;
}

int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
			       struct bch_io_opts *io_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	/* write path might have to decompress data: */
	unsigned buf_bytes = 0;
	bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);

	return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
}

static int can_write_extent(struct bch_fs *c, struct data_update *m)
{
	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
	    unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
		return bch_err_throw(c, data_update_done_would_block);

	unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
		? m->op.target
		: 0;
	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);

	darray_for_each(m->op.devs_have, i)
		__clear_bit(*i, devs.d);

	guard(rcu)();

	unsigned nr_replicas = 0, i;
	for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
		struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
		if (!ca)
			continue;

		struct bch_dev_usage usage;
		bch2_dev_usage_read_fast(ca, &usage);

		if (!dev_buckets_free(ca, usage, m->op.watermark))
			continue;

		nr_replicas += ca->mi.durability;
		if (nr_replicas >= m->op.nr_replicas)
			break;
	}

	if (!nr_replicas)
		return bch_err_throw(c, data_update_done_no_rw_devs);
	if (nr_replicas < m->op.nr_replicas)
		return bch_err_throw(c, insufficient_devices);
	return 0;
}

int bch2_data_update_init(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct moving_context *ctxt,
			  struct data_update *m,
			  struct write_point_specifier wp,
			  struct bch_io_opts *io_opts,
			  struct data_update_opts data_opts,
			  enum btree_id btree_id,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	if (k.k->p.snapshot) {
		ret = bch2_check_key_has_snapshot(trans, iter, k);
		if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
			/* Can't repair yet, waiting on other recovery passes */
			return bch_err_throw(c, data_update_done_no_snapshot);
		}
		if (ret < 0)
			return ret;
		if (ret) /* key was deleted */
			return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
				bch_err_throw(c, data_update_done_no_snapshot);
		ret = 0;
	}

	bch2_bkey_buf_init(&m->k);
	bch2_bkey_buf_reassemble(&m->k, c, k);
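	/*
	 * m->type feeds the journal log string and debug output via
	 * bch2_data_update_type_strs; copygc is distinguished from rebalance
	 * by the watermark bits in btree_insert_flags:
	 */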
	m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc
		? BCH_DATA_UPDATE_copygc
		: BCH_DATA_UPDATE_rebalance;
	m->btree_id	= btree_id;
	m->data_opts	= data_opts;
	m->ctxt		= ctxt;
	m->stats	= ctxt ? ctxt->stats : NULL;

	bch2_write_op_init(&m->op, c, *io_opts);
	m->op.pos	= bkey_start_pos(k.k);
	m->op.version	= k.k->bversion;
	m->op.target	= data_opts.target;
	m->op.write_point = wp;
	m->op.nr_replicas = 0;
	m->op.flags	|= BCH_WRITE_pages_stable|
		BCH_WRITE_pages_owned|
		BCH_WRITE_data_encoded|
		BCH_WRITE_move|
		m->data_opts.write_flags;
	m->op.compression_opt	= io_opts->background_compression;
	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;

	unsigned durability_have = 0, durability_removing = 0;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
	unsigned buf_bytes = 0;
	bool unwritten = false;

	unsigned ptr_bit = 1;
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (!p.ptr.cached) {
			guard(rcu)();
			if (ptr_bit & m->data_opts.rewrite_ptrs) {
				if (crc_is_compressed(p.crc))
					reserve_sectors += k.k->size;

				m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
				durability_removing += bch2_extent_ptr_desired_durability(c, &p);
			} else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
				bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
				durability_have += bch2_extent_ptr_durability(c, &p);
			}
		}

		/*
		 * op->csum_type is normally initialized from the fs/file's
		 * current options - but if an extent is encrypted, we require
		 * that it stays encrypted:
		 */
		if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
			m->op.nonce	= p.crc.nonce + p.crc.offset;
			m->op.csum_type = p.crc.csum_type;
		}

		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
			m->op.incompressible = true;

		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
		unwritten |= p.ptr.unwritten;

		ptr_bit <<= 1;
	}

	unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));

	/*
	 * If current extent durability is less than io_opts.data_replicas,
	 * we're not trying to rereplicate the extent up to data_replicas here -
	 * unless extra_replicas was specified
	 *
	 * Increasing replication is an explicit operation triggered by
	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
	 */
	m->op.nr_replicas = min(durability_removing, durability_required) +
		m->data_opts.extra_replicas;

	/*
	 * If device(s) were set to durability=0 after data was written to them
	 * we can end up with a durability=0 extent, and the normal algorithm
	 * that tries not to increase durability doesn't work:
	 */
	if (!(durability_have + durability_removing))
		m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);

	m->op.nr_replicas_required = m->op.nr_replicas;

	/*
	 * It might turn out that we don't need any new replicas, if the
	 * replicas or durability settings have been changed since the extent
	 * was written:
	 */
	if (!m->op.nr_replicas) {
		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
		m->data_opts.rewrite_ptrs = 0;
		/* if iter == NULL, it's just a promote */
		if (iter)
			ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
		if (!ret)
			ret = bch_err_throw(c, data_update_done_no_writes_needed);
		goto out_bkey_buf_exit;
	}

	/*
	 * Check if the allocation will succeed, to avoid getting an error later
	 * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
	 * read:
	 *
	 * This guards against
	 * - BCH_WRITE_alloc_nowait allocations failing (promotes)
	 * - Destination target full
	 * - Device(s) in destination target offline
	 * - Insufficient durability available in destination target
	 *   (i.e. trying to move a durability=2 replica to a target with a
	 *   single durability=2 device)
	 */
	ret = can_write_extent(c, m);
	if (ret)
		goto out_bkey_buf_exit;

	if (reserve_sectors) {
		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
				m->data_opts.extra_replicas
				? 0
				: BCH_DISK_RESERVATION_NOFAIL);
		if (ret)
			goto out_bkey_buf_exit;
	}

	if (!bkey_get_dev_refs(c, k)) {
		ret = bch_err_throw(c, data_update_done_no_dev_refs);
		goto out_put_disk_res;
	}

	if (c->opts.nocow_enabled &&
	    !bkey_nocow_lock(c, ctxt, ptrs)) {
		ret = bch_err_throw(c, nocow_lock_blocked);
		goto out_put_dev_refs;
	}

	if (unwritten) {
		ret = bch2_update_unwritten_extent(trans, m) ?:
			bch_err_throw(c, data_update_done_unwritten);
		goto out_nocow_unlock;
	}

	bch2_trans_unlock(trans);

	ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
	if (ret)
		goto out_nocow_unlock;

	return 0;
out_nocow_unlock:
	if (c->opts.nocow_enabled)
		bkey_nocow_unlock(c, k);
out_put_dev_refs:
	bkey_put_dev_refs(c, k);
out_put_disk_res:
	bch2_disk_reservation_put(c, &m->op.res);
out_bkey_buf_exit:
	bch2_bkey_buf_exit(&m->k, c);
	return ret;
}

void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned ptr_bit = 1;

	bkey_for_each_ptr(ptrs, ptr) {
		if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
			opts->kill_ptrs |= ptr_bit;
			opts->rewrite_ptrs ^= ptr_bit;
		}

		ptr_bit <<= 1;
	}
}