// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "snapshot.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/ioprio.h>

static const char * const bch2_data_update_type_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DATA_UPDATE_TYPES()
#undef x
	NULL
};

static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr)
		bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
}

static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr) {
		if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
			bkey_for_each_ptr(ptrs, ptr2) {
				if (ptr2 == ptr)
					break;
				bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev));
			}
			return false;
		}
	}
	return true;
}

static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

		bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
	}
}

static noinline_for_stack
bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
		       const struct bch_extent_ptr *start)
{
	if (!ctxt) {
		bkey_for_each_ptr(ptrs, ptr) {
			if (ptr == start)
				break;

			struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
			struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
			bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
		}
		return false;
	}

	__bkey_for_each_ptr(start, ptrs.end, ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

		bool locked;
		move_ctxt_wait_event(ctxt,
			(locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
			list_empty(&ctxt->ios));
		if (!locked)
			bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
	}
	return true;
}

static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
{
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);

		if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
			return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
	}

	return true;
}
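
/*
 * Tracepoint helpers: each of these formats a human readable description of
 * the data update into a printbuf and emits it via the corresponding
 * tracepoint. They're marked noinline_for_stack, presumably to keep the
 * printbuf/formatting state off the callers' stacks; most callers also check
 * trace_*_enabled() first so the formatting work is skipped when the
 * tracepoint is off.
 */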
noinline_for_stack
static void trace_io_move_finish2(struct data_update *u,
				  struct bkey_i *new,
				  struct bkey_i *insert)
{
	struct bch_fs *c = u->op.c;
	struct printbuf buf = PRINTBUF;

	prt_newline(&buf);

	bch2_data_update_to_text(&buf, u);
	prt_newline(&buf);

	prt_str_indented(&buf, "new replicas:\t");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
	prt_newline(&buf);

	prt_str_indented(&buf, "insert:\t");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
	prt_newline(&buf);

	trace_io_move_finish(c, buf.buf);
	printbuf_exit(&buf);
}

noinline_for_stack
static void trace_io_move_fail2(struct data_update *m,
				struct bkey_s_c new,
				struct bkey_s_c wrote,
				struct bkey_i *insert,
				const char *msg)
{
	struct bch_fs *c = m->op.c;
	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
	struct printbuf buf = PRINTBUF;
	unsigned rewrites_found = 0;

	if (!trace_io_move_fail_enabled())
		return;

	prt_str(&buf, msg);

	if (insert) {
		const union bch_extent_entry *entry;
		struct bch_extent_ptr *ptr;
		struct extent_ptr_decoded p;

		unsigned ptr_bit = 1;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
			if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached)
				rewrites_found |= ptr_bit;
			ptr_bit <<= 1;
		}
	}

	prt_str(&buf, "rewrites found:\t");
	bch2_prt_u64_base2(&buf, rewrites_found);
	prt_newline(&buf);

	bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);

	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);

	prt_str(&buf, "\nnew: ");
	bch2_bkey_val_to_text(&buf, c, new);

	prt_str(&buf, "\nwrote: ");
	bch2_bkey_val_to_text(&buf, c, wrote);

	if (insert) {
		prt_str(&buf, "\ninsert: ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
	}

	trace_io_move_fail(c, buf.buf);
	printbuf_exit(&buf);
}

noinline_for_stack
static void trace_data_update2(struct data_update *m,
			       struct bkey_s_c old, struct bkey_s_c k,
			       struct bkey_i *insert)
{
	struct bch_fs *c = m->op.c;
	struct printbuf buf = PRINTBUF;

	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);
	prt_str(&buf, "\nk: ");
	bch2_bkey_val_to_text(&buf, c, k);
	prt_str(&buf, "\nnew: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));

	trace_data_update(c, buf.buf);
	printbuf_exit(&buf);
}

noinline_for_stack
static void trace_io_move_created_rebalance2(struct data_update *m,
					     struct bkey_s_c old, struct bkey_s_c k,
					     struct bkey_i *insert)
{
	struct bch_fs *c = m->op.c;
	struct printbuf buf = PRINTBUF;

	bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);

	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);
	prt_str(&buf, "\nk: ");
	bch2_bkey_val_to_text(&buf, c, k);
	prt_str(&buf, "\nnew: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));

	trace_io_move_created_rebalance(c, buf.buf);
	printbuf_exit(&buf);

	this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]);
}

noinline_for_stack
static int data_update_invalid_bkey(struct data_update *m,
				    struct bkey_s_c old, struct bkey_s_c k,
				    struct bkey_i *insert)
{
	struct bch_fs *c = m->op.c;
	struct printbuf buf = PRINTBUF;
	bch2_log_msg_start(c, &buf);

	prt_str(&buf, "about to insert invalid key in data update path");
	prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);
	prt_str(&buf, "\nk: ");
	bch2_bkey_val_to_text(&buf, c, k);
	prt_str(&buf, "\nnew: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
	prt_newline(&buf);

	bch2_fs_emergency_read_only2(c, &buf);

	bch2_print_str(c, KERN_ERR, buf.buf);
	printbuf_exit(&buf);

	return bch_err_throw(c, invalid_bkey);
}
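
/*
 * The index update half of a data update: after the new data has been
 * written, re-read the extent we moved, splice the newly written pointers
 * into it, and commit. If the extent changed while the move was in flight
 * (another write, another move), the affected range is skipped and accounted
 * via keys_raced/sectors_raced rather than overwriting the racing update.
 */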
static int __bch2_data_update_index_update(struct btree_trans *trans,
					   struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_iter iter;
	struct data_update *m = container_of(op, struct data_update, op);
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, m->btree_id,
			     bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k),
			     BTREE_ITER_slots|BTREE_ITER_intent);

	while (1) {
		struct bkey_s_c k;
		struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
		struct bkey_i *insert = NULL;
		struct bkey_i_extent *new;
		const union bch_extent_entry *entry_c;
		union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		struct bch_extent_ptr *ptr;
		const struct bch_extent_ptr *ptr_c;
		struct bpos next_pos;
		bool should_check_enospc;
		s64 i_sectors_delta = 0, disk_sectors_delta = 0;
		unsigned rewrites_found = 0, durability, ptr_bit;

		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek_slot(trans, &iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys));

		if (!bch2_extents_match(k, old)) {
			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
					    NULL, "no match:");
			goto nowork;
		}

		insert = bch2_trans_kmalloc(trans,
					    bkey_bytes(k.k) +
					    bkey_val_bytes(&new->k) +
					    sizeof(struct bch_extent_rebalance));
		ret = PTR_ERR_OR_ZERO(insert);
		if (ret)
			goto err;

		bkey_reassemble(insert, k);

		new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto err;

		bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys));
		bch2_cut_front(iter.pos, &new->k_i);

		bch2_cut_front(iter.pos, insert);
		bch2_cut_back(new->k.p, insert);
		bch2_cut_back(insert->k.p, &new->k_i);

		/*
		 * @old: extent that we read from
		 * @insert: key that we're going to update, initialized from
		 * extent currently in btree - same as @old unless we raced with
		 * other updates
		 * @new: extent with new pointers that we'll be adding to @insert
		 *
		 * First, drop rewrite_ptrs from @new:
		 */
		ptr_bit = 1;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
			if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached) {
				bch2_extent_ptr_set_cached(c, &m->op.opts,
							   bkey_i_to_s(insert), ptr);
				rewrites_found |= ptr_bit;
			}
			ptr_bit <<= 1;
		}

		if (m->data_opts.rewrite_ptrs &&
		    !rewrites_found &&
		    bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
			goto nowork;
		}

		/*
		 * A replica that we just wrote might conflict with a replica
		 * that we want to keep, due to racing with another move:
		 */
restart_drop_conflicting_replicas:
		extent_for_each_ptr(extent_i_to_s(new), ptr)
			if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) &&
			    !ptr_c->cached) {
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr);
				goto restart_drop_conflicting_replicas;
			}

		if (!bkey_val_u64s(&new->k)) {
			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
			goto nowork;
		}

		/* Now, drop pointers that conflict with what we just wrote: */
		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
			if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev)))
				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);

		durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) +
			bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));

		/* Now, drop excess replicas: */
		scoped_guard(rcu) {
restart_drop_extra_replicas:
			bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
				unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);

				if (!p.ptr.cached &&
				    durability - ptr_durability >= m->op.opts.data_replicas) {
					durability -= ptr_durability;

					bch2_extent_ptr_set_cached(c, &m->op.opts,
								   bkey_i_to_s(insert), &entry->ptr);
					goto restart_drop_extra_replicas;
				}
			}
		}

		/* Finally, add the pointers we just wrote: */
		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
			bch2_extent_ptr_decoded_append(insert, &p);

		bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
		bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));

		ret = bch2_sum_sector_overwrites(trans, &iter, insert,
						 &should_check_enospc,
						 &i_sectors_delta,
						 &disk_sectors_delta);
		if (ret)
			goto err;

		if (disk_sectors_delta > (s64) op->res.sectors) {
			ret = bch2_disk_reservation_add(c, &op->res,
						disk_sectors_delta - op->res.sectors,
						!should_check_enospc
						? BCH_DISK_RESERVATION_NOFAIL : 0);
			if (ret)
				goto out;
		}

		next_pos = insert->k.p;

		/*
		 * Check for nonce offset inconsistency:
		 * This is debug code - we've been seeing this bug rarely, and
		 * it's been hard to reproduce, so this should give us some more
		 * information when it does occur:
		 */
		int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
						 (struct bkey_validate_context) {
							.btree	= m->btree_id,
							.flags	= BCH_VALIDATE_commit,
						 });
		if (unlikely(invalid)) {
			ret = data_update_invalid_bkey(m, old, k, insert);
			goto out;
		}

		ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
			bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, bkey_start_pos(&insert->k)) ?:
			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
						k.k->p, insert->k.p) ?:
			bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
			bch2_trans_update(trans, &iter, insert,
					  BTREE_UPDATE_internal_snapshot_node);
		if (ret)
			goto err;

		if (trace_data_update_enabled())
			trace_data_update2(m, old, k, insert);

		if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
		    bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
			trace_io_move_created_rebalance2(m, old, k, insert);

		ret = bch2_trans_commit(trans, &op->res,
				NULL,
				BCH_TRANS_COMMIT_no_check_rw|
				BCH_TRANS_COMMIT_no_enospc|
				m->data_opts.btree_insert_flags);
		if (ret)
			goto err;

		bch2_btree_iter_set_pos(trans, &iter, next_pos);

		this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
		if (trace_io_move_finish_enabled())
			trace_io_move_finish2(m, &new->k_i, insert);
err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
		if (ret)
			break;
next:
		while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) {
			bch2_keylist_pop_front(&op->insert_keys);
			if (bch2_keylist_empty(&op->insert_keys))
				goto out;
		}
		continue;
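		/*
		 * nowork: the extent we read no longer matches (or there was
		 * nothing to rewrite); account the skipped range as a race and
		 * move on to the next key instead of failing the whole update.
		 */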
nowork:
		if (m->stats) {
			BUG_ON(k.k->p.offset <= iter.pos.offset);
			atomic64_inc(&m->stats->keys_raced);
			atomic64_add(k.k->p.offset - iter.pos.offset,
				     &m->stats->sectors_raced);
		}

		count_event(c, io_move_fail);

		bch2_btree_iter_advance(trans, &iter);
		goto next;
	}
out:
	bch2_trans_iter_exit(trans, &iter);
	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
	return ret;
}

int bch2_data_update_index_update(struct bch_write_op *op)
{
	return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
}

void bch2_data_update_read_done(struct data_update *m)
{
	m->read_done = true;

	/* write bio must own pages: */
	BUG_ON(!m->op.wbio.bio.bi_vcnt);

	m->op.crc = m->rbio.pick.crc;
	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;

	this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);

	closure_call(&m->op.cl, bch2_write, NULL, NULL);
}

void bch2_data_update_exit(struct data_update *update)
{
	struct bch_fs *c = update->op.c;
	struct bkey_s_c k = bkey_i_to_s_c(update->k.k);

	bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
	kfree(update->bvecs);
	update->bvecs = NULL;

	if (c->opts.nocow_enabled)
		bkey_nocow_unlock(c, k);
	bkey_put_dev_refs(c, k);
	bch2_disk_reservation_put(c, &update->op.res);
	bch2_bkey_buf_exit(&update->k, c);
}

static noinline_for_stack
int bch2_update_unwritten_extent(struct btree_trans *trans,
				 struct data_update *update)
{
	struct bch_fs *c = update->op.c;
	struct bkey_i_extent *e;
	struct write_point *wp;
	struct closure cl;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	closure_init_stack(&cl);
	bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);

	while (bpos_lt(update->op.pos, update->k.k->k.p)) {
		unsigned sectors = update->k.k->k.p.offset -
			update->op.pos.offset;

		bch2_trans_begin(trans);

		bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
				     BTREE_ITER_slots);
		ret = lockrestart_do(trans, ({
			k = bch2_btree_iter_peek_slot(trans, &iter);
			bkey_err(k);
		}));
		bch2_trans_iter_exit(trans, &iter);

		if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k)))
			break;

		e = bkey_extent_init(update->op.insert_keys.top);
		e->k.p = update->op.pos;

		ret = bch2_alloc_sectors_start_trans(trans,
				update->op.target,
				false,
				update->op.write_point,
				&update->op.devs_have,
				update->op.nr_replicas,
				update->op.nr_replicas,
				update->op.watermark,
				0, &cl, &wp);
		if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) {
			bch2_trans_unlock(trans);
			closure_sync(&cl);
			continue;
		}

		bch_err_fn_ratelimited(c, ret);

		if (ret)
			break;

		sectors = min(sectors, wp->sectors_free);

		bch2_key_resize(&e->k, sectors);

		bch2_open_bucket_get(c, wp, &update->op.open_buckets);
		bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
		bch2_alloc_sectors_done(c, wp);

		update->op.pos.offset += sectors;

		extent_for_each_ptr(extent_i_to_s(e), ptr)
			ptr->unwritten = true;
		bch2_keylist_push(&update->op.insert_keys);

		ret = __bch2_data_update_index_update(trans, &update->op);

		bch2_open_buckets_put(c, &update->op.open_buckets);

		if (ret)
			break;
	}

	if (closure_nr_remaining(&cl) != 1) {
		bch2_trans_unlock(trans);
		closure_sync(&cl);
	}

	return ret;
}

void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 20);

	prt_str_indented(out, "rewrite ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
	prt_newline(out);

	prt_str_indented(out, "kill ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->kill_ptrs);
	prt_newline(out);

	prt_str_indented(out, "target:\t");
	bch2_target_to_text(out, c, data_opts->target);
	prt_newline(out);

	prt_str_indented(out, "compression:\t");
	bch2_compression_opt_to_text(out, io_opts->background_compression);
	prt_newline(out);

	prt_str_indented(out, "opts.replicas:\t");
	prt_u64(out, io_opts->data_replicas);
	prt_newline(out);

	prt_str_indented(out, "extra replicas:\t");
	prt_u64(out, data_opts->extra_replicas);
	prt_newline(out);

	prt_str_indented(out, "scrub:\t");
	prt_u64(out, data_opts->scrub);
}

void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
{
	prt_str(out, bch2_data_update_type_strs[m->type]);
	prt_newline(out);

	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
	prt_newline(out);

	prt_str_indented(out, "old key:\t");
	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
}

void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
{
	bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
	prt_newline(out);
	printbuf_indent_add(out, 2);
	bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);

	if (!m->read_done) {
		prt_printf(out, "read:\n");
		printbuf_indent_add(out, 2);
		bch2_read_bio_to_text(out, &m->rbio);
	} else {
		prt_printf(out, "write:\n");
		printbuf_indent_add(out, 2);
		bch2_write_op_to_text(out, &m->op);
	}
	printbuf_indent_sub(out, 4);
}

int bch2_extent_drop_ptrs(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct bkey_s_c k,
			  struct bch_io_opts *io_opts,
			  struct data_update_opts *data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_i *n;
	int ret;

	n = bch2_bkey_make_mut_noupdate(trans, k);
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	while (data_opts->kill_ptrs) {
		unsigned i = 0, drop = __fls(data_opts->kill_ptrs);

		bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
		data_opts->kill_ptrs ^= 1U << drop;
	}

	/*
	 * If the new extent no longer has any pointers, bch2_extent_normalize()
	 * will do the appropriate thing with it (turning it into a
	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
	 */
	bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));

	/*
	 * Since we're not inserting through an extent iterator
	 * (BTREE_ITER_all_snapshots iterators aren't extent iterators),
	 * we aren't using the extent overwrite path to delete, we're
	 * just using the normal key deletion path:
	 */
	if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_is_extents))
		n->k.size = 0;

	return bch2_trans_relock(trans) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
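
/*
 * The read and write bios for a data update share the same bvec array: pages
 * are allocated for the write bio (which must own them - see
 * bch2_data_update_read_done()), and the read lands in those same pages. The
 * buffer is sized for the uncompressed extent, since the write path may have
 * to decompress what was read.
 */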
static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
					struct bch_io_opts *io_opts,
					unsigned buf_bytes)
{
	unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);

	m->bvecs = kmalloc_array(nr_vecs, sizeof(*m->bvecs), GFP_KERNEL);
	if (!m->bvecs)
		return -ENOMEM;

	bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
	bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);

	if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
		kfree(m->bvecs);
		m->bvecs = NULL;
		return -ENOMEM;
	}

	rbio_init(&m->rbio.bio, c, *io_opts, NULL);
	m->rbio.data_update = true;
	m->rbio.bio.bi_iter.bi_size = buf_bytes;
	m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
	m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
	return 0;
}

int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
			       struct bch_io_opts *io_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	/* write path might have to decompress data: */
	unsigned buf_bytes = 0;
	bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);

	return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
}

static int can_write_extent(struct bch_fs *c, struct data_update *m)
{
	if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
	    unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
		return bch_err_throw(c, data_update_done_would_block);

	unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
		? m->op.target
		: 0;
	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);

	darray_for_each(m->op.devs_have, i)
		__clear_bit(*i, devs.d);

	guard(rcu)();

	unsigned nr_replicas = 0, i;
	for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
		struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
		if (!ca)
			continue;

		struct bch_dev_usage usage;
		bch2_dev_usage_read_fast(ca, &usage);

		if (!dev_buckets_free(ca, usage, m->op.watermark))
			continue;

		nr_replicas += ca->mi.durability;
		if (nr_replicas >= m->op.nr_replicas)
			break;
	}

	if (!nr_replicas)
		return bch_err_throw(c, data_update_done_no_rw_devs);
	if (nr_replicas < m->op.nr_replicas)
		return bch_err_throw(c, insufficient_devices);
	return 0;
}
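
/*
 * Set up a data update: stash a copy of the key, initialize the write op from
 * the io/data update options, work out how many new replicas are needed from
 * the durability of the pointers being kept vs. rewritten, and take the disk
 * reservation, device refs and nocow locks the update will need. Returns
 * nonzero (including the various data_update_done_* pseudo-errors) if no
 * update should or can be performed.
 */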
int bch2_data_update_init(struct btree_trans *trans,
			  struct btree_iter *iter,
			  struct moving_context *ctxt,
			  struct data_update *m,
			  struct write_point_specifier wp,
			  struct bch_io_opts *io_opts,
			  struct data_update_opts data_opts,
			  enum btree_id btree_id,
			  struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	if (k.k->p.snapshot) {
		ret = bch2_check_key_has_snapshot(trans, iter, k);
		if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
			/* Can't repair yet, waiting on other recovery passes */
			return bch_err_throw(c, data_update_done_no_snapshot);
		}
		if (ret < 0)
			return ret;
		if (ret) /* key was deleted */
			return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
				bch_err_throw(c, data_update_done_no_snapshot);
		ret = 0;
	}

	bch2_bkey_buf_init(&m->k);
	bch2_bkey_buf_reassemble(&m->k, c, k);
	m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc
		? BCH_DATA_UPDATE_copygc
		: BCH_DATA_UPDATE_rebalance;
	m->btree_id = btree_id;
	m->data_opts = data_opts;
	m->ctxt = ctxt;
	m->stats = ctxt ? ctxt->stats : NULL;

	bch2_write_op_init(&m->op, c, *io_opts);
	m->op.pos = bkey_start_pos(k.k);
	m->op.version = k.k->bversion;
	m->op.target = data_opts.target;
	m->op.write_point = wp;
	m->op.nr_replicas = 0;
	m->op.flags |= BCH_WRITE_pages_stable|
		BCH_WRITE_pages_owned|
		BCH_WRITE_data_encoded|
		BCH_WRITE_move|
		m->data_opts.write_flags;
	m->op.compression_opt = io_opts->background_compression;
	m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;

	unsigned durability_have = 0, durability_removing = 0;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
	unsigned buf_bytes = 0;
	bool unwritten = false;

	unsigned ptr_bit = 1;
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (!p.ptr.cached) {
			guard(rcu)();
			if (ptr_bit & m->data_opts.rewrite_ptrs) {
				if (crc_is_compressed(p.crc))
					reserve_sectors += k.k->size;

				m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
				durability_removing += bch2_extent_ptr_desired_durability(c, &p);
			} else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
				bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
				durability_have += bch2_extent_ptr_durability(c, &p);
			}
		}

		/*
		 * op->csum_type is normally initialized from the fs/file's
		 * current options - but if an extent is encrypted, we require
		 * that it stays encrypted:
		 */
		if (bch2_csum_type_is_encryption(p.crc.csum_type)) {
			m->op.nonce = p.crc.nonce + p.crc.offset;
			m->op.csum_type = p.crc.csum_type;
		}

		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
			m->op.incompressible = true;

		buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
		unwritten |= p.ptr.unwritten;

		ptr_bit <<= 1;
	}

	unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));

	/*
	 * If current extent durability is less than io_opts.data_replicas,
	 * we're not trying to rereplicate the extent up to data_replicas here -
	 * unless extra_replicas was specified
	 *
	 * Increasing replication is an explicit operation triggered by
	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
	 */
	m->op.nr_replicas = min(durability_removing, durability_required) +
		m->data_opts.extra_replicas;

	/*
	 * If device(s) were set to durability=0 after data was written to them
	 * we can end up with a durability=0 extent, and the normal algorithm
	 * that tries not to increase durability doesn't work:
	 */
	if (!(durability_have + durability_removing))
		m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);

	m->op.nr_replicas_required = m->op.nr_replicas;

	/*
	 * It might turn out that we don't need any new replicas, if the
	 * replicas or durability settings have been changed since the extent
	 * was written:
	 */
	if (!m->op.nr_replicas) {
		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
		m->data_opts.rewrite_ptrs = 0;
		/* if iter == NULL, it's just a promote */
		if (iter)
			ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
		if (!ret)
			ret = bch_err_throw(c, data_update_done_no_writes_needed);
		goto out_bkey_buf_exit;
	}

	/*
	 * Check if the allocation will succeed, to avoid getting an error later
	 * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
	 * read:
	 *
	 * This guards against
	 * - BCH_WRITE_alloc_nowait allocations failing (promotes)
	 * - Destination target full
	 * - Device(s) in destination target offline
	 * - Insufficient durability available in destination target
	 *   (i.e. trying to move a durability=2 replica to a target with a
	 *   single durability=2 device)
	 */
	ret = can_write_extent(c, m);
	if (ret)
		goto out_bkey_buf_exit;

	if (reserve_sectors) {
		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
				m->data_opts.extra_replicas
				? 0
				: BCH_DISK_RESERVATION_NOFAIL);
		if (ret)
			goto out_bkey_buf_exit;
	}

	if (!bkey_get_dev_refs(c, k)) {
		ret = bch_err_throw(c, data_update_done_no_dev_refs);
		goto out_put_disk_res;
	}

	if (c->opts.nocow_enabled &&
	    !bkey_nocow_lock(c, ctxt, ptrs)) {
		ret = bch_err_throw(c, nocow_lock_blocked);
		goto out_put_dev_refs;
	}

	if (unwritten) {
		ret = bch2_update_unwritten_extent(trans, m) ?:
			bch_err_throw(c, data_update_done_unwritten);
		goto out_nocow_unlock;
	}

	bch2_trans_unlock(trans);

	ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
	if (ret)
		goto out_nocow_unlock;

	return 0;
out_nocow_unlock:
	if (c->opts.nocow_enabled)
		bkey_nocow_unlock(c, k);
out_put_dev_refs:
	bkey_put_dev_refs(c, k);
out_put_disk_res:
	bch2_disk_reservation_put(c, &m->op.res);
out_bkey_buf_exit:
	bch2_bkey_buf_exit(&m->k, c);
	return ret;
}

void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned ptr_bit = 1;

	bkey_for_each_ptr(ptrs, ptr) {
		if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
			opts->kill_ptrs |= ptr_bit;
			opts->rewrite_ptrs ^= ptr_bit;
		}

		ptr_bit <<= 1;
	}
}