// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "rebalance.h"
#include "reflink.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DATA_OPS()
#undef x
	NULL
};

struct evacuate_bucket_arg {
	struct bpos		bucket;
	int			gen;
	struct data_update_opts	data_opts;
};

static bool evacuate_bucket_pred(struct bch_fs *, void *,
				 enum btree_id, struct bkey_s_c,
				 struct bch_io_opts *,
				 struct data_update_opts *);

static noinline void
trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
	       struct bch_io_opts *io_opts,
	       struct data_update_opts *data_opts)
{
	struct printbuf buf = PRINTBUF;

	bch2_bkey_val_to_text(&buf, c, k);
	prt_newline(&buf);
	bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
	trace_io_move(c, buf.buf);
	printbuf_exit(&buf);
}

static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;

	bch2_bkey_val_to_text(&buf, c, k);
	trace_io_move_read(c, buf.buf);
	printbuf_exit(&buf);
}

static noinline void
trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
		    struct bch_io_opts *io_opts,
		    struct data_update_opts *data_opts,
		    move_pred_fn pred, void *_arg, bool p)
{
	struct printbuf buf = PRINTBUF;

	prt_printf(&buf, "%ps: %u", pred, p);

	if (pred == evacuate_bucket_pred) {
		struct evacuate_bucket_arg *arg = _arg;
		prt_printf(&buf, " gen=%u", arg->gen);
	}

	prt_newline(&buf);
	bch2_bkey_val_to_text(&buf, c, k);
	prt_newline(&buf);
	bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
	trace_io_move_pred(c, buf.buf);
	printbuf_exit(&buf);
}

static noinline void
trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
{
	struct printbuf buf = PRINTBUF;

	prt_printf(&buf, "bucket: ");
	bch2_bpos_to_text(&buf, bucket);
	prt_printf(&buf, " gen: %i\n", gen);

	trace_io_move_evacuate_bucket(c, buf.buf);
	printbuf_exit(&buf);
}

struct moving_io {
	struct list_head	read_list;
	struct list_head	io_list;
	struct move_bucket	*b;
	struct closure		cl;
	bool			read_completed;

	unsigned		read_sectors;
	unsigned		write_sectors;

	struct data_update	write;
};

static void move_free(struct moving_io *io)
{
	struct moving_context *ctxt = io->write.ctxt;

	if (io->b)
		atomic_dec(&io->b->count);

	mutex_lock(&ctxt->lock);
	list_del(&io->io_list);
	wake_up(&ctxt->wait);
	mutex_unlock(&ctxt->lock);

	if (!io->write.data_opts.scrub) {
		bch2_data_update_exit(&io->write);
	} else {
		bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
		kfree(io->write.bvecs);
	}
	kfree(io);
}

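/*
 * Completion for the write half of a data move: note any write error on the
 * context, drop the in-flight write accounting and free the moving_io.
 */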
static void move_write_done(struct bch_write_op *op)
{
	struct moving_io *io = container_of(op, struct moving_io, write.op);
	struct bch_fs *c = op->c;
	struct moving_context *ctxt = io->write.ctxt;

	if (op->error) {
		if (trace_io_move_write_fail_enabled()) {
			struct printbuf buf = PRINTBUF;

			bch2_write_op_to_text(&buf, op);
			trace_io_move_write_fail(c, buf.buf);
			printbuf_exit(&buf);
		}
		this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);

		ctxt->write_error = true;
	}

	atomic_sub(io->write_sectors, &ctxt->write_sectors);
	atomic_dec(&ctxt->write_ios);
	move_free(io);
	closure_put(&ctxt->cl);
}

static void move_write(struct moving_io *io)
{
	struct bch_fs *c = io->write.op.c;
	struct moving_context *ctxt = io->write.ctxt;
	struct bch_read_bio *rbio = &io->write.rbio;

	if (ctxt->stats) {
		if (rbio->bio.bi_status)
			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
				     &ctxt->stats->sectors_error_uncorrected);
		else if (rbio->saw_error)
			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
				     &ctxt->stats->sectors_error_corrected);
	}

	/*
	 * If the extent has been bitrotted, we're going to have to give it a
	 * new checksum in order to move it - but the poison bit will ensure
	 * that userspace still gets the appropriate error.
	 */
	if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
		     (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
		struct bch_extent_crc_unpacked crc = rbio->pick.crc;
		struct nonce nonce = extent_nonce(rbio->version, crc);

		rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
							nonce, &rbio->bio);
		rbio->ret = 0;
	}

	if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
		move_free(io);
		return;
	}

	if (trace_io_move_write_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
		trace_io_move_write(c, buf.buf);
		printbuf_exit(&buf);
	}

	closure_get(&io->write.ctxt->cl);
	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_inc(&io->write.ctxt->write_ios);

	bch2_data_update_read_done(&io->write);
}

struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
	struct moving_io *io =
		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

	return io && io->read_completed ? io : NULL;
}

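/*
 * Completion for the read half of a data move: drop the in-flight read
 * accounting and mark the io as ready; the write is issued later from
 * bch2_moving_ctxt_do_pending_writes().
 */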
static void move_read_endio(struct bio *bio)
{
	struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
	struct moving_context *ctxt = io->write.ctxt;

	atomic_sub(io->read_sectors, &ctxt->read_sectors);
	atomic_dec(&ctxt->read_ios);
	io->read_completed = true;

	wake_up(&ctxt->wait);
	closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
	struct moving_io *io;

	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
		bch2_trans_unlock_long(ctxt->trans);
		list_del(&io->read_list);
		move_write(io);
	}
}

void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

	move_ctxt_wait_event(ctxt,
		!atomic_read(&ctxt->write_sectors) ||
		atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
	move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
	bch2_trans_unlock_long(ctxt->trans);
	closure_sync(&ctxt->cl);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;

	bch2_moving_ctxt_flush_all(ctxt);

	EBUG_ON(atomic_read(&ctxt->write_sectors));
	EBUG_ON(atomic_read(&ctxt->write_ios));
	EBUG_ON(atomic_read(&ctxt->read_sectors));
	EBUG_ON(atomic_read(&ctxt->read_ios));

	mutex_lock(&c->moving_context_lock);
	list_del(&ctxt->list);
	mutex_unlock(&c->moving_context_lock);

	/*
	 * Generally, releasing a transaction within a transaction restart means
	 * an unhandled transaction restart: but this can happen legitimately
	 * within the move code, e.g. when bch2_move_ratelimit() tells us to
	 * exit before we've retried
	 */
	bch2_trans_begin(ctxt->trans);
	bch2_trans_put(ctxt->trans);
	memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
			   struct bch_fs *c,
			   struct bch_ratelimit *rate,
			   struct bch_move_stats *stats,
			   struct write_point_specifier wp,
			   bool wait_on_copygc)
{
	memset(ctxt, 0, sizeof(*ctxt));

	ctxt->trans		= bch2_trans_get(c);
	ctxt->fn		= (void *) _RET_IP_;
	ctxt->rate		= rate;
	ctxt->stats		= stats;
	ctxt->wp		= wp;
	ctxt->wait_on_copygc	= wait_on_copygc;

	closure_init_stack(&ctxt->cl);

	mutex_init(&ctxt->lock);
	INIT_LIST_HEAD(&ctxt->reads);
	INIT_LIST_HEAD(&ctxt->ios);
	init_waitqueue_head(&ctxt->wait);

	mutex_lock(&c->moving_context_lock);
	list_add(&ctxt->list, &c->moving_context_list);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
	trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
	memset(stats, 0, sizeof(*stats));
	stats->data_type = BCH_DATA_user;
	scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

int bch2_move_extent(struct moving_context *ctxt,
		     struct move_bucket *bucket_in_flight,
		     struct btree_iter *iter,
		     struct bkey_s_c k,
		     struct bch_io_opts io_opts,
		     struct data_update_opts data_opts)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	int ret = -ENOMEM;

	if (trace_io_move_enabled())
		trace_io_move2(c, k, &io_opts, &data_opts);
	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);

	if (ctxt->stats)
		ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);

	bch2_data_update_opts_normalize(k, &data_opts);

	if (!data_opts.rewrite_ptrs &&
	    !data_opts.extra_replicas &&
	    !data_opts.scrub) {
		if (data_opts.kill_ptrs)
			return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
		return 0;
	}

	struct moving_io *io = allocate_dropping_locks(trans, ret,
				kzalloc(sizeof(struct moving_io), _gfp));
	if (!io)
		goto err;

	if (ret)
		goto err_free;

	INIT_LIST_HEAD(&io->io_list);
	io->write.ctxt		= ctxt;
	io->read_sectors	= k.k->size;
	io->write_sectors	= k.k->size;

	if (!data_opts.scrub) {
		ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
					    &io_opts, data_opts, iter->btree_id, k);
		if (ret)
			goto err_free;

		io->write.op.end_io = move_write_done;
	} else {
		bch2_bkey_buf_init(&io->write.k);
		bch2_bkey_buf_reassemble(&io->write.k, c, k);

		io->write.op.c		= c;
		io->write.data_opts	= data_opts;

		bch2_trans_unlock(trans);

		ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
		if (ret)
			goto err_free;
	}

	io->write.rbio.bio.bi_end_io	= move_read_endio;
	io->write.rbio.bio.bi_ioprio	= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);

	if (ctxt->rate)
		bch2_ratelimit_increment(ctxt->rate, k.k->size);

	if (ctxt->stats) {
		atomic64_inc(&ctxt->stats->keys_moved);
		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
	}

	if (bucket_in_flight) {
		io->b = bucket_in_flight;
		atomic_inc(&io->b->count);
	}

	if (trace_io_move_read_enabled())
		trace_io_move_read2(c, k);

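	/*
	 * Account the in-flight read and make this io visible on the
	 * context's lists before submitting it:
	 */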
	mutex_lock(&ctxt->lock);
	atomic_add(io->read_sectors, &ctxt->read_sectors);
	atomic_inc(&ctxt->read_ios);

	list_add_tail(&io->read_list, &ctxt->reads);
	list_add_tail(&io->io_list, &ctxt->ios);
	mutex_unlock(&ctxt->lock);

	/*
	 * dropped by move_read_endio() - guards against use after free of
	 * ctxt when doing wakeup
	 */
	closure_get(&ctxt->cl);
	__bch2_read_extent(trans, &io->write.rbio,
			   io->write.rbio.bio.bi_iter,
			   bkey_start_pos(k.k),
			   iter->btree_id, k, 0,
			   NULL,
			   BCH_READ_last_fragment,
			   data_opts.scrub ? data_opts.read_dev : -1);
	return 0;
err_free:
	kfree(io);
err:
	if (bch2_err_matches(ret, EROFS) ||
	    bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	count_event(c, io_move_start_fail);

	if (trace_io_move_start_fail_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_str(&buf, ": ");
		prt_str(&buf, bch2_err_str(ret));
		trace_io_move_start_fail(c, buf.buf);
		printbuf_exit(&buf);
	}

	if (bch2_err_matches(ret, BCH_ERR_data_update_done))
		return 0;
	return ret;
}

struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
			  struct per_snapshot_io_opts *io_opts,
			  struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
			  struct btree_iter *extent_iter,
			  struct bkey_s_c extent_k)
{
	struct bch_fs *c = trans->c;
	u32 restart_count = trans->restart_count;
	struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
	int ret = 0;

	if (extent_iter->min_depth)
		return opts_ret;

	if (extent_k.k->type == KEY_TYPE_reflink_v)
		goto out;

	if (io_opts->cur_inum != extent_pos.inode) {
		io_opts->d.nr = 0;

		ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
					 BTREE_ITER_all_snapshots, k, ({
			if (k.k->p.offset != extent_pos.inode)
				break;

			if (!bkey_is_inode(k.k))
				continue;

			struct bch_inode_unpacked inode;
			_ret3 = bch2_inode_unpack(k, &inode);
			if (_ret3)
				break;

			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
			bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

			darray_push(&io_opts->d, e);
		}));
		io_opts->cur_inum = extent_pos.inode;
	}

	ret = ret ?: trans_was_restarted(trans, restart_count);
	if (ret)
		return ERR_PTR(ret);

	if (extent_k.k->p.snapshot)
		darray_for_each(io_opts->d, i)
			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
				opts_ret = &i->io_opts;
				break;
			}
out:
	ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
	if (ret)
		return ERR_PTR(ret);
	return opts_ret;
}

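/*
 * Like bch2_move_get_io_opts(), but for a single extent: looks up the owning
 * inode directly instead of going through the per-snapshot io_opts cache.
 */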
int bch2_move_get_io_opts_one(struct btree_trans *trans,
			      struct bch_io_opts *io_opts,
			      struct btree_iter *extent_iter,
			      struct bkey_s_c extent_k)
{
	struct bch_fs *c = trans->c;

	*io_opts = bch2_opts_to_inode_opts(c->opts);

	/* reflink btree? */
	if (!extent_k.k->p.inode)
		goto out;

	struct btree_iter inode_iter;
	struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
					SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
					BTREE_ITER_cached);
	int ret = bkey_err(inode_k);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	if (!ret && bkey_is_inode(inode_k.k)) {
		struct bch_inode_unpacked inode;
		bch2_inode_unpack(inode_k, &inode);
		bch2_inode_opts_get(io_opts, c, &inode);
	}
	bch2_trans_iter_exit(trans, &inode_iter);
out:
	return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
}

int bch2_move_ratelimit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;
	bool is_kthread = current->flags & PF_KTHREAD;
	u64 delay;

	if (ctxt->wait_on_copygc && c->copygc_running) {
		bch2_moving_ctxt_flush_all(ctxt);
		wait_event_killable(c->copygc_running_wq,
				    !c->copygc_running ||
				    (is_kthread && kthread_should_stop()));
	}

	do {
		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

		if (is_kthread && kthread_should_stop())
			return 1;

		if (delay)
			move_ctxt_wait_event_timeout(ctxt,
						     freezing(current) ||
						     (is_kthread && kthread_should_stop()),
						     delay);

		if (unlikely(freezing(current))) {
			bch2_moving_ctxt_flush_all(ctxt);
			try_to_freeze();
		}
	} while (delay);

	/*
	 * XXX: these limits really ought to be per device, SSDs and hard drives
	 * will want different limits
	 */
	move_ctxt_wait_event(ctxt,
		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

	return 0;
}

/*
 * Move requires non extents iterators, and there's also no need for it to
 * signal indirect_extent_missing_error:
 */
static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
							    struct btree_iter *iter,
							    struct bkey_s_c_reflink_p p)
{
	if (unlikely(REFLINK_P_ERROR(p.v)))
		return bkey_s_c_null;

	struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));

	bch2_trans_iter_init(trans, iter,
			     BTREE_ID_reflink, reflink_pos,
			     BTREE_ITER_not_extents);

	struct bkey_s_c k = bch2_btree_iter_peek(trans, iter);
	if (!k.k || bkey_err(k)) {
		bch2_trans_iter_exit(trans, iter);
		return k;
	}

	if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
		bch2_trans_iter_exit(trans, iter);
		return bkey_s_c_null;
	}

	return k;
}

int bch2_move_data_btree(struct moving_context *ctxt,
			 struct bpos start,
			 struct bpos end,
			 move_pred_fn pred, void *arg,
			 enum btree_id btree_id, unsigned level)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct per_snapshot_io_opts snapshot_io_opts;
	struct bch_io_opts *io_opts;
	struct bkey_buf sk;
	struct btree_iter iter, reflink_iter = {};
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	/*
	 * If we're moving a single file, also process reflinked data it points
	 * to (this includes propagating changed io_opts from the inode to the
	 * extent):
	 */
	bool walk_indirect = start.inode == end.inode;
	int ret = 0, ret2;

	per_snapshot_io_opts_init(&snapshot_io_opts, c);
	bch2_bkey_buf_init(&sk);

	if (ctxt->stats) {
		ctxt->stats->data_type	= BCH_DATA_user;
		ctxt->stats->pos	= BBPOS(btree_id, start);
	}

retry_root:
	bch2_trans_begin(trans);

	if (level == bch2_btree_id_root(c, btree_id)->level + 1) {
		bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1,
					  BTREE_ITER_prefetch|
					  BTREE_ITER_not_extents|
					  BTREE_ITER_all_snapshots);
		struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
		ret = PTR_ERR_OR_ZERO(b);
		if (ret)
			goto root_err;

		if (b != btree_node_root(c, b)) {
			bch2_trans_iter_exit(trans, &iter);
			goto retry_root;
		}

		k = bkey_i_to_s_c(&b->key);

		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
						iter.pos, &iter, k);
		ret = PTR_ERR_OR_ZERO(io_opts);
		if (ret)
			goto root_err;

		memset(&data_opts, 0, sizeof(data_opts));
		if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
			goto out;

		if (!data_opts.scrub)
			ret = bch2_btree_node_rewrite_pos(trans, btree_id, level,
							  k.k->p, data_opts.target, 0);
		else
			ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);

root_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			bch2_trans_iter_exit(trans, &iter);
			goto retry_root;
		}

		goto out;
	}

	bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level,
				  BTREE_ITER_prefetch|
				  BTREE_ITER_not_extents|
				  BTREE_ITER_all_snapshots);

	if (ctxt->rate)
		bch2_ratelimit_reset(ctxt->rate);

	while (!bch2_move_ratelimit(ctxt)) {
		struct btree_iter *extent_iter = &iter;

		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek(trans, &iter);
		if (!k.k)
			break;

		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_gt(bkey_start_pos(k.k), end))
			break;

		if (ctxt->stats)
			ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

		if (walk_indirect &&
		    k.k->type == KEY_TYPE_reflink_p &&
		    REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
			struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);

			bch2_trans_iter_exit(trans, &reflink_iter);
			k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
			ret = bkey_err(k);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				break;

			if (!k.k)
				goto next_nondata;

			/*
			 * XXX: reflink pointers may point to multiple indirect
			 * extents, so don't advance past the entire reflink
			 * pointer - need to fixup iter->k
			 */
			extent_iter = &reflink_iter;
		}

		if (!bkey_extent_is_direct_data(k.k))
			goto next_nondata;

		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
						iter.pos, extent_iter, k);
		ret = PTR_ERR_OR_ZERO(io_opts);
		if (ret)
			continue;

		memset(&data_opts, 0, sizeof(data_opts));
		if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
			goto next;

		/*
		 * The iterator gets unlocked by __bch2_read_extent - need to
		 * save a copy of @k elsewhere:
		 */
		bch2_bkey_buf_reassemble(&sk, c, k);
		k = bkey_i_to_s_c(sk.k);

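		/*
		 * At level 0 this is a regular extent; at higher levels it's a
		 * btree node, which is either rewritten or scrubbed:
		 */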
		if (!level)
			ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
		else if (!data_opts.scrub)
			ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
							   k.k->p, data_opts.target, 0);
		else
			ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);

		if (ret2) {
			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
				continue;

			if (bch2_err_matches(ret2, ENOMEM)) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}

			/* XXX signal failure */
			goto next;
		}
next:
		if (ctxt->stats)
			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
		if (!bch2_btree_iter_advance(trans, &iter))
			break;
	}
out:
	bch2_trans_iter_exit(trans, &reflink_iter);
	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&sk, c);
	per_snapshot_io_opts_exit(&snapshot_io_opts);

	return ret;
}

int __bch2_move_data(struct moving_context *ctxt,
		     struct bbpos start,
		     struct bbpos end,
		     move_pred_fn pred, void *arg)
{
	struct bch_fs *c = ctxt->trans->c;
	enum btree_id id;
	int ret = 0;

	for (id = start.btree;
	     id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     id++) {
		ctxt->stats->pos = BBPOS(id, POS_MIN);

		if (!btree_type_has_ptrs(id) ||
		    !bch2_btree_id_root(c, id)->b)
			continue;

		ret = bch2_move_data_btree(ctxt,
					   id == start.btree ? start.pos : POS_MIN,
					   id == end.btree   ? end.pos   : POS_MAX,
					   pred, arg, id, 0);
		if (ret)
			break;
	}

	return ret;
}

int bch2_move_data(struct bch_fs *c,
		   struct bbpos start,
		   struct bbpos end,
		   struct bch_ratelimit *rate,
		   struct bch_move_stats *stats,
		   struct write_point_specifier wp,
		   bool wait_on_copygc,
		   move_pred_fn pred, void *arg)
{
	struct moving_context ctxt;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
	int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

static int __bch2_move_data_phys(struct moving_context *ctxt,
				 struct move_bucket *bucket_in_flight,
				 unsigned dev,
				 u64 bucket_start,
				 u64 bucket_end,
				 unsigned data_types,
				 bool copygc,
				 move_pred_fn pred, void *arg)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	bool is_kthread = current->flags & PF_KTHREAD;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_iter iter = {}, bp_iter = {};
	struct bkey_buf sk;
	struct bkey_s_c k;
	struct bkey_buf last_flushed;
	u64 check_mismatch_done = bucket_start;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_tryget(c, dev);
	if (!ca)
		return 0;

	bucket_end = min(bucket_end, ca->mi.nbuckets);

	struct bpos bp_start	= bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
	struct bpos bp_end	= bucket_pos_to_bp_end(ca, POS(dev, bucket_end));

	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);
	bch2_bkey_buf_init(&sk);

	/*
	 * We're not run in a context that handles transaction restarts:
	 */
	bch2_trans_begin(trans);

	bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);

	ret = bch2_btree_write_buffer_tryflush(trans);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_msg(c, ret, "flushing btree write buffer");
	if (ret)
		goto err;

	while (!(ret = bch2_move_ratelimit(ctxt))) {
		if (is_kthread && kthread_should_stop())
			break;

		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek(trans, &bp_iter);
		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			goto err;

		if (!k.k || bkey_gt(k.k->p, bp_end))
			break;

		if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
			while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
				bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
								       copygc, &last_flushed);
			}
			continue;
		}

		if (k.k->type != KEY_TYPE_backpointer)
			goto next;

		struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);

		if (ctxt->stats)
			ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;

		if (!(data_types & BIT(bp.v->data_type)))
			goto next;

		if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
			goto next;

		k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			goto err;
		if (!k.k)
			goto next;

		if (!bp.v->level) {
			ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
			if (ret) {
				bch2_trans_iter_exit(trans, &iter);
				continue;
			}
		}

		struct data_update_opts data_opts = {};
		bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts);

		if (trace_io_move_pred_enabled())
			trace_io_move_pred2(c, k, &io_opts, &data_opts,
					    pred, arg, p);

		if (!p) {
			bch2_trans_iter_exit(trans, &iter);
			goto next;
		}

		if (data_opts.scrub &&
		    !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
			bch2_trans_iter_exit(trans, &iter);
			ret = bch_err_throw(c, device_offline);
			break;
		}

		bch2_bkey_buf_reassemble(&sk, c, k);
		k = bkey_i_to_s_c(sk.k);

		/* move_extent will drop locks */
		unsigned sectors = bp.v->bucket_len;

		if (!bp.v->level)
			ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
		else if (!data_opts.scrub)
			ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level,
							  k.k->p, data_opts.target, 0);
		else
			ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);

		bch2_trans_iter_exit(trans, &iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret == -ENOMEM) {
			/* memory allocation failure, wait for some IO to finish */
			bch2_move_ctxt_wait_for_io(ctxt);
			continue;
		}
		if (ret)
			goto err;

		if (ctxt->stats)
			atomic64_add(sectors, &ctxt->stats->sectors_seen);
next:
		bch2_btree_iter_advance(trans, &bp_iter);
	}

	while (check_mismatch_done < bucket_end)
		bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
						       copygc, &last_flushed);
err:
	bch2_trans_iter_exit(trans, &bp_iter);
	bch2_bkey_buf_exit(&sk, c);
	bch2_bkey_buf_exit(&last_flushed, c);
	bch2_dev_put(ca);
	return ret;
}

int bch2_move_data_phys(struct bch_fs *c,
			unsigned dev,
			u64 start,
			u64 end,
			unsigned data_types,
			struct bch_ratelimit *rate,
			struct bch_move_stats *stats,
			struct write_point_specifier wp,
			bool wait_on_copygc,
			move_pred_fn pred, void *arg)
{
	struct moving_context ctxt;

	bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
	if (ctxt.stats) {
		ctxt.stats->phys	= true;
		ctxt.stats->data_type	= (int) DATA_PROGRESS_DATA_TYPE_phys;
	}

	int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end,
					data_types, false, pred, arg);
	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
				 enum btree_id btree, struct bkey_s_c k,
				 struct bch_io_opts *io_opts,
				 struct data_update_opts *data_opts)
{
	struct evacuate_bucket_arg *arg = _arg;

	*data_opts = arg->data_opts;

	unsigned i = 0;
	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
		if (ptr->dev == arg->bucket.inode &&
		    (arg->gen < 0 || arg->gen == ptr->gen) &&
		    !ptr->cached)
			data_opts->rewrite_ptrs |= BIT(i);
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}

int bch2_evacuate_bucket(struct moving_context *ctxt,
			 struct move_bucket *bucket_in_flight,
			 struct bpos bucket, int gen,
			 struct data_update_opts data_opts)
{
	struct bch_fs *c = ctxt->trans->c;
	struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };

	count_event(c, io_move_evacuate_bucket);
	if (trace_io_move_evacuate_bucket_enabled())
		trace_io_move_evacuate_bucket2(c, bucket, gen);

	return __bch2_move_data_phys(ctxt, bucket_in_flight,
				     bucket.inode,
				     bucket.offset,
				     bucket.offset + 1,
				     ~0,
				     true,
				     evacuate_bucket_pred, &arg);
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
				struct btree *, struct bch_io_opts *,
				struct data_update_opts *);

static int bch2_move_btree(struct bch_fs *c,
			   struct bbpos start,
			   struct bbpos end,
			   move_btree_pred pred, void *arg,
			   struct bch_move_stats *stats)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct moving_context ctxt;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct btree *b;
	enum btree_id btree;
	struct data_update_opts data_opts;
	int ret = 0;

	bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
			      writepoint_ptr(&c->btree_write_point),
			      true);
	trans = ctxt.trans;

	stats->data_type = BCH_DATA_btree;

	for (btree = start.btree;
	     btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     btree++) {
		stats->pos = BBPOS(btree, POS_MIN);

		if (!bch2_btree_id_root(c, btree)->b)
			continue;

		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
					  BTREE_ITER_prefetch);
retry:
		ret = 0;
		while (bch2_trans_begin(trans),
		       (b = bch2_btree_iter_peek_node(trans, &iter)) &&
		       !(ret = PTR_ERR_OR_ZERO(b))) {
			if (kthread && kthread_should_stop())
				break;

			if ((cmp_int(btree, end.btree) ?:
			     bpos_cmp(b->key.k.p, end.pos)) > 0)
				break;

			stats->pos = BBPOS(iter.btree_id, iter.pos);

			if (!pred(c, arg, b, &io_opts, &data_opts))
				goto next;

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				break;
next:
			bch2_btree_iter_next_node(trans, &iter);
		}
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;

		bch2_trans_iter_exit(trans, &iter);

		if (kthread && kthread_should_stop())
			break;
	}

	bch_err_fn(c, ret);
	bch2_moving_ctxt_exit(&ctxt);
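	/* node rewrites are driven by interior btree updates; flush them before returning */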
	bch2_btree_interior_updates_flush(c);

	return ret;
}

static bool rereplicate_pred(struct bch_fs *c, void *arg,
			     enum btree_id btree, struct bkey_s_c k,
			     struct bch_io_opts *io_opts,
			     struct data_update_opts *data_opts)
{
	unsigned nr_good = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;

	guard(rcu)();
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ptr->cached &&
		    (!ca || !ca->mi.durability))
			data_opts->kill_ptrs |= BIT(i);
		i++;
	}

	if (!data_opts->kill_ptrs &&
	    (!nr_good || nr_good >= replicas))
		return false;

	data_opts->target		= 0;
	data_opts->extra_replicas	= replicas - nr_good;
	data_opts->btree_insert_flags	= 0;
	return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
			 enum btree_id btree, struct bkey_s_c k,
			 struct bch_io_opts *io_opts,
			 struct data_update_opts *data_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_ioctl_data *op = arg;
	unsigned i = 0;

	data_opts->rewrite_ptrs		= 0;
	data_opts->target		= 0;
	data_opts->extra_replicas	= 0;
	data_opts->btree_insert_flags	= 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == op->migrate.dev)
			data_opts->rewrite_ptrs |= 1U << i;
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

/*
 * Ancient versions of bcachefs produced packed formats which could represent
 * keys that the in memory format cannot represent; this checks for those
 * formats so we can get rid of them.
 */
1235 */ 1236 static bool bformat_needs_redo(struct bkey_format *f) 1237 { 1238 for (unsigned i = 0; i < f->nr_fields; i++) 1239 if (bch2_bkey_format_field_overflows(f, i)) 1240 return true; 1241 1242 return false; 1243 } 1244 1245 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, 1246 struct btree *b, 1247 struct bch_io_opts *io_opts, 1248 struct data_update_opts *data_opts) 1249 { 1250 if (b->version_ondisk != c->sb.version || 1251 btree_node_need_rewrite(b) || 1252 bformat_needs_redo(&b->format)) { 1253 data_opts->target = 0; 1254 data_opts->extra_replicas = 0; 1255 data_opts->btree_insert_flags = 0; 1256 return true; 1257 } 1258 1259 return false; 1260 } 1261 1262 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) 1263 { 1264 int ret; 1265 1266 ret = bch2_move_btree(c, 1267 BBPOS_MIN, 1268 BBPOS_MAX, 1269 rewrite_old_nodes_pred, c, stats); 1270 if (!ret) { 1271 mutex_lock(&c->sb_lock); 1272 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); 1273 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); 1274 c->disk_sb.sb->version_min = c->disk_sb.sb->version; 1275 bch2_write_super(c); 1276 mutex_unlock(&c->sb_lock); 1277 } 1278 1279 bch_err_fn(c, ret); 1280 return ret; 1281 } 1282 1283 static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, 1284 enum btree_id btree, struct bkey_s_c k, 1285 struct bch_io_opts *io_opts, 1286 struct data_update_opts *data_opts) 1287 { 1288 unsigned durability = bch2_bkey_durability(c, k); 1289 unsigned replicas = bkey_is_btree_ptr(k.k) 1290 ? c->opts.metadata_replicas 1291 : io_opts->data_replicas; 1292 const union bch_extent_entry *entry; 1293 struct extent_ptr_decoded p; 1294 unsigned i = 0; 1295 1296 guard(rcu)(); 1297 bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { 1298 unsigned d = bch2_extent_ptr_durability(c, &p); 1299 1300 if (d && durability - d >= replicas) { 1301 data_opts->kill_ptrs |= BIT(i); 1302 durability -= d; 1303 } 1304 1305 i++; 1306 } 1307 1308 return data_opts->kill_ptrs != 0; 1309 } 1310 1311 static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, 1312 struct btree *b, 1313 struct bch_io_opts *io_opts, 1314 struct data_update_opts *data_opts) 1315 { 1316 return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), 1317 io_opts, data_opts); 1318 } 1319 1320 static bool scrub_pred(struct bch_fs *c, void *_arg, 1321 enum btree_id btree, struct bkey_s_c k, 1322 struct bch_io_opts *io_opts, 1323 struct data_update_opts *data_opts) 1324 { 1325 struct bch_ioctl_data *arg = _arg; 1326 1327 if (k.k->type != KEY_TYPE_btree_ptr_v2) { 1328 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 1329 const union bch_extent_entry *entry; 1330 struct extent_ptr_decoded p; 1331 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 1332 if (p.ptr.dev == arg->migrate.dev) { 1333 if (!p.crc.csum_type) 1334 return false; 1335 break; 1336 } 1337 } 1338 1339 data_opts->scrub = true; 1340 data_opts->read_dev = arg->migrate.dev; 1341 return true; 1342 } 1343 1344 int bch2_data_job(struct bch_fs *c, 1345 struct bch_move_stats *stats, 1346 struct bch_ioctl_data op) 1347 { 1348 struct bbpos start = BBPOS(op.start_btree, op.start_pos); 1349 struct bbpos end = BBPOS(op.end_btree, op.end_pos); 1350 int ret = 0; 1351 1352 if (op.op >= BCH_DATA_OP_NR) 1353 return -EINVAL; 1354 1355 bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); 1356 1357 switch (op.op) { 1358 case BCH_DATA_OP_scrub: 1359 /* 1360 * prevent 
		 * prevent tests from spuriously failing, make sure we see all
		 * btree nodes that need to be repaired
		 */
		bch2_btree_interior_updates_flush(c);

		ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
					  op.scrub.data_types,
					  NULL,
					  stats,
					  writepoint_hashed((unsigned long) current),
					  false,
					  scrub_pred, &op) ?: ret;
		break;

	case BCH_DATA_OP_rereplicate:
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, -1);
		ret = bch2_move_btree(c, start, end,
				      rereplicate_btree_pred, c, stats) ?: ret;
		ret = bch2_move_data(c, start, end,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     rereplicate_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_migrate:
		if (op.migrate.dev >= c->sb.nr_devices)
			return -EINVAL;

		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
		ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
					  ~0,
					  NULL,
					  stats,
					  writepoint_hashed((unsigned long) current),
					  true,
					  migrate_pred, &op) ?: ret;
		bch2_btree_interior_updates_flush(c);
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_rewrite_old_nodes:
		ret = bch2_scan_old_btree_nodes(c, stats);
		break;
	case BCH_DATA_OP_drop_extra_replicas:
		ret = bch2_move_btree(c, start, end,
				      drop_extra_replicas_btree_pred, c, stats) ?: ret;
		ret = bch2_move_data(c, start, end, NULL, stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     drop_extra_replicas_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	default:
		ret = -EINVAL;
	}

	bch2_move_stats_exit(stats, c);
	return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
	prt_printf(out, "%s: data type=", stats->name);
	bch2_prt_data_type(out, stats->data_type);
	prt_str(out, " pos=");
	bch2_bbpos_to_text(out, stats->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
	prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
	prt_printf(out, "bytes seen:\t");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
	prt_newline(out);

	prt_printf(out, "bytes moved:\t");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
	prt_newline(out);

	prt_printf(out, "bytes raced:\t");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);

	bch2_move_stats_to_text(out, ctxt->stats);
	printbuf_indent_add(out, 2);

	prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
		   atomic_read(&ctxt->read_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->read_sectors),
		   c->opts.move_bytes_in_flight >> 9);

	prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
		   atomic_read(&ctxt->write_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->write_sectors),
		   c->opts.move_bytes_in_flight >> 9);

	printbuf_indent_add(out, 2);

	mutex_lock(&ctxt->lock);
	struct moving_io *io;
	list_for_each_entry(io, &ctxt->ios, io_list)
		bch2_data_update_inflight_to_text(out, &io->write);
	mutex_unlock(&ctxt->lock);

	printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct moving_context *ctxt;

	mutex_lock(&c->moving_context_lock);
	list_for_each_entry(ctxt, &c->moving_context_list, list)
		bch2_moving_ctxt_to_text(out, c, ctxt);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
	INIT_LIST_HEAD(&c->moving_context_list);
	mutex_init(&c->moving_context_lock);
}