1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "alloc_background.h" 5 #include "alloc_foreground.h" 6 #include "backpointers.h" 7 #include "bkey_buf.h" 8 #include "btree_gc.h" 9 #include "btree_io.h" 10 #include "btree_update.h" 11 #include "btree_update_interior.h" 12 #include "btree_write_buffer.h" 13 #include "compress.h" 14 #include "disk_groups.h" 15 #include "ec.h" 16 #include "errcode.h" 17 #include "error.h" 18 #include "inode.h" 19 #include "io_read.h" 20 #include "io_write.h" 21 #include "journal_reclaim.h" 22 #include "keylist.h" 23 #include "move.h" 24 #include "replicas.h" 25 #include "snapshot.h" 26 #include "super-io.h" 27 #include "trace.h" 28 29 #include <linux/ioprio.h> 30 #include <linux/kthread.h> 31 32 const char * const bch2_data_ops_strs[] = { 33 #define x(t, n, ...) [n] = #t, 34 BCH_DATA_OPS() 35 #undef x 36 NULL 37 }; 38 39 static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, 40 struct bch_io_opts *io_opts, 41 struct data_update_opts *data_opts) 42 { 43 if (trace_move_extent_enabled()) { 44 struct printbuf buf = PRINTBUF; 45 46 bch2_bkey_val_to_text(&buf, c, k); 47 prt_newline(&buf); 48 bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); 49 trace_move_extent(c, buf.buf); 50 printbuf_exit(&buf); 51 } 52 } 53 54 static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) 55 { 56 if (trace_move_extent_read_enabled()) { 57 struct printbuf buf = PRINTBUF; 58 59 bch2_bkey_val_to_text(&buf, c, k); 60 trace_move_extent_read(c, buf.buf); 61 printbuf_exit(&buf); 62 } 63 } 64 65 struct moving_io { 66 struct list_head read_list; 67 struct list_head io_list; 68 struct move_bucket_in_flight *b; 69 struct closure cl; 70 bool read_completed; 71 72 unsigned read_sectors; 73 unsigned write_sectors; 74 75 struct bch_read_bio rbio; 76 77 struct data_update write; 78 /* Must be last since it is variable size */ 79 struct bio_vec bi_inline_vecs[]; 80 }; 81 82 static void move_free(struct moving_io *io) 83 { 84 struct moving_context *ctxt = io->write.ctxt; 85 86 if (io->b) 87 atomic_dec(&io->b->count); 88 89 bch2_data_update_exit(&io->write); 90 91 mutex_lock(&ctxt->lock); 92 list_del(&io->io_list); 93 wake_up(&ctxt->wait); 94 mutex_unlock(&ctxt->lock); 95 96 kfree(io); 97 } 98 99 static void move_write_done(struct bch_write_op *op) 100 { 101 struct moving_io *io = container_of(op, struct moving_io, write.op); 102 struct moving_context *ctxt = io->write.ctxt; 103 104 if (io->write.op.error) 105 ctxt->write_error = true; 106 107 atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); 108 atomic_dec(&io->write.ctxt->write_ios); 109 move_free(io); 110 closure_put(&ctxt->cl); 111 } 112 113 static void move_write(struct moving_io *io) 114 { 115 if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { 116 move_free(io); 117 return; 118 } 119 120 if (trace_move_extent_write_enabled()) { 121 struct bch_fs *c = io->write.op.c; 122 struct printbuf buf = PRINTBUF; 123 124 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); 125 trace_move_extent_write(c, buf.buf); 126 printbuf_exit(&buf); 127 } 128 129 closure_get(&io->write.ctxt->cl); 130 atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); 131 atomic_inc(&io->write.ctxt->write_ios); 132 133 bch2_data_update_read_done(&io->write, io->rbio.pick.crc); 134 } 135 136 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) 137 { 138 struct moving_io *io = 139 list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); 140 141 return io && io->read_completed ? io : NULL; 142 } 143 144 static void move_read_endio(struct bio *bio) 145 { 146 struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); 147 struct moving_context *ctxt = io->write.ctxt; 148 149 atomic_sub(io->read_sectors, &ctxt->read_sectors); 150 atomic_dec(&ctxt->read_ios); 151 io->read_completed = true; 152 153 wake_up(&ctxt->wait); 154 closure_put(&ctxt->cl); 155 } 156 157 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) 158 { 159 struct moving_io *io; 160 161 while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { 162 bch2_trans_unlock_long(ctxt->trans); 163 list_del(&io->read_list); 164 move_write(io); 165 } 166 } 167 168 void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) 169 { 170 unsigned sectors_pending = atomic_read(&ctxt->write_sectors); 171 172 move_ctxt_wait_event(ctxt, 173 !atomic_read(&ctxt->write_sectors) || 174 atomic_read(&ctxt->write_sectors) != sectors_pending); 175 } 176 177 void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) 178 { 179 move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); 180 bch2_trans_unlock_long(ctxt->trans); 181 closure_sync(&ctxt->cl); 182 } 183 184 void bch2_moving_ctxt_exit(struct moving_context *ctxt) 185 { 186 struct bch_fs *c = ctxt->trans->c; 187 188 bch2_moving_ctxt_flush_all(ctxt); 189 190 EBUG_ON(atomic_read(&ctxt->write_sectors)); 191 EBUG_ON(atomic_read(&ctxt->write_ios)); 192 EBUG_ON(atomic_read(&ctxt->read_sectors)); 193 EBUG_ON(atomic_read(&ctxt->read_ios)); 194 195 mutex_lock(&c->moving_context_lock); 196 list_del(&ctxt->list); 197 mutex_unlock(&c->moving_context_lock); 198 199 bch2_trans_put(ctxt->trans); 200 memset(ctxt, 0, sizeof(*ctxt)); 201 } 202 203 void bch2_moving_ctxt_init(struct moving_context *ctxt, 204 struct bch_fs *c, 205 struct bch_ratelimit *rate, 206 struct bch_move_stats *stats, 207 struct write_point_specifier wp, 208 bool wait_on_copygc) 209 { 210 memset(ctxt, 0, sizeof(*ctxt)); 211 212 ctxt->trans = bch2_trans_get(c); 213 ctxt->fn = (void *) _RET_IP_; 214 ctxt->rate = rate; 215 ctxt->stats = stats; 216 ctxt->wp = wp; 217 ctxt->wait_on_copygc = wait_on_copygc; 218 219 closure_init_stack(&ctxt->cl); 220 221 mutex_init(&ctxt->lock); 222 INIT_LIST_HEAD(&ctxt->reads); 223 INIT_LIST_HEAD(&ctxt->ios); 224 init_waitqueue_head(&ctxt->wait); 225 226 mutex_lock(&c->moving_context_lock); 227 list_add(&ctxt->list, &c->moving_context_list); 228 mutex_unlock(&c->moving_context_lock); 229 } 230 231 void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) 232 { 233 trace_move_data(c, stats); 234 } 235 236 void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) 237 { 238 memset(stats, 0, sizeof(*stats)); 239 stats->data_type = BCH_DATA_user; 240 scnprintf(stats->name, sizeof(stats->name), "%s", name); 241 } 242 243 int bch2_move_extent(struct moving_context *ctxt, 244 struct move_bucket_in_flight *bucket_in_flight, 245 struct btree_iter *iter, 246 struct bkey_s_c k, 247 struct bch_io_opts io_opts, 248 struct data_update_opts data_opts) 249 { 250 struct btree_trans *trans = ctxt->trans; 251 struct bch_fs *c = trans->c; 252 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 253 struct moving_io *io; 254 const union bch_extent_entry *entry; 255 struct extent_ptr_decoded p; 256 unsigned sectors = k.k->size, pages; 257 int ret = -ENOMEM; 258 259 trace_move_extent2(c, k, &io_opts, &data_opts); 260 261 if (ctxt->stats) 262 ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); 263 264 bch2_data_update_opts_normalize(k, &data_opts); 265 266 if (!data_opts.rewrite_ptrs && 267 !data_opts.extra_replicas) { 268 if (data_opts.kill_ptrs) 269 return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); 270 return 0; 271 } 272 273 /* 274 * Before memory allocations & taking nocow locks in 275 * bch2_data_update_init(): 276 */ 277 bch2_trans_unlock(trans); 278 279 /* write path might have to decompress data: */ 280 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 281 sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); 282 283 pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); 284 io = kzalloc(sizeof(struct moving_io) + 285 sizeof(struct bio_vec) * pages, GFP_KERNEL); 286 if (!io) 287 goto err; 288 289 INIT_LIST_HEAD(&io->io_list); 290 io->write.ctxt = ctxt; 291 io->read_sectors = k.k->size; 292 io->write_sectors = k.k->size; 293 294 bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); 295 bio_set_prio(&io->write.op.wbio.bio, 296 IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 297 298 if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, 299 GFP_KERNEL)) 300 goto err_free; 301 302 io->rbio.c = c; 303 io->rbio.opts = io_opts; 304 bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); 305 io->rbio.bio.bi_vcnt = pages; 306 bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 307 io->rbio.bio.bi_iter.bi_size = sectors << 9; 308 309 io->rbio.bio.bi_opf = REQ_OP_READ; 310 io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); 311 io->rbio.bio.bi_end_io = move_read_endio; 312 313 ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, 314 io_opts, data_opts, iter->btree_id, k); 315 if (ret) 316 goto err_free_pages; 317 318 io->write.op.end_io = move_write_done; 319 320 if (ctxt->rate) 321 bch2_ratelimit_increment(ctxt->rate, k.k->size); 322 323 if (ctxt->stats) { 324 atomic64_inc(&ctxt->stats->keys_moved); 325 atomic64_add(k.k->size, &ctxt->stats->sectors_moved); 326 } 327 328 if (bucket_in_flight) { 329 io->b = bucket_in_flight; 330 atomic_inc(&io->b->count); 331 } 332 333 this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); 334 this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); 335 trace_move_extent_read2(c, k); 336 337 mutex_lock(&ctxt->lock); 338 atomic_add(io->read_sectors, &ctxt->read_sectors); 339 atomic_inc(&ctxt->read_ios); 340 341 list_add_tail(&io->read_list, &ctxt->reads); 342 list_add_tail(&io->io_list, &ctxt->ios); 343 mutex_unlock(&ctxt->lock); 344 345 /* 346 * dropped by move_read_endio() - guards against use after free of 347 * ctxt when doing wakeup 348 */ 349 closure_get(&ctxt->cl); 350 bch2_read_extent(trans, &io->rbio, 351 bkey_start_pos(k.k), 352 iter->btree_id, k, 0, 353 BCH_READ_NODECODE| 354 BCH_READ_LAST_FRAGMENT); 355 return 0; 356 err_free_pages: 357 bio_free_pages(&io->write.op.wbio.bio); 358 err_free: 359 kfree(io); 360 err: 361 if (ret == -BCH_ERR_data_update_done) 362 return 0; 363 364 if (bch2_err_matches(ret, EROFS) || 365 bch2_err_matches(ret, BCH_ERR_transaction_restart)) 366 return ret; 367 368 count_event(c, move_extent_start_fail); 369 370 if (trace_move_extent_start_fail_enabled()) { 371 struct printbuf buf = PRINTBUF; 372 373 bch2_bkey_val_to_text(&buf, c, k); 374 prt_str(&buf, ": "); 375 prt_str(&buf, bch2_err_str(ret)); 376 trace_move_extent_start_fail(c, buf.buf); 377 printbuf_exit(&buf); 378 } 379 return ret; 380 } 381 382 struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, 383 struct per_snapshot_io_opts *io_opts, 384 struct bkey_s_c extent_k) 385 { 386 struct bch_fs *c = trans->c; 387 u32 restart_count = trans->restart_count; 388 int ret = 0; 389 390 if (io_opts->cur_inum != extent_k.k->p.inode) { 391 io_opts->d.nr = 0; 392 393 ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), 394 BTREE_ITER_all_snapshots, k, ({ 395 if (k.k->p.offset != extent_k.k->p.inode) 396 break; 397 398 if (!bkey_is_inode(k.k)) 399 continue; 400 401 struct bch_inode_unpacked inode; 402 BUG_ON(bch2_inode_unpack(k, &inode)); 403 404 struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; 405 bch2_inode_opts_get(&e.io_opts, trans->c, &inode); 406 407 darray_push(&io_opts->d, e); 408 })); 409 io_opts->cur_inum = extent_k.k->p.inode; 410 } 411 412 ret = ret ?: trans_was_restarted(trans, restart_count); 413 if (ret) 414 return ERR_PTR(ret); 415 416 if (extent_k.k->p.snapshot) 417 darray_for_each(io_opts->d, i) 418 if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) 419 return &i->io_opts; 420 421 return &io_opts->fs_io_opts; 422 } 423 424 int bch2_move_get_io_opts_one(struct btree_trans *trans, 425 struct bch_io_opts *io_opts, 426 struct bkey_s_c extent_k) 427 { 428 struct btree_iter iter; 429 struct bkey_s_c k; 430 int ret; 431 432 /* reflink btree? */ 433 if (!extent_k.k->p.inode) { 434 *io_opts = bch2_opts_to_inode_opts(trans->c->opts); 435 return 0; 436 } 437 438 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 439 SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), 440 BTREE_ITER_cached); 441 ret = bkey_err(k); 442 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 443 return ret; 444 445 if (!ret && bkey_is_inode(k.k)) { 446 struct bch_inode_unpacked inode; 447 bch2_inode_unpack(k, &inode); 448 bch2_inode_opts_get(io_opts, trans->c, &inode); 449 } else { 450 *io_opts = bch2_opts_to_inode_opts(trans->c->opts); 451 } 452 453 bch2_trans_iter_exit(trans, &iter); 454 return 0; 455 } 456 457 int bch2_move_ratelimit(struct moving_context *ctxt) 458 { 459 struct bch_fs *c = ctxt->trans->c; 460 bool is_kthread = current->flags & PF_KTHREAD; 461 u64 delay; 462 463 if (ctxt->wait_on_copygc && c->copygc_running) { 464 bch2_moving_ctxt_flush_all(ctxt); 465 wait_event_killable(c->copygc_running_wq, 466 !c->copygc_running || 467 (is_kthread && kthread_should_stop())); 468 } 469 470 do { 471 delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; 472 473 if (is_kthread && kthread_should_stop()) 474 return 1; 475 476 if (delay) 477 move_ctxt_wait_event_timeout(ctxt, 478 freezing(current) || 479 (is_kthread && kthread_should_stop()), 480 delay); 481 482 if (unlikely(freezing(current))) { 483 bch2_moving_ctxt_flush_all(ctxt); 484 try_to_freeze(); 485 } 486 } while (delay); 487 488 /* 489 * XXX: these limits really ought to be per device, SSDs and hard drives 490 * will want different limits 491 */ 492 move_ctxt_wait_event(ctxt, 493 atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && 494 atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && 495 atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && 496 atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); 497 498 return 0; 499 } 500 501 static int bch2_move_data_btree(struct moving_context *ctxt, 502 struct bpos start, 503 struct bpos end, 504 move_pred_fn pred, void *arg, 505 enum btree_id btree_id) 506 { 507 struct btree_trans *trans = ctxt->trans; 508 struct bch_fs *c = trans->c; 509 struct per_snapshot_io_opts snapshot_io_opts; 510 struct bch_io_opts *io_opts; 511 struct bkey_buf sk; 512 struct btree_iter iter; 513 struct bkey_s_c k; 514 struct data_update_opts data_opts; 515 int ret = 0, ret2; 516 517 per_snapshot_io_opts_init(&snapshot_io_opts, c); 518 bch2_bkey_buf_init(&sk); 519 520 if (ctxt->stats) { 521 ctxt->stats->data_type = BCH_DATA_user; 522 ctxt->stats->pos = BBPOS(btree_id, start); 523 } 524 525 bch2_trans_begin(trans); 526 bch2_trans_iter_init(trans, &iter, btree_id, start, 527 BTREE_ITER_prefetch| 528 BTREE_ITER_all_snapshots); 529 530 if (ctxt->rate) 531 bch2_ratelimit_reset(ctxt->rate); 532 533 while (!bch2_move_ratelimit(ctxt)) { 534 bch2_trans_begin(trans); 535 536 k = bch2_btree_iter_peek(&iter); 537 if (!k.k) 538 break; 539 540 ret = bkey_err(k); 541 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 542 continue; 543 if (ret) 544 break; 545 546 if (bkey_ge(bkey_start_pos(k.k), end)) 547 break; 548 549 if (ctxt->stats) 550 ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); 551 552 if (!bkey_extent_is_direct_data(k.k)) 553 goto next_nondata; 554 555 io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); 556 ret = PTR_ERR_OR_ZERO(io_opts); 557 if (ret) 558 continue; 559 560 memset(&data_opts, 0, sizeof(data_opts)); 561 if (!pred(c, arg, k, io_opts, &data_opts)) 562 goto next; 563 564 /* 565 * The iterator gets unlocked by __bch2_read_extent - need to 566 * save a copy of @k elsewhere: 567 */ 568 bch2_bkey_buf_reassemble(&sk, c, k); 569 k = bkey_i_to_s_c(sk.k); 570 571 ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); 572 if (ret2) { 573 if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) 574 continue; 575 576 if (ret2 == -ENOMEM) { 577 /* memory allocation failure, wait for some IO to finish */ 578 bch2_move_ctxt_wait_for_io(ctxt); 579 continue; 580 } 581 582 /* XXX signal failure */ 583 goto next; 584 } 585 next: 586 if (ctxt->stats) 587 atomic64_add(k.k->size, &ctxt->stats->sectors_seen); 588 next_nondata: 589 bch2_btree_iter_advance(&iter); 590 } 591 592 bch2_trans_iter_exit(trans, &iter); 593 bch2_bkey_buf_exit(&sk, c); 594 per_snapshot_io_opts_exit(&snapshot_io_opts); 595 596 return ret; 597 } 598 599 int __bch2_move_data(struct moving_context *ctxt, 600 struct bbpos start, 601 struct bbpos end, 602 move_pred_fn pred, void *arg) 603 { 604 struct bch_fs *c = ctxt->trans->c; 605 enum btree_id id; 606 int ret = 0; 607 608 for (id = start.btree; 609 id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); 610 id++) { 611 ctxt->stats->pos = BBPOS(id, POS_MIN); 612 613 if (!btree_type_has_ptrs(id) || 614 !bch2_btree_id_root(c, id)->b) 615 continue; 616 617 ret = bch2_move_data_btree(ctxt, 618 id == start.btree ? start.pos : POS_MIN, 619 id == end.btree ? end.pos : POS_MAX, 620 pred, arg, id); 621 if (ret) 622 break; 623 } 624 625 return ret; 626 } 627 628 int bch2_move_data(struct bch_fs *c, 629 struct bbpos start, 630 struct bbpos end, 631 struct bch_ratelimit *rate, 632 struct bch_move_stats *stats, 633 struct write_point_specifier wp, 634 bool wait_on_copygc, 635 move_pred_fn pred, void *arg) 636 { 637 638 struct moving_context ctxt; 639 int ret; 640 641 bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); 642 ret = __bch2_move_data(&ctxt, start, end, pred, arg); 643 bch2_moving_ctxt_exit(&ctxt); 644 645 return ret; 646 } 647 648 int bch2_evacuate_bucket(struct moving_context *ctxt, 649 struct move_bucket_in_flight *bucket_in_flight, 650 struct bpos bucket, int gen, 651 struct data_update_opts _data_opts) 652 { 653 struct btree_trans *trans = ctxt->trans; 654 struct bch_fs *c = trans->c; 655 bool is_kthread = current->flags & PF_KTHREAD; 656 struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); 657 struct btree_iter iter; 658 struct bkey_buf sk; 659 struct bch_backpointer bp; 660 struct bch_alloc_v4 a_convert; 661 const struct bch_alloc_v4 *a; 662 struct bkey_s_c k; 663 struct data_update_opts data_opts; 664 unsigned dirty_sectors, bucket_size; 665 u64 fragmentation; 666 struct bpos bp_pos = POS_MIN; 667 int ret = 0; 668 669 struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); 670 if (!ca) 671 return 0; 672 673 trace_bucket_evacuate(c, &bucket); 674 675 bch2_bkey_buf_init(&sk); 676 677 /* 678 * We're not run in a context that handles transaction restarts: 679 */ 680 bch2_trans_begin(trans); 681 682 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, 683 bucket, BTREE_ITER_cached); 684 ret = lockrestart_do(trans, 685 bkey_err(k = bch2_btree_iter_peek_slot(&iter))); 686 bch2_trans_iter_exit(trans, &iter); 687 688 bch_err_msg(c, ret, "looking up alloc key"); 689 if (ret) 690 goto err; 691 692 a = bch2_alloc_to_v4(k, &a_convert); 693 dirty_sectors = bch2_bucket_sectors_dirty(*a); 694 bucket_size = ca->mi.bucket_size; 695 fragmentation = alloc_lru_idx_fragmentation(*a, ca); 696 697 ret = bch2_btree_write_buffer_tryflush(trans); 698 bch_err_msg(c, ret, "flushing btree write buffer"); 699 if (ret) 700 goto err; 701 702 while (!(ret = bch2_move_ratelimit(ctxt))) { 703 if (is_kthread && kthread_should_stop()) 704 break; 705 706 bch2_trans_begin(trans); 707 708 ret = bch2_get_next_backpointer(trans, ca, bucket, gen, 709 &bp_pos, &bp, 710 BTREE_ITER_cached); 711 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 712 continue; 713 if (ret) 714 goto err; 715 if (bkey_eq(bp_pos, POS_MAX)) 716 break; 717 718 if (!bp.level) { 719 k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); 720 ret = bkey_err(k); 721 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 722 continue; 723 if (ret) 724 goto err; 725 if (!k.k) 726 goto next; 727 728 bch2_bkey_buf_reassemble(&sk, c, k); 729 k = bkey_i_to_s_c(sk.k); 730 731 ret = bch2_move_get_io_opts_one(trans, &io_opts, k); 732 if (ret) { 733 bch2_trans_iter_exit(trans, &iter); 734 continue; 735 } 736 737 data_opts = _data_opts; 738 data_opts.target = io_opts.background_target; 739 data_opts.rewrite_ptrs = 0; 740 741 unsigned i = 0; 742 bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { 743 if (ptr->dev == bucket.inode) { 744 data_opts.rewrite_ptrs |= 1U << i; 745 if (ptr->cached) { 746 bch2_trans_iter_exit(trans, &iter); 747 goto next; 748 } 749 } 750 i++; 751 } 752 753 ret = bch2_move_extent(ctxt, bucket_in_flight, 754 &iter, k, io_opts, data_opts); 755 bch2_trans_iter_exit(trans, &iter); 756 757 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 758 continue; 759 if (ret == -ENOMEM) { 760 /* memory allocation failure, wait for some IO to finish */ 761 bch2_move_ctxt_wait_for_io(ctxt); 762 continue; 763 } 764 if (ret) 765 goto err; 766 767 if (ctxt->stats) 768 atomic64_add(k.k->size, &ctxt->stats->sectors_seen); 769 } else { 770 struct btree *b; 771 772 b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); 773 ret = PTR_ERR_OR_ZERO(b); 774 if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) 775 continue; 776 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 777 continue; 778 if (ret) 779 goto err; 780 if (!b) 781 goto next; 782 783 unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); 784 785 ret = bch2_btree_node_rewrite(trans, &iter, b, 0); 786 bch2_trans_iter_exit(trans, &iter); 787 788 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 789 continue; 790 if (ret) 791 goto err; 792 793 if (ctxt->rate) 794 bch2_ratelimit_increment(ctxt->rate, sectors); 795 if (ctxt->stats) { 796 atomic64_add(sectors, &ctxt->stats->sectors_seen); 797 atomic64_add(sectors, &ctxt->stats->sectors_moved); 798 } 799 } 800 next: 801 bp_pos = bpos_nosnap_successor(bp_pos); 802 } 803 804 trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); 805 err: 806 bch2_dev_put(ca); 807 bch2_bkey_buf_exit(&sk, c); 808 return ret; 809 } 810 811 typedef bool (*move_btree_pred)(struct bch_fs *, void *, 812 struct btree *, struct bch_io_opts *, 813 struct data_update_opts *); 814 815 static int bch2_move_btree(struct bch_fs *c, 816 struct bbpos start, 817 struct bbpos end, 818 move_btree_pred pred, void *arg, 819 struct bch_move_stats *stats) 820 { 821 bool kthread = (current->flags & PF_KTHREAD) != 0; 822 struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); 823 struct moving_context ctxt; 824 struct btree_trans *trans; 825 struct btree_iter iter; 826 struct btree *b; 827 enum btree_id btree; 828 struct data_update_opts data_opts; 829 int ret = 0; 830 831 bch2_moving_ctxt_init(&ctxt, c, NULL, stats, 832 writepoint_ptr(&c->btree_write_point), 833 true); 834 trans = ctxt.trans; 835 836 stats->data_type = BCH_DATA_btree; 837 838 for (btree = start.btree; 839 btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); 840 btree ++) { 841 stats->pos = BBPOS(btree, POS_MIN); 842 843 if (!bch2_btree_id_root(c, btree)->b) 844 continue; 845 846 bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, 847 BTREE_ITER_prefetch); 848 retry: 849 ret = 0; 850 while (bch2_trans_begin(trans), 851 (b = bch2_btree_iter_peek_node(&iter)) && 852 !(ret = PTR_ERR_OR_ZERO(b))) { 853 if (kthread && kthread_should_stop()) 854 break; 855 856 if ((cmp_int(btree, end.btree) ?: 857 bpos_cmp(b->key.k.p, end.pos)) > 0) 858 break; 859 860 stats->pos = BBPOS(iter.btree_id, iter.pos); 861 862 if (!pred(c, arg, b, &io_opts, &data_opts)) 863 goto next; 864 865 ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; 866 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 867 continue; 868 if (ret) 869 break; 870 next: 871 bch2_btree_iter_next_node(&iter); 872 } 873 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 874 goto retry; 875 876 bch2_trans_iter_exit(trans, &iter); 877 878 if (kthread && kthread_should_stop()) 879 break; 880 } 881 882 bch_err_fn(c, ret); 883 bch2_moving_ctxt_exit(&ctxt); 884 bch2_btree_interior_updates_flush(c); 885 886 return ret; 887 } 888 889 static bool rereplicate_pred(struct bch_fs *c, void *arg, 890 struct bkey_s_c k, 891 struct bch_io_opts *io_opts, 892 struct data_update_opts *data_opts) 893 { 894 unsigned nr_good = bch2_bkey_durability(c, k); 895 unsigned replicas = bkey_is_btree_ptr(k.k) 896 ? c->opts.metadata_replicas 897 : io_opts->data_replicas; 898 899 rcu_read_lock(); 900 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 901 unsigned i = 0; 902 bkey_for_each_ptr(ptrs, ptr) { 903 struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); 904 if (!ptr->cached && 905 (!ca || !ca->mi.durability)) 906 data_opts->kill_ptrs |= BIT(i); 907 i++; 908 } 909 rcu_read_unlock(); 910 911 if (!data_opts->kill_ptrs && 912 (!nr_good || nr_good >= replicas)) 913 return false; 914 915 data_opts->target = 0; 916 data_opts->extra_replicas = replicas - nr_good; 917 data_opts->btree_insert_flags = 0; 918 return true; 919 } 920 921 static bool migrate_pred(struct bch_fs *c, void *arg, 922 struct bkey_s_c k, 923 struct bch_io_opts *io_opts, 924 struct data_update_opts *data_opts) 925 { 926 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 927 struct bch_ioctl_data *op = arg; 928 unsigned i = 0; 929 930 data_opts->rewrite_ptrs = 0; 931 data_opts->target = 0; 932 data_opts->extra_replicas = 0; 933 data_opts->btree_insert_flags = 0; 934 935 bkey_for_each_ptr(ptrs, ptr) { 936 if (ptr->dev == op->migrate.dev) 937 data_opts->rewrite_ptrs |= 1U << i; 938 i++; 939 } 940 941 return data_opts->rewrite_ptrs != 0; 942 } 943 944 static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, 945 struct btree *b, 946 struct bch_io_opts *io_opts, 947 struct data_update_opts *data_opts) 948 { 949 return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); 950 } 951 952 static bool migrate_btree_pred(struct bch_fs *c, void *arg, 953 struct btree *b, 954 struct bch_io_opts *io_opts, 955 struct data_update_opts *data_opts) 956 { 957 return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); 958 } 959 960 /* 961 * Ancient versions of bcachefs produced packed formats which could represent 962 * keys that the in memory format cannot represent; this checks for those 963 * formats so we can get rid of them. 964 */ 965 static bool bformat_needs_redo(struct bkey_format *f) 966 { 967 for (unsigned i = 0; i < f->nr_fields; i++) 968 if (bch2_bkey_format_field_overflows(f, i)) 969 return true; 970 971 return false; 972 } 973 974 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, 975 struct btree *b, 976 struct bch_io_opts *io_opts, 977 struct data_update_opts *data_opts) 978 { 979 if (b->version_ondisk != c->sb.version || 980 btree_node_need_rewrite(b) || 981 bformat_needs_redo(&b->format)) { 982 data_opts->target = 0; 983 data_opts->extra_replicas = 0; 984 data_opts->btree_insert_flags = 0; 985 return true; 986 } 987 988 return false; 989 } 990 991 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) 992 { 993 int ret; 994 995 ret = bch2_move_btree(c, 996 BBPOS_MIN, 997 BBPOS_MAX, 998 rewrite_old_nodes_pred, c, stats); 999 if (!ret) { 1000 mutex_lock(&c->sb_lock); 1001 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); 1002 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); 1003 c->disk_sb.sb->version_min = c->disk_sb.sb->version; 1004 bch2_write_super(c); 1005 mutex_unlock(&c->sb_lock); 1006 } 1007 1008 bch_err_fn(c, ret); 1009 return ret; 1010 } 1011 1012 static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, 1013 struct bkey_s_c k, 1014 struct bch_io_opts *io_opts, 1015 struct data_update_opts *data_opts) 1016 { 1017 unsigned durability = bch2_bkey_durability(c, k); 1018 unsigned replicas = bkey_is_btree_ptr(k.k) 1019 ? c->opts.metadata_replicas 1020 : io_opts->data_replicas; 1021 const union bch_extent_entry *entry; 1022 struct extent_ptr_decoded p; 1023 unsigned i = 0; 1024 1025 rcu_read_lock(); 1026 bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { 1027 unsigned d = bch2_extent_ptr_durability(c, &p); 1028 1029 if (d && durability - d >= replicas) { 1030 data_opts->kill_ptrs |= BIT(i); 1031 durability -= d; 1032 } 1033 1034 i++; 1035 } 1036 rcu_read_unlock(); 1037 1038 return data_opts->kill_ptrs != 0; 1039 } 1040 1041 static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, 1042 struct btree *b, 1043 struct bch_io_opts *io_opts, 1044 struct data_update_opts *data_opts) 1045 { 1046 return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); 1047 } 1048 1049 int bch2_data_job(struct bch_fs *c, 1050 struct bch_move_stats *stats, 1051 struct bch_ioctl_data op) 1052 { 1053 struct bbpos start = BBPOS(op.start_btree, op.start_pos); 1054 struct bbpos end = BBPOS(op.end_btree, op.end_pos); 1055 int ret = 0; 1056 1057 if (op.op >= BCH_DATA_OP_NR) 1058 return -EINVAL; 1059 1060 bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); 1061 1062 switch (op.op) { 1063 case BCH_DATA_OP_rereplicate: 1064 stats->data_type = BCH_DATA_journal; 1065 ret = bch2_journal_flush_device_pins(&c->journal, -1); 1066 ret = bch2_move_btree(c, start, end, 1067 rereplicate_btree_pred, c, stats) ?: ret; 1068 ret = bch2_move_data(c, start, end, 1069 NULL, 1070 stats, 1071 writepoint_hashed((unsigned long) current), 1072 true, 1073 rereplicate_pred, c) ?: ret; 1074 ret = bch2_replicas_gc2(c) ?: ret; 1075 break; 1076 case BCH_DATA_OP_migrate: 1077 if (op.migrate.dev >= c->sb.nr_devices) 1078 return -EINVAL; 1079 1080 stats->data_type = BCH_DATA_journal; 1081 ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); 1082 ret = bch2_move_btree(c, start, end, 1083 migrate_btree_pred, &op, stats) ?: ret; 1084 ret = bch2_move_data(c, start, end, 1085 NULL, 1086 stats, 1087 writepoint_hashed((unsigned long) current), 1088 true, 1089 migrate_pred, &op) ?: ret; 1090 ret = bch2_replicas_gc2(c) ?: ret; 1091 break; 1092 case BCH_DATA_OP_rewrite_old_nodes: 1093 ret = bch2_scan_old_btree_nodes(c, stats); 1094 break; 1095 case BCH_DATA_OP_drop_extra_replicas: 1096 ret = bch2_move_btree(c, start, end, 1097 drop_extra_replicas_btree_pred, c, stats) ?: ret; 1098 ret = bch2_move_data(c, start, end, NULL, stats, 1099 writepoint_hashed((unsigned long) current), 1100 true, 1101 drop_extra_replicas_pred, c) ?: ret; 1102 ret = bch2_replicas_gc2(c) ?: ret; 1103 break; 1104 default: 1105 ret = -EINVAL; 1106 } 1107 1108 bch2_move_stats_exit(stats, c); 1109 return ret; 1110 } 1111 1112 void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) 1113 { 1114 prt_printf(out, "%s: data type==", stats->name); 1115 bch2_prt_data_type(out, stats->data_type); 1116 prt_str(out, " pos="); 1117 bch2_bbpos_to_text(out, stats->pos); 1118 prt_newline(out); 1119 printbuf_indent_add(out, 2); 1120 1121 prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); 1122 prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); 1123 prt_printf(out, "bytes seen: "); 1124 prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); 1125 prt_newline(out); 1126 1127 prt_printf(out, "bytes moved: "); 1128 prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); 1129 prt_newline(out); 1130 1131 prt_printf(out, "bytes raced: "); 1132 prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); 1133 prt_newline(out); 1134 1135 printbuf_indent_sub(out, 2); 1136 } 1137 1138 static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) 1139 { 1140 struct moving_io *io; 1141 1142 bch2_move_stats_to_text(out, ctxt->stats); 1143 printbuf_indent_add(out, 2); 1144 1145 prt_printf(out, "reads: ios %u/%u sectors %u/%u\n", 1146 atomic_read(&ctxt->read_ios), 1147 c->opts.move_ios_in_flight, 1148 atomic_read(&ctxt->read_sectors), 1149 c->opts.move_bytes_in_flight >> 9); 1150 1151 prt_printf(out, "writes: ios %u/%u sectors %u/%u\n", 1152 atomic_read(&ctxt->write_ios), 1153 c->opts.move_ios_in_flight, 1154 atomic_read(&ctxt->write_sectors), 1155 c->opts.move_bytes_in_flight >> 9); 1156 1157 printbuf_indent_add(out, 2); 1158 1159 mutex_lock(&ctxt->lock); 1160 list_for_each_entry(io, &ctxt->ios, io_list) 1161 bch2_write_op_to_text(out, &io->write.op); 1162 mutex_unlock(&ctxt->lock); 1163 1164 printbuf_indent_sub(out, 4); 1165 } 1166 1167 void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) 1168 { 1169 struct moving_context *ctxt; 1170 1171 mutex_lock(&c->moving_context_lock); 1172 list_for_each_entry(ctxt, &c->moving_context_list, list) 1173 bch2_moving_ctxt_to_text(out, c, ctxt); 1174 mutex_unlock(&c->moving_context_lock); 1175 } 1176 1177 void bch2_fs_move_init(struct bch_fs *c) 1178 { 1179 INIT_LIST_HEAD(&c->moving_context_list); 1180 mutex_init(&c->moving_context_lock); 1181 } 1182