// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DATA_OPS()
#undef x
	NULL
};

static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
					  struct bch_io_opts *io_opts,
					  struct data_update_opts *data_opts)
{
	printbuf_tabstop_push(out, 20);
	prt_str(out, "rewrite ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
	prt_newline(out);

	prt_str(out, "kill ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->kill_ptrs);
	prt_newline(out);

	prt_str(out, "target:\t");
	bch2_target_to_text(out, c, data_opts->target);
	prt_newline(out);

	prt_str(out, "compression:\t");
	bch2_compression_opt_to_text(out, background_compression(*io_opts));
	prt_newline(out);

	prt_str(out, "extra replicas:\t");
	prt_u64(out, data_opts->extra_replicas);
}

static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
{
	if (trace_move_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
		bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
		trace_move_extent(c, buf.buf);
		printbuf_exit(&buf);
	}
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_read_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent_read(c, buf.buf);
		printbuf_exit(&buf);
	}
}

struct moving_io {
	struct list_head	read_list;
	struct list_head	io_list;
	struct move_bucket_in_flight *b;
	struct closure		cl;
	bool			read_completed;

	unsigned		read_sectors;
	unsigned		write_sectors;

	struct bch_read_bio	rbio;

	struct data_update	write;
	/* Must be last since it is variable size */
	struct bio_vec		bi_inline_vecs[];
};

static void move_free(struct moving_io *io)
{
	struct moving_context *ctxt = io->write.ctxt;

	if (io->b)
		atomic_dec(&io->b->count);

	bch2_data_update_exit(&io->write);

	mutex_lock(&ctxt->lock);
	list_del(&io->io_list);
	wake_up(&ctxt->wait);
	mutex_unlock(&ctxt->lock);

	kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
	struct moving_io *io = container_of(op, struct moving_io, write.op);
	struct moving_context *ctxt = io->write.ctxt;

	if (io->write.op.error)
		ctxt->write_error = true;

	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_dec(&io->write.ctxt->write_ios);
	move_free(io);
	closure_put(&ctxt->cl);
}

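/*
 * Hand a completed read off to the write path: if the read failed or hit a
 * hole there is nothing to rewrite, so the update is torn down; otherwise the
 * in-flight write is accounted and the data update write is started.
 */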
static void move_write(struct moving_io *io)
{
	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
		move_free(io);
		return;
	}

	if (trace_move_extent_write_enabled()) {
		struct bch_fs *c = io->write.op.c;
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
		trace_move_extent_write(c, buf.buf);
		printbuf_exit(&buf);
	}

	closure_get(&io->write.ctxt->cl);
	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_inc(&io->write.ctxt->write_ios);

	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
	struct moving_io *io =
		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

	return io && io->read_completed ? io : NULL;
}

static void move_read_endio(struct bio *bio)
{
	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
	struct moving_context *ctxt = io->write.ctxt;

	atomic_sub(io->read_sectors, &ctxt->read_sectors);
	atomic_dec(&ctxt->read_ios);
	io->read_completed = true;

	wake_up(&ctxt->wait);
	closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
	struct moving_io *io;

	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
		bch2_trans_unlock_long(ctxt->trans);
		list_del(&io->read_list);
		move_write(io);
	}
}

void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

	move_ctxt_wait_event(ctxt,
		!atomic_read(&ctxt->write_sectors) ||
		atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
	move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
	bch2_trans_unlock_long(ctxt->trans);
	closure_sync(&ctxt->cl);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;

	bch2_moving_ctxt_flush_all(ctxt);

	EBUG_ON(atomic_read(&ctxt->write_sectors));
	EBUG_ON(atomic_read(&ctxt->write_ios));
	EBUG_ON(atomic_read(&ctxt->read_sectors));
	EBUG_ON(atomic_read(&ctxt->read_ios));

	mutex_lock(&c->moving_context_lock);
	list_del(&ctxt->list);
	mutex_unlock(&c->moving_context_lock);

	bch2_trans_put(ctxt->trans);
	memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
			   struct bch_fs *c,
			   struct bch_ratelimit *rate,
			   struct bch_move_stats *stats,
			   struct write_point_specifier wp,
			   bool wait_on_copygc)
{
	memset(ctxt, 0, sizeof(*ctxt));

	ctxt->trans		= bch2_trans_get(c);
	ctxt->fn		= (void *) _RET_IP_;
	ctxt->rate		= rate;
	ctxt->stats		= stats;
	ctxt->wp		= wp;
	ctxt->wait_on_copygc	= wait_on_copygc;

	closure_init_stack(&ctxt->cl);

	mutex_init(&ctxt->lock);
	INIT_LIST_HEAD(&ctxt->reads);
	INIT_LIST_HEAD(&ctxt->ios);
	init_waitqueue_head(&ctxt->wait);

	mutex_lock(&c->moving_context_lock);
	list_add(&ctxt->list, &c->moving_context_list);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
	trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
	memset(stats, 0, sizeof(*stats));
	stats->data_type = BCH_DATA_user;
	scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

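/*
 * Queue a single extent to be moved: allocate a moving_io, set up the data
 * update, and issue the read; the write is issued later, once the read has
 * completed.
 */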
int bch2_move_extent(struct moving_context *ctxt,
		     struct move_bucket_in_flight *bucket_in_flight,
		     struct btree_iter *iter,
		     struct bkey_s_c k,
		     struct bch_io_opts io_opts,
		     struct data_update_opts data_opts)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct moving_io *io;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned sectors = k.k->size, pages;
	int ret = -ENOMEM;

	trace_move_extent2(c, k, &io_opts, &data_opts);

	if (ctxt->stats)
		ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);

	bch2_data_update_opts_normalize(k, &data_opts);

	if (!data_opts.rewrite_ptrs &&
	    !data_opts.extra_replicas) {
		if (data_opts.kill_ptrs)
			return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
		return 0;
	}

	/*
	 * Before memory allocations & taking nocow locks in
	 * bch2_data_update_init():
	 */
	bch2_trans_unlock(trans);

	/* write path might have to decompress data: */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	io = kzalloc(sizeof(struct moving_io) +
		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
	if (!io)
		goto err;

	INIT_LIST_HEAD(&io->io_list);
	io->write.ctxt		= ctxt;
	io->read_sectors	= k.k->size;
	io->write_sectors	= k.k->size;

	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	bio_set_prio(&io->write.op.wbio.bio,
		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
				 GFP_KERNEL))
		goto err_free;

	io->rbio.c		= c;
	io->rbio.opts		= io_opts;
	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	io->rbio.bio.bi_vcnt = pages;
	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
	io->rbio.bio.bi_iter.bi_size = sectors << 9;

	io->rbio.bio.bi_opf		= REQ_OP_READ;
	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
	io->rbio.bio.bi_end_io		= move_read_endio;

	ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
				    io_opts, data_opts, iter->btree_id, k);
	if (ret)
		goto err_free_pages;

	io->write.op.end_io = move_write_done;

	if (ctxt->rate)
		bch2_ratelimit_increment(ctxt->rate, k.k->size);

	if (ctxt->stats) {
		atomic64_inc(&ctxt->stats->keys_moved);
		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
	}

	if (bucket_in_flight) {
		io->b = bucket_in_flight;
		atomic_inc(&io->b->count);
	}

	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
	trace_move_extent_read2(c, k);

	mutex_lock(&ctxt->lock);
	atomic_add(io->read_sectors, &ctxt->read_sectors);
	atomic_inc(&ctxt->read_ios);

	list_add_tail(&io->read_list, &ctxt->reads);
	list_add_tail(&io->io_list, &ctxt->ios);
	mutex_unlock(&ctxt->lock);

	/*
	 * dropped by move_read_endio() - guards against use after free of
	 * ctxt when doing wakeup
	 */
	closure_get(&ctxt->cl);
	bch2_read_extent(trans, &io->rbio,
			 bkey_start_pos(k.k),
			 iter->btree_id, k, 0,
			 BCH_READ_NODECODE|
			 BCH_READ_LAST_FRAGMENT);
	return 0;
err_free_pages:
	bio_free_pages(&io->write.op.wbio.bio);
err_free:
	kfree(io);
err:
	if (ret == -BCH_ERR_data_update_done)
		return 0;

	if (bch2_err_matches(ret, EROFS) ||
	    bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	count_event(c, move_extent_start_fail);

	if (trace_move_extent_start_fail_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_str(&buf, ": ");
		prt_str(&buf, bch2_err_str(ret));
		trace_move_extent_start_fail(c, buf.buf);
		printbuf_exit(&buf);
	}
	return ret;
}

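/*
 * Look up the io options that apply to an extent, caching the unpacked
 * per-snapshot inode options for the extent's inode so the inodes btree isn't
 * re-walked for every key.
 */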
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
					  struct per_snapshot_io_opts *io_opts,
					  struct bkey_s_c extent_k)
{
	struct bch_fs *c = trans->c;
	u32 restart_count = trans->restart_count;
	int ret = 0;

	if (io_opts->cur_inum != extent_k.k->p.inode) {
		io_opts->d.nr = 0;

		ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
					 BTREE_ITER_all_snapshots, k, ({
			if (k.k->p.offset != extent_k.k->p.inode)
				break;

			if (!bkey_is_inode(k.k))
				continue;

			struct bch_inode_unpacked inode;
			BUG_ON(bch2_inode_unpack(k, &inode));

			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
			bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

			darray_push(&io_opts->d, e);
		}));
		io_opts->cur_inum = extent_k.k->p.inode;
	}

	ret = ret ?: trans_was_restarted(trans, restart_count);
	if (ret)
		return ERR_PTR(ret);

	if (extent_k.k->p.snapshot)
		darray_for_each(io_opts->d, i)
			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
				return &i->io_opts;

	return &io_opts->fs_io_opts;
}

int bch2_move_get_io_opts_one(struct btree_trans *trans,
			      struct bch_io_opts *io_opts,
			      struct bkey_s_c extent_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/* reflink btree? */
	if (!extent_k.k->p.inode) {
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
		return 0;
	}

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
			       BTREE_ITER_cached);
	ret = bkey_err(k);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	if (!ret && bkey_is_inode(k.k)) {
		struct bch_inode_unpacked inode;
		bch2_inode_unpack(k, &inode);
		bch2_inode_opts_get(io_opts, trans->c, &inode);
	} else {
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
	}

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

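/*
 * Throttle the move path: sleep according to the optional rate limiter and
 * wait until in-flight move reads and writes are back under the configured
 * limits. Returns nonzero if the calling kthread should stop.
 */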
int bch2_move_ratelimit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;
	bool is_kthread = current->flags & PF_KTHREAD;
	u64 delay;

	if (ctxt->wait_on_copygc && c->copygc_running) {
		bch2_moving_ctxt_flush_all(ctxt);
		wait_event_killable(c->copygc_running_wq,
				    !c->copygc_running ||
				    (is_kthread && kthread_should_stop()));
	}

	do {
		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

		if (is_kthread && kthread_should_stop())
			return 1;

		if (delay)
			move_ctxt_wait_event_timeout(ctxt,
						     freezing(current) ||
						     (is_kthread && kthread_should_stop()),
						     delay);

		if (unlikely(freezing(current))) {
			bch2_moving_ctxt_flush_all(ctxt);
			try_to_freeze();
		}
	} while (delay);

	/*
	 * XXX: these limits really ought to be per device, SSDs and hard drives
	 * will want different limits
	 */
	move_ctxt_wait_event(ctxt,
		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

	return 0;
}

static int bch2_move_data_btree(struct moving_context *ctxt,
				struct bpos start,
				struct bpos end,
				move_pred_fn pred, void *arg,
				enum btree_id btree_id)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct per_snapshot_io_opts snapshot_io_opts;
	struct bch_io_opts *io_opts;
	struct bkey_buf sk;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	int ret = 0, ret2;

	per_snapshot_io_opts_init(&snapshot_io_opts, c);
	bch2_bkey_buf_init(&sk);

	if (ctxt->stats) {
		ctxt->stats->data_type	= BCH_DATA_user;
		ctxt->stats->pos	= BBPOS(btree_id, start);
	}

	bch2_trans_iter_init(trans, &iter, btree_id, start,
			     BTREE_ITER_prefetch|
			     BTREE_ITER_all_snapshots);

	if (ctxt->rate)
		bch2_ratelimit_reset(ctxt->rate);

	while (!bch2_move_ratelimit(ctxt)) {
		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_ge(bkey_start_pos(k.k), end))
			break;

		if (ctxt->stats)
			ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

		if (!bkey_extent_is_direct_data(k.k))
			goto next_nondata;

		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
		ret = PTR_ERR_OR_ZERO(io_opts);
		if (ret)
			continue;

		memset(&data_opts, 0, sizeof(data_opts));
		if (!pred(c, arg, k, io_opts, &data_opts))
			goto next;

		/*
		 * The iterator gets unlocked by __bch2_read_extent - need to
		 * save a copy of @k elsewhere:
		 */
		bch2_bkey_buf_reassemble(&sk, c, k);
		k = bkey_i_to_s_c(sk.k);

		ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
		if (ret2) {
			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
				continue;

			if (ret2 == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}

			/* XXX signal failure */
			goto next;
		}
next:
		if (ctxt->stats)
			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
		bch2_btree_iter_advance(&iter);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&sk, c);
	per_snapshot_io_opts_exit(&snapshot_io_opts);

	return ret;
}

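/*
 * Walk the btrees with data pointers in [start, end], calling @pred on each
 * extent; extents it selects are rewritten via bch2_move_extent().
 */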
int __bch2_move_data(struct moving_context *ctxt,
		     struct bbpos start,
		     struct bbpos end,
		     move_pred_fn pred, void *arg)
{
	struct bch_fs *c = ctxt->trans->c;
	enum btree_id id;
	int ret = 0;

	for (id = start.btree;
	     id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     id++) {
		ctxt->stats->pos = BBPOS(id, POS_MIN);

		if (!btree_type_has_ptrs(id) ||
		    !bch2_btree_id_root(c, id)->b)
			continue;

		ret = bch2_move_data_btree(ctxt,
					   id == start.btree ? start.pos : POS_MIN,
					   id == end.btree   ? end.pos   : POS_MAX,
					   pred, arg, id);
		if (ret)
			break;
	}

	return ret;
}

int bch2_move_data(struct bch_fs *c,
		   struct bbpos start,
		   struct bbpos end,
		   struct bch_ratelimit *rate,
		   struct bch_move_stats *stats,
		   struct write_point_specifier wp,
		   bool wait_on_copygc,
		   move_pred_fn pred, void *arg)
{
	struct moving_context ctxt;
	int ret;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
	ret = __bch2_move_data(&ctxt, start, end, pred, arg);
	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

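/*
 * Evacuate a single bucket: walk its backpointers and move every extent and
 * btree node that still points into it, so the bucket can be reused.
 */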
int bch2_evacuate_bucket(struct moving_context *ctxt,
			 struct move_bucket_in_flight *bucket_in_flight,
			 struct bpos bucket, int gen,
			 struct data_update_opts _data_opts)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	bool is_kthread = current->flags & PF_KTHREAD;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bch_backpointer bp;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	unsigned dirty_sectors, bucket_size;
	u64 fragmentation;
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
	if (!ca)
		return 0;

	trace_bucket_evacuate(c, &bucket);

	bch2_bkey_buf_init(&sk);

	/*
	 * We're not run in a context that handles transaction restarts:
	 */
	bch2_trans_begin(trans);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     bucket, BTREE_ITER_cached);
	ret = lockrestart_do(trans,
			     bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	bch2_trans_iter_exit(trans, &iter);

	bch_err_msg(c, ret, "looking up alloc key");
	if (ret)
		goto err;

	a = bch2_alloc_to_v4(k, &a_convert);
	dirty_sectors = bch2_bucket_sectors_dirty(*a);
	bucket_size = ca->mi.bucket_size;
	fragmentation = a->fragmentation_lru;

	ret = bch2_btree_write_buffer_tryflush(trans);
	bch_err_msg(c, ret, "flushing btree write buffer");
	if (ret)
		goto err;

	while (!(ret = bch2_move_ratelimit(ctxt))) {
		if (is_kthread && kthread_should_stop())
			break;

		bch2_trans_begin(trans);

		ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
						&bp_pos, &bp,
						BTREE_ITER_cached);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			goto err;
		if (bkey_eq(bp_pos, POS_MAX))
			break;

		if (!bp.level) {
			k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
			ret = bkey_err(k);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!k.k)
				goto next;

			bch2_bkey_buf_reassemble(&sk, c, k);
			k = bkey_i_to_s_c(sk.k);

			ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
			if (ret) {
				bch2_trans_iter_exit(trans, &iter);
				continue;
			}

			data_opts = _data_opts;
			data_opts.target	= io_opts.background_target;
			data_opts.rewrite_ptrs	= 0;

			unsigned i = 0;
			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
				if (ptr->dev == bucket.inode) {
					data_opts.rewrite_ptrs |= 1U << i;
					if (ptr->cached) {
						bch2_trans_iter_exit(trans, &iter);
						goto next;
					}
				}
				i++;
			}

			ret = bch2_move_extent(ctxt, bucket_in_flight,
					       &iter, k, io_opts, data_opts);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}
			if (ret)
				goto err;

			if (ctxt->stats)
				atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
		} else {
			struct btree *b;

			b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
			ret = PTR_ERR_OR_ZERO(b);
			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!b)
				goto next;

			unsigned sectors = btree_ptr_sectors_written(&b->key);

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;

			if (ctxt->rate)
				bch2_ratelimit_increment(ctxt->rate, sectors);
			if (ctxt->stats) {
				atomic64_add(sectors, &ctxt->stats->sectors_seen);
				atomic64_add(sectors, &ctxt->stats->sectors_moved);
			}
		}
next:
		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
	bch2_dev_put(ca);
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
				struct btree *, struct bch_io_opts *,
				struct data_update_opts *);

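/*
 * Walk btree nodes in [start, end] and rewrite the nodes selected by @pred.
 */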
static int bch2_move_btree(struct bch_fs *c,
			   struct bbpos start,
			   struct bbpos end,
			   move_btree_pred pred, void *arg,
			   struct bch_move_stats *stats)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct moving_context ctxt;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct btree *b;
	enum btree_id btree;
	struct data_update_opts data_opts;
	int ret = 0;

	bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
			      writepoint_ptr(&c->btree_write_point),
			      true);
	trans = ctxt.trans;

	stats->data_type = BCH_DATA_btree;

	for (btree = start.btree;
	     btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     btree++) {
		stats->pos = BBPOS(btree, POS_MIN);

		if (!bch2_btree_id_root(c, btree)->b)
			continue;

		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
					  BTREE_ITER_prefetch);
retry:
		ret = 0;
		while (bch2_trans_begin(trans),
		       (b = bch2_btree_iter_peek_node(&iter)) &&
		       !(ret = PTR_ERR_OR_ZERO(b))) {
			if (kthread && kthread_should_stop())
				break;

			if ((cmp_int(btree, end.btree) ?:
			     bpos_cmp(b->key.k.p, end.pos)) > 0)
				break;

			stats->pos = BBPOS(iter.btree_id, iter.pos);

			if (!pred(c, arg, b, &io_opts, &data_opts))
				goto next;

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				break;
next:
			bch2_btree_iter_next_node(&iter);
		}
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;

		bch2_trans_iter_exit(trans, &iter);

		if (kthread && kthread_should_stop())
			break;
	}

	bch_err_fn(c, ret);
	bch2_moving_ctxt_exit(&ctxt);
	bch2_btree_interior_updates_flush(c);

	return ret;
}

static bool rereplicate_pred(struct bch_fs *c, void *arg,
			     struct bkey_s_c k,
			     struct bch_io_opts *io_opts,
			     struct data_update_opts *data_opts)
{
	unsigned nr_good = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;

	if (!nr_good || nr_good >= replicas)
		return false;

	data_opts->target		= 0;
	data_opts->extra_replicas	= replicas - nr_good;
	data_opts->btree_insert_flags	= 0;
	return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
			 struct bkey_s_c k,
			 struct bch_io_opts *io_opts,
			 struct data_update_opts *data_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_ioctl_data *op = arg;
	unsigned i = 0;

	data_opts->rewrite_ptrs		= 0;
	data_opts->target		= 0;
	data_opts->extra_replicas	= 0;
	data_opts->btree_insert_flags	= 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == op->migrate.dev)
			data_opts->rewrite_ptrs |= 1U << i;
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
			       struct btree *b,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
{
	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

/*
 * Ancient versions of bcachefs produced packed formats which could represent
 * keys that the in memory format cannot represent; this checks for those
 * formats so we can get rid of them.
 */
static bool bformat_needs_redo(struct bkey_format *f)
{
	for (unsigned i = 0; i < f->nr_fields; i++)
		if (bch2_bkey_format_field_overflows(f, i))
			return true;

	return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	if (b->version_ondisk != c->sb.version ||
	    btree_node_need_rewrite(b) ||
	    bformat_needs_redo(&b->format)) {
		data_opts->target		= 0;
		data_opts->extra_replicas	= 0;
		data_opts->btree_insert_flags	= 0;
		return true;
	}

	return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
	int ret;

	ret = bch2_move_btree(c,
			      BBPOS_MIN,
			      BBPOS_MAX,
			      rewrite_old_nodes_pred, c, stats);
	if (!ret) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	bch_err_fn(c, ret);
	return ret;
}

static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
				     struct bkey_s_c k,
				     struct bch_io_opts *io_opts,
				     struct data_update_opts *data_opts)
{
	unsigned durability = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned i = 0;

	rcu_read_lock();
	bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
		unsigned d = bch2_extent_ptr_durability(c, &p);

		if (d && durability - d >= replicas) {
			data_opts->kill_ptrs |= BIT(i);
			durability -= d;
		}

		i++;
	}
	rcu_read_unlock();

	return data_opts->kill_ptrs != 0;
}

static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
					   struct btree *b,
					   struct bch_io_opts *io_opts,
					   struct data_update_opts *data_opts)
{
	return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

int bch2_data_job(struct bch_fs *c,
		  struct bch_move_stats *stats,
		  struct bch_ioctl_data op)
{
	struct bbpos start	= BBPOS(op.start_btree, op.start_pos);
	struct bbpos end	= BBPOS(op.end_btree, op.end_pos);
	int ret = 0;

	if (op.op >= BCH_DATA_OP_NR)
		return -EINVAL;

	bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);

	switch (op.op) {
	case BCH_DATA_OP_rereplicate:
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, -1);
		ret = bch2_move_btree(c, start, end,
				      rereplicate_btree_pred, c, stats) ?: ret;
		ret = bch2_move_data(c, start, end,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     rereplicate_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_migrate:
		if (op.migrate.dev >= c->sb.nr_devices)
			return -EINVAL;

		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
		ret = bch2_move_btree(c, start, end,
				      migrate_btree_pred, &op, stats) ?: ret;
		ret = bch2_move_data(c, start, end,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     migrate_pred, &op) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_rewrite_old_nodes:
		ret = bch2_scan_old_btree_nodes(c, stats);
		break;
	case BCH_DATA_OP_drop_extra_replicas:
		ret = bch2_move_btree(c, start, end,
				      drop_extra_replicas_btree_pred, c, stats) ?: ret;
		ret = bch2_move_data(c, start, end, NULL, stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     drop_extra_replicas_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	default:
		ret = -EINVAL;
	}

	bch2_move_stats_exit(stats, c);
	return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
	prt_printf(out, "%s: data type==", stats->name);
	bch2_prt_data_type(out, stats->data_type);
	prt_str(out, " pos=");
	bch2_bbpos_to_text(out, stats->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
	prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
	prt_printf(out, "bytes seen: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
	prt_newline(out);

	prt_printf(out, "bytes moved: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
	prt_newline(out);

	prt_printf(out, "bytes raced: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
	struct moving_io *io;

	bch2_move_stats_to_text(out, ctxt->stats);
	printbuf_indent_add(out, 2);

	prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
		   atomic_read(&ctxt->read_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->read_sectors),
		   c->opts.move_bytes_in_flight >> 9);

	prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
		   atomic_read(&ctxt->write_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->write_sectors),
		   c->opts.move_bytes_in_flight >> 9);

	printbuf_indent_add(out, 2);

	mutex_lock(&ctxt->lock);
	list_for_each_entry(io, &ctxt->ios, io_list)
		bch2_write_op_to_text(out, &io->write.op);
	mutex_unlock(&ctxt->lock);

	printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct moving_context *ctxt;

	mutex_lock(&c->moving_context_lock);
	list_for_each_entry(ctxt, &c->moving_context_list, list)
		bch2_moving_ctxt_to_text(out, c, ctxt);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
	INIT_LIST_HEAD(&c->moving_context_list);
	mutex_init(&c->moving_context_lock);
}