// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DATA_OPS()
#undef x
	NULL
};

static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
					  struct bch_io_opts *io_opts,
					  struct data_update_opts *data_opts)
{
	printbuf_tabstop_push(out, 20);
	prt_str(out, "rewrite ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
	prt_newline(out);

	prt_str(out, "kill ptrs:\t");
	bch2_prt_u64_base2(out, data_opts->kill_ptrs);
	prt_newline(out);

	prt_str(out, "target:\t");
	bch2_target_to_text(out, c, data_opts->target);
	prt_newline(out);

	prt_str(out, "compression:\t");
	bch2_compression_opt_to_text(out, background_compression(*io_opts));
	prt_newline(out);

	prt_str(out, "extra replicas:\t");
	prt_u64(out, data_opts->extra_replicas);
}

static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
{
	if (trace_move_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
		bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
		trace_move_extent(c, buf.buf);
		printbuf_exit(&buf);
	}
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_read_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent_read(c, buf.buf);
		printbuf_exit(&buf);
	}
}

struct moving_io {
	struct list_head	read_list;
	struct list_head	io_list;
	struct move_bucket_in_flight *b;
	struct closure		cl;
	bool			read_completed;

	unsigned		read_sectors;
	unsigned		write_sectors;

	struct bch_read_bio	rbio;

	struct data_update	write;
	/* Must be last since it is variable size */
	struct bio_vec		bi_inline_vecs[];
};

static void move_free(struct moving_io *io)
{
	struct moving_context *ctxt = io->write.ctxt;

	if (io->b)
		atomic_dec(&io->b->count);

	bch2_data_update_exit(&io->write);

	mutex_lock(&ctxt->lock);
	list_del(&io->io_list);
	wake_up(&ctxt->wait);
	mutex_unlock(&ctxt->lock);

	kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
	struct moving_io *io = container_of(op, struct moving_io, write.op);
	struct moving_context *ctxt = io->write.ctxt;

	if (io->write.op.error)
		ctxt->write_error = true;

	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_dec(&io->write.ctxt->write_ios);
	move_free(io);
	closure_put(&ctxt->cl);
}

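/*
 * The write side of a data move: once the read has completed, the buffered
 * data is handed to the write path via bch2_data_update_read_done(). Reads
 * that errored or found a hole are simply dropped here.
 */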
static void move_write(struct moving_io *io)
{
	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
		move_free(io);
		return;
	}

	if (trace_move_extent_write_enabled()) {
		struct bch_fs *c = io->write.op.c;
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
		trace_move_extent_write(c, buf.buf);
		printbuf_exit(&buf);
	}

	closure_get(&io->write.ctxt->cl);
	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_inc(&io->write.ctxt->write_ios);

	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
	struct moving_io *io =
		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

	return io && io->read_completed ? io : NULL;
}

static void move_read_endio(struct bio *bio)
{
	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
	struct moving_context *ctxt = io->write.ctxt;

	atomic_sub(io->read_sectors, &ctxt->read_sectors);
	atomic_dec(&ctxt->read_ios);
	io->read_completed = true;

	wake_up(&ctxt->wait);
	closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
	struct moving_io *io;

	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
		bch2_trans_unlock_long(ctxt->trans);
		list_del(&io->read_list);
		move_write(io);
	}
}

void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

	move_ctxt_wait_event(ctxt,
		!atomic_read(&ctxt->write_sectors) ||
		atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
	move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
	bch2_trans_unlock_long(ctxt->trans);
	closure_sync(&ctxt->cl);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;

	bch2_moving_ctxt_flush_all(ctxt);

	EBUG_ON(atomic_read(&ctxt->write_sectors));
	EBUG_ON(atomic_read(&ctxt->write_ios));
	EBUG_ON(atomic_read(&ctxt->read_sectors));
	EBUG_ON(atomic_read(&ctxt->read_ios));

	mutex_lock(&c->moving_context_lock);
	list_del(&ctxt->list);
	mutex_unlock(&c->moving_context_lock);

	bch2_trans_put(ctxt->trans);
	memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
			   struct bch_fs *c,
			   struct bch_ratelimit *rate,
			   struct bch_move_stats *stats,
			   struct write_point_specifier wp,
			   bool wait_on_copygc)
{
	memset(ctxt, 0, sizeof(*ctxt));

	ctxt->trans = bch2_trans_get(c);
	ctxt->fn = (void *) _RET_IP_;
	ctxt->rate = rate;
	ctxt->stats = stats;
	ctxt->wp = wp;
	ctxt->wait_on_copygc = wait_on_copygc;

	closure_init_stack(&ctxt->cl);

	mutex_init(&ctxt->lock);
	INIT_LIST_HEAD(&ctxt->reads);
	INIT_LIST_HEAD(&ctxt->ios);
	init_waitqueue_head(&ctxt->wait);

	mutex_lock(&c->moving_context_lock);
	list_add(&ctxt->list, &c->moving_context_list);
	mutex_unlock(&c->moving_context_lock);
}

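/*
 * Typical lifecycle of a moving_context, as used by bch2_move_data() below
 * (pred/arg stand in for any caller-supplied move_pred_fn and its argument):
 *
 *	struct moving_context ctxt;
 *
 *	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
 *	ret = __bch2_move_data(&ctxt, start, end, pred, arg);
 *	bch2_moving_ctxt_exit(&ctxt);
 */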
void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
	trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
	memset(stats, 0, sizeof(*stats));
	stats->data_type = BCH_DATA_user;
	scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

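/*
 * Move a single extent: allocate a moving_io, issue a BCH_READ_NODECODE read
 * of the existing data, and account it against ctxt->read_sectors/read_ios.
 * Once move_read_endio() marks the read complete, the write side is issued
 * in submission order by bch2_moving_ctxt_do_pending_writes().
 */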
int bch2_move_extent(struct moving_context *ctxt,
		     struct move_bucket_in_flight *bucket_in_flight,
		     struct btree_iter *iter,
		     struct bkey_s_c k,
		     struct bch_io_opts io_opts,
		     struct data_update_opts data_opts)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct moving_io *io;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned sectors = k.k->size, pages;
	int ret = -ENOMEM;

	trace_move_extent2(c, k, &io_opts, &data_opts);

	if (ctxt->stats)
		ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);

	bch2_data_update_opts_normalize(k, &data_opts);

	if (!data_opts.rewrite_ptrs &&
	    !data_opts.extra_replicas) {
		if (data_opts.kill_ptrs)
			return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
		return 0;
	}

	/*
	 * Before memory allocations & taking nocow locks in
	 * bch2_data_update_init():
	 */
	bch2_trans_unlock(trans);

	/* write path might have to decompress data: */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	io = kzalloc(sizeof(struct moving_io) +
		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
	if (!io)
		goto err;

	INIT_LIST_HEAD(&io->io_list);
	io->write.ctxt = ctxt;
	io->read_sectors = k.k->size;
	io->write_sectors = k.k->size;

	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	bio_set_prio(&io->write.op.wbio.bio,
		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
				 GFP_KERNEL))
		goto err_free;

	io->rbio.c = c;
	io->rbio.opts = io_opts;
	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	io->rbio.bio.bi_vcnt = pages;
	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
	io->rbio.bio.bi_iter.bi_size = sectors << 9;

	io->rbio.bio.bi_opf = REQ_OP_READ;
	io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
	io->rbio.bio.bi_end_io = move_read_endio;

	ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
				    io_opts, data_opts, iter->btree_id, k);
	if (ret)
		goto err_free_pages;

	io->write.op.end_io = move_write_done;

	if (ctxt->rate)
		bch2_ratelimit_increment(ctxt->rate, k.k->size);

	if (ctxt->stats) {
		atomic64_inc(&ctxt->stats->keys_moved);
		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
	}

	if (bucket_in_flight) {
		io->b = bucket_in_flight;
		atomic_inc(&io->b->count);
	}

	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
	trace_move_extent_read2(c, k);

	mutex_lock(&ctxt->lock);
	atomic_add(io->read_sectors, &ctxt->read_sectors);
	atomic_inc(&ctxt->read_ios);

	list_add_tail(&io->read_list, &ctxt->reads);
	list_add_tail(&io->io_list, &ctxt->ios);
	mutex_unlock(&ctxt->lock);

	/*
	 * dropped by move_read_endio() - guards against use after free of
	 * ctxt when doing wakeup
	 */
	closure_get(&ctxt->cl);
	bch2_read_extent(trans, &io->rbio,
			 bkey_start_pos(k.k),
			 iter->btree_id, k, 0,
			 BCH_READ_NODECODE|
			 BCH_READ_LAST_FRAGMENT);
	return 0;
err_free_pages:
	bio_free_pages(&io->write.op.wbio.bio);
err_free:
	kfree(io);
err:
	if (ret == -BCH_ERR_data_update_done)
		return 0;

	if (bch2_err_matches(ret, EROFS) ||
	    bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	count_event(c, move_extent_start_fail);

	if (trace_move_extent_start_fail_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_str(&buf, ": ");
		prt_str(&buf, bch2_err_str(ret));
		trace_move_extent_start_fail(c, buf.buf);
		printbuf_exit(&buf);
	}
	return ret;
}

struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
					  struct per_snapshot_io_opts *io_opts,
					  struct bkey_s_c extent_k)
{
	struct bch_fs *c = trans->c;
	u32 restart_count = trans->restart_count;
	int ret = 0;

	if (io_opts->cur_inum != extent_k.k->p.inode) {
		io_opts->d.nr = 0;

		ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
					 BTREE_ITER_all_snapshots, k, ({
			if (k.k->p.offset != extent_k.k->p.inode)
				break;

			if (!bkey_is_inode(k.k))
				continue;

			struct bch_inode_unpacked inode;
			BUG_ON(bch2_inode_unpack(k, &inode));

			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
			bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

			darray_push(&io_opts->d, e);
		}));
		io_opts->cur_inum = extent_k.k->p.inode;
	}

	ret = ret ?: trans_was_restarted(trans, restart_count);
	if (ret)
		return ERR_PTR(ret);

	if (extent_k.k->p.snapshot)
		darray_for_each(io_opts->d, i)
			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
				return &i->io_opts;

	return &io_opts->fs_io_opts;
}

int bch2_move_get_io_opts_one(struct btree_trans *trans,
			      struct bch_io_opts *io_opts,
			      struct bkey_s_c extent_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/* reflink btree? */
	if (!extent_k.k->p.inode) {
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
		return 0;
	}

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
			       BTREE_ITER_cached);
	ret = bkey_err(k);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	if (!ret && bkey_is_inode(k.k)) {
		struct bch_inode_unpacked inode;
		bch2_inode_unpack(k, &inode);
		bch2_inode_opts_get(io_opts, trans->c, &inode);
	} else {
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
	}

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

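/*
 * Returns nonzero if the caller should stop (kthread_should_stop()); otherwise
 * sleeps until the configured rate limit allows more work and until in-flight
 * reads and writes drop below move_bytes_in_flight/move_ios_in_flight.
 */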
int bch2_move_ratelimit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;
	bool is_kthread = current->flags & PF_KTHREAD;
	u64 delay;

	if (ctxt->wait_on_copygc && c->copygc_running) {
		bch2_moving_ctxt_flush_all(ctxt);
		wait_event_killable(c->copygc_running_wq,
				    !c->copygc_running ||
				    (is_kthread && kthread_should_stop()));
	}

	do {
		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

		if (is_kthread && kthread_should_stop())
			return 1;

		if (delay)
			move_ctxt_wait_event_timeout(ctxt,
						     freezing(current) ||
						     (is_kthread && kthread_should_stop()),
						     delay);

		if (unlikely(freezing(current))) {
			bch2_moving_ctxt_flush_all(ctxt);
			try_to_freeze();
		}
	} while (delay);

	/*
	 * XXX: these limits really ought to be per device, SSDs and hard drives
	 * will want different limits
	 */
	move_ctxt_wait_event(ctxt,
		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

	return 0;
}

static int bch2_move_data_btree(struct moving_context *ctxt,
				struct bpos start,
				struct bpos end,
				move_pred_fn pred, void *arg,
				enum btree_id btree_id)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct per_snapshot_io_opts snapshot_io_opts;
	struct bch_io_opts *io_opts;
	struct bkey_buf sk;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	int ret = 0, ret2;

	per_snapshot_io_opts_init(&snapshot_io_opts, c);
	bch2_bkey_buf_init(&sk);

	if (ctxt->stats) {
		ctxt->stats->data_type = BCH_DATA_user;
		ctxt->stats->pos = BBPOS(btree_id, start);
	}

	bch2_trans_begin(trans);
	bch2_trans_iter_init(trans, &iter, btree_id, start,
			     BTREE_ITER_prefetch|
			     BTREE_ITER_all_snapshots);

	if (ctxt->rate)
		bch2_ratelimit_reset(ctxt->rate);

	while (!bch2_move_ratelimit(ctxt)) {
		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_ge(bkey_start_pos(k.k), end))
			break;

		if (ctxt->stats)
			ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

		if (!bkey_extent_is_direct_data(k.k))
			goto next_nondata;

		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
		ret = PTR_ERR_OR_ZERO(io_opts);
		if (ret)
			continue;

		memset(&data_opts, 0, sizeof(data_opts));
		if (!pred(c, arg, k, io_opts, &data_opts))
			goto next;

		/*
		 * The iterator gets unlocked by __bch2_read_extent - need to
		 * save a copy of @k elsewhere:
		 */
		bch2_bkey_buf_reassemble(&sk, c, k);
		k = bkey_i_to_s_c(sk.k);

		ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
		if (ret2) {
			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
				continue;

			if (ret2 == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}

			/* XXX signal failure */
			goto next;
		}
next:
		if (ctxt->stats)
			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
		bch2_btree_iter_advance(&iter);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&sk, c);
	per_snapshot_io_opts_exit(&snapshot_io_opts);

	return ret;
}

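/*
 * A move_pred_fn decides per key whether to move it and fills in
 * data_update_opts. A minimal sketch, modeled on migrate_pred() below
 * (rewrite_all_pred is a hypothetical example, not used in this file):
 *
 *	static bool rewrite_all_pred(struct bch_fs *c, void *arg,
 *				     struct bkey_s_c k,
 *				     struct bch_io_opts *io_opts,
 *				     struct data_update_opts *data_opts)
 *	{
 *		unsigned i = 0;
 *
 *		data_opts->rewrite_ptrs = 0;
 *		bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
 *			data_opts->rewrite_ptrs |= 1U << i++;
 *
 *		return data_opts->rewrite_ptrs != 0;
 *	}
 */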
int __bch2_move_data(struct moving_context *ctxt,
		     struct bbpos start,
		     struct bbpos end,
		     move_pred_fn pred, void *arg)
{
	struct bch_fs *c = ctxt->trans->c;
	enum btree_id id;
	int ret = 0;

	for (id = start.btree;
	     id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     id++) {
		ctxt->stats->pos = BBPOS(id, POS_MIN);

		if (!btree_type_has_ptrs(id) ||
		    !bch2_btree_id_root(c, id)->b)
			continue;

		ret = bch2_move_data_btree(ctxt,
					   id == start.btree ? start.pos : POS_MIN,
					   id == end.btree ? end.pos : POS_MAX,
					   pred, arg, id);
		if (ret)
			break;
	}

	return ret;
}

int bch2_move_data(struct bch_fs *c,
		   struct bbpos start,
		   struct bbpos end,
		   struct bch_ratelimit *rate,
		   struct bch_move_stats *stats,
		   struct write_point_specifier wp,
		   bool wait_on_copygc,
		   move_pred_fn pred, void *arg)
{
	struct moving_context ctxt;
	int ret;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
	ret = __bch2_move_data(&ctxt, start, end, pred, arg);
	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

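/*
 * Evacuate a single bucket: walk its backpointers and move every extent (and
 * rewrite every btree node) that still points into it. The btree write buffer
 * is flushed first so the backpointers we iterate over are up to date.
 */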
int bch2_evacuate_bucket(struct moving_context *ctxt,
			 struct move_bucket_in_flight *bucket_in_flight,
			 struct bpos bucket, int gen,
			 struct data_update_opts _data_opts)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	bool is_kthread = current->flags & PF_KTHREAD;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bch_backpointer bp;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	unsigned dirty_sectors, bucket_size;
	u64 fragmentation;
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
	if (!ca)
		return 0;

	trace_bucket_evacuate(c, &bucket);

	bch2_bkey_buf_init(&sk);

	/*
	 * We're not run in a context that handles transaction restarts:
	 */
	bch2_trans_begin(trans);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     bucket, BTREE_ITER_cached);
	ret = lockrestart_do(trans,
			bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	bch2_trans_iter_exit(trans, &iter);

	bch_err_msg(c, ret, "looking up alloc key");
	if (ret)
		goto err;

	a = bch2_alloc_to_v4(k, &a_convert);
	dirty_sectors = bch2_bucket_sectors_dirty(*a);
	bucket_size = ca->mi.bucket_size;
	fragmentation = a->fragmentation_lru;

	ret = bch2_btree_write_buffer_tryflush(trans);
	bch_err_msg(c, ret, "flushing btree write buffer");
	if (ret)
		goto err;

	while (!(ret = bch2_move_ratelimit(ctxt))) {
		if (is_kthread && kthread_should_stop())
			break;

		bch2_trans_begin(trans);

		ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
						&bp_pos, &bp,
						BTREE_ITER_cached);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			goto err;
		if (bkey_eq(bp_pos, POS_MAX))
			break;

		if (!bp.level) {
			k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
			ret = bkey_err(k);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!k.k)
				goto next;

			bch2_bkey_buf_reassemble(&sk, c, k);
			k = bkey_i_to_s_c(sk.k);

			ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
			if (ret) {
				bch2_trans_iter_exit(trans, &iter);
				continue;
			}

			data_opts = _data_opts;
			data_opts.target = io_opts.background_target;
			data_opts.rewrite_ptrs = 0;

			unsigned i = 0;
			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
				if (ptr->dev == bucket.inode) {
					data_opts.rewrite_ptrs |= 1U << i;
					if (ptr->cached) {
						bch2_trans_iter_exit(trans, &iter);
						goto next;
					}
				}
				i++;
			}

			ret = bch2_move_extent(ctxt, bucket_in_flight,
					       &iter, k, io_opts, data_opts);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}
			if (ret)
				goto err;

			if (ctxt->stats)
				atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
		} else {
			struct btree *b;

			b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
			ret = PTR_ERR_OR_ZERO(b);
			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!b)
				goto next;

			unsigned sectors = btree_ptr_sectors_written(&b->key);

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;

			if (ctxt->rate)
				bch2_ratelimit_increment(ctxt->rate, sectors);
			if (ctxt->stats) {
				atomic64_add(sectors, &ctxt->stats->sectors_seen);
				atomic64_add(sectors, &ctxt->stats->sectors_moved);
			}
		}
next:
		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
	bch2_dev_put(ca);
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

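/*
 * Btree node counterpart to bch2_move_data(): a move_btree_pred selects whole
 * btree nodes, which bch2_move_btree() then rewrites. Used by the rereplicate,
 * migrate and rewrite_old_nodes data jobs below.
 */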
typedef bool (*move_btree_pred)(struct bch_fs *, void *,
				struct btree *, struct bch_io_opts *,
				struct data_update_opts *);

static int bch2_move_btree(struct bch_fs *c,
			   struct bbpos start,
			   struct bbpos end,
			   move_btree_pred pred, void *arg,
			   struct bch_move_stats *stats)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct moving_context ctxt;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct btree *b;
	enum btree_id btree;
	struct data_update_opts data_opts;
	int ret = 0;

	bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
			      writepoint_ptr(&c->btree_write_point),
			      true);
	trans = ctxt.trans;

	stats->data_type = BCH_DATA_btree;

	for (btree = start.btree;
	     btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     btree++) {
		stats->pos = BBPOS(btree, POS_MIN);

		if (!bch2_btree_id_root(c, btree)->b)
			continue;

		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
					  BTREE_ITER_prefetch);
retry:
		ret = 0;
		while (bch2_trans_begin(trans),
		       (b = bch2_btree_iter_peek_node(&iter)) &&
		       !(ret = PTR_ERR_OR_ZERO(b))) {
			if (kthread && kthread_should_stop())
				break;

			if ((cmp_int(btree, end.btree) ?:
			     bpos_cmp(b->key.k.p, end.pos)) > 0)
				break;

			stats->pos = BBPOS(iter.btree_id, iter.pos);

			if (!pred(c, arg, b, &io_opts, &data_opts))
				goto next;

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				break;
next:
			bch2_btree_iter_next_node(&iter);
		}
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;

		bch2_trans_iter_exit(trans, &iter);

		if (kthread && kthread_should_stop())
			break;
	}

	bch_err_fn(c, ret);
	bch2_moving_ctxt_exit(&ctxt);
	bch2_btree_interior_updates_flush(c);

	return ret;
}

static bool rereplicate_pred(struct bch_fs *c, void *arg,
			     struct bkey_s_c k,
			     struct bch_io_opts *io_opts,
			     struct data_update_opts *data_opts)
{
	unsigned nr_good = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;

	rcu_read_lock();
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned i = 0;
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ptr->cached &&
		    (!ca || !ca->mi.durability))
			data_opts->kill_ptrs |= BIT(i);
		i++;
	}
	rcu_read_unlock();

	if (!data_opts->kill_ptrs &&
	    (!nr_good || nr_good >= replicas))
		return false;

	data_opts->target = 0;
	data_opts->extra_replicas = replicas - nr_good;
	data_opts->btree_insert_flags = 0;
	return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
			 struct bkey_s_c k,
			 struct bch_io_opts *io_opts,
			 struct data_update_opts *data_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_ioctl_data *op = arg;
	unsigned i = 0;

	data_opts->rewrite_ptrs = 0;
	data_opts->target = 0;
	data_opts->extra_replicas = 0;
	data_opts->btree_insert_flags = 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == op->migrate.dev)
			data_opts->rewrite_ptrs |= 1U << i;
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
			       struct btree *b,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
{
	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

/*
 * Ancient versions of bcachefs produced packed formats which could represent
 * keys that the in memory format cannot represent; this checks for those
 * formats so we can get rid of them.
 */
static bool bformat_needs_redo(struct bkey_format *f)
{
	for (unsigned i = 0; i < f->nr_fields; i++)
		if (bch2_bkey_format_field_overflows(f, i))
			return true;

	return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	if (b->version_ondisk != c->sb.version ||
	    btree_node_need_rewrite(b) ||
	    bformat_needs_redo(&b->format)) {
		data_opts->target = 0;
		data_opts->extra_replicas = 0;
		data_opts->btree_insert_flags = 0;
		return true;
	}

	return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
	int ret;

	ret = bch2_move_btree(c,
			      BBPOS_MIN,
			      BBPOS_MAX,
			      rewrite_old_nodes_pred, c, stats);
	if (!ret) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	bch_err_fn(c, ret);
	return ret;
}

static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
				     struct bkey_s_c k,
				     struct bch_io_opts *io_opts,
				     struct data_update_opts *data_opts)
{
	unsigned durability = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned i = 0;

	rcu_read_lock();
	bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
		unsigned d = bch2_extent_ptr_durability(c, &p);

		if (d && durability - d >= replicas) {
			data_opts->kill_ptrs |= BIT(i);
			durability -= d;
		}

		i++;
	}
	rcu_read_unlock();

	return data_opts->kill_ptrs != 0;
}

static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
					   struct btree *b,
					   struct bch_io_opts *io_opts,
					   struct data_update_opts *data_opts)
{
	return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

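/*
 * Entry point for the data job ioctl (struct bch_ioctl_data): dispatches to
 * the predicates above via bch2_move_btree()/bch2_move_data(). Per-step
 * errors are chained with ?: so an earlier failure is still reported even
 * when later steps succeed.
 */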
int bch2_data_job(struct bch_fs *c,
		  struct bch_move_stats *stats,
		  struct bch_ioctl_data op)
{
	struct bbpos start = BBPOS(op.start_btree, op.start_pos);
	struct bbpos end = BBPOS(op.end_btree, op.end_pos);
	int ret = 0;

	if (op.op >= BCH_DATA_OP_NR)
		return -EINVAL;

	bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);

	switch (op.op) {
	case BCH_DATA_OP_rereplicate:
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, -1);
		ret = bch2_move_btree(c, start, end,
				      rereplicate_btree_pred, c, stats) ?: ret;
		ret = bch2_move_data(c, start, end,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     rereplicate_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_migrate:
		if (op.migrate.dev >= c->sb.nr_devices)
			return -EINVAL;

		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
		ret = bch2_move_btree(c, start, end,
				      migrate_btree_pred, &op, stats) ?: ret;
		ret = bch2_move_data(c, start, end,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     migrate_pred, &op) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_rewrite_old_nodes:
		ret = bch2_scan_old_btree_nodes(c, stats);
		break;
	case BCH_DATA_OP_drop_extra_replicas:
		ret = bch2_move_btree(c, start, end,
				      drop_extra_replicas_btree_pred, c, stats) ?: ret;
		ret = bch2_move_data(c, start, end, NULL, stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     drop_extra_replicas_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	default:
		ret = -EINVAL;
	}

	bch2_move_stats_exit(stats, c);
	return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
	prt_printf(out, "%s: data type==", stats->name);
	bch2_prt_data_type(out, stats->data_type);
	prt_str(out, " pos=");
	bch2_bbpos_to_text(out, stats->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
	prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
	prt_printf(out, "bytes seen: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
	prt_newline(out);

	prt_printf(out, "bytes moved: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
	prt_newline(out);

	prt_printf(out, "bytes raced: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
	struct moving_io *io;

	bch2_move_stats_to_text(out, ctxt->stats);
	printbuf_indent_add(out, 2);

	prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
		   atomic_read(&ctxt->read_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->read_sectors),
		   c->opts.move_bytes_in_flight >> 9);

	prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
		   atomic_read(&ctxt->write_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->write_sectors),
		   c->opts.move_bytes_in_flight >> 9);

	printbuf_indent_add(out, 2);

	mutex_lock(&ctxt->lock);
	list_for_each_entry(io, &ctxt->ios, io_list)
		bch2_write_op_to_text(out, &io->write.op);
	mutex_unlock(&ctxt->lock);

	printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct moving_context *ctxt;

	mutex_lock(&c->moving_context_lock);
	list_for_each_entry(ctxt, &c->moving_context_list, list)
		bch2_moving_ctxt_to_text(out, c, ctxt);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
	INIT_LIST_HEAD(&c->moving_context_list);
	mutex_init(&c->moving_context_lock);
}