1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "alloc_background.h" 5 #include "alloc_foreground.h" 6 #include "backpointers.h" 7 #include "bkey_buf.h" 8 #include "btree_gc.h" 9 #include "btree_io.h" 10 #include "btree_update.h" 11 #include "btree_update_interior.h" 12 #include "btree_write_buffer.h" 13 #include "compress.h" 14 #include "disk_groups.h" 15 #include "ec.h" 16 #include "errcode.h" 17 #include "error.h" 18 #include "inode.h" 19 #include "io_read.h" 20 #include "io_write.h" 21 #include "journal_reclaim.h" 22 #include "keylist.h" 23 #include "move.h" 24 #include "replicas.h" 25 #include "snapshot.h" 26 #include "super-io.h" 27 #include "trace.h" 28 29 #include <linux/ioprio.h> 30 #include <linux/kthread.h> 31 32 const char * const bch2_data_ops_strs[] = { 33 #define x(t, n, ...) [n] = #t, 34 BCH_DATA_OPS() 35 #undef x 36 NULL 37 }; 38 39 static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, 40 struct bch_io_opts *io_opts, 41 struct data_update_opts *data_opts) 42 { 43 printbuf_tabstop_push(out, 20); 44 prt_str(out, "rewrite ptrs:"); 45 prt_tab(out); 46 bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); 47 prt_newline(out); 48 49 prt_str(out, "kill ptrs: "); 50 prt_tab(out); 51 bch2_prt_u64_base2(out, data_opts->kill_ptrs); 52 prt_newline(out); 53 54 prt_str(out, "target: "); 55 prt_tab(out); 56 bch2_target_to_text(out, c, data_opts->target); 57 prt_newline(out); 58 59 prt_str(out, "compression: "); 60 prt_tab(out); 61 bch2_compression_opt_to_text(out, background_compression(*io_opts)); 62 prt_newline(out); 63 64 prt_str(out, "extra replicas: "); 65 prt_tab(out); 66 prt_u64(out, data_opts->extra_replicas); 67 } 68 69 static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, 70 struct bch_io_opts *io_opts, 71 struct data_update_opts *data_opts) 72 { 73 if (trace_move_extent_enabled()) { 74 struct printbuf buf = PRINTBUF; 75 76 bch2_bkey_val_to_text(&buf, c, k); 77 prt_newline(&buf); 78 bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); 79 trace_move_extent(c, buf.buf); 80 printbuf_exit(&buf); 81 } 82 } 83 84 static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) 85 { 86 if (trace_move_extent_read_enabled()) { 87 struct printbuf buf = PRINTBUF; 88 89 bch2_bkey_val_to_text(&buf, c, k); 90 trace_move_extent_read(c, buf.buf); 91 printbuf_exit(&buf); 92 } 93 } 94 95 struct moving_io { 96 struct list_head read_list; 97 struct list_head io_list; 98 struct move_bucket_in_flight *b; 99 struct closure cl; 100 bool read_completed; 101 102 unsigned read_sectors; 103 unsigned write_sectors; 104 105 struct bch_read_bio rbio; 106 107 struct data_update write; 108 /* Must be last since it is variable size */ 109 struct bio_vec bi_inline_vecs[]; 110 }; 111 112 static void move_free(struct moving_io *io) 113 { 114 struct moving_context *ctxt = io->write.ctxt; 115 116 if (io->b) 117 atomic_dec(&io->b->count); 118 119 bch2_data_update_exit(&io->write); 120 121 mutex_lock(&ctxt->lock); 122 list_del(&io->io_list); 123 wake_up(&ctxt->wait); 124 mutex_unlock(&ctxt->lock); 125 126 kfree(io); 127 } 128 129 static void move_write_done(struct bch_write_op *op) 130 { 131 struct moving_io *io = container_of(op, struct moving_io, write.op); 132 struct moving_context *ctxt = io->write.ctxt; 133 134 if (io->write.op.error) 135 ctxt->write_error = true; 136 137 atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); 138 atomic_dec(&io->write.ctxt->write_ios); 139 move_free(io); 140 closure_put(&ctxt->cl); 141 } 142 143 static void move_write(struct moving_io *io) 144 { 145 if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { 146 move_free(io); 147 return; 148 } 149 150 if (trace_move_extent_write_enabled()) { 151 struct bch_fs *c = io->write.op.c; 152 struct printbuf buf = PRINTBUF; 153 154 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); 155 trace_move_extent_write(c, buf.buf); 156 printbuf_exit(&buf); 157 } 158 159 closure_get(&io->write.ctxt->cl); 160 atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); 161 atomic_inc(&io->write.ctxt->write_ios); 162 163 bch2_data_update_read_done(&io->write, io->rbio.pick.crc); 164 } 165 166 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) 167 { 168 struct moving_io *io = 169 list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); 170 171 return io && io->read_completed ? io : NULL; 172 } 173 174 static void move_read_endio(struct bio *bio) 175 { 176 struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); 177 struct moving_context *ctxt = io->write.ctxt; 178 179 atomic_sub(io->read_sectors, &ctxt->read_sectors); 180 atomic_dec(&ctxt->read_ios); 181 io->read_completed = true; 182 183 wake_up(&ctxt->wait); 184 closure_put(&ctxt->cl); 185 } 186 187 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) 188 { 189 struct moving_io *io; 190 191 while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { 192 bch2_trans_unlock_long(ctxt->trans); 193 list_del(&io->read_list); 194 move_write(io); 195 } 196 } 197 198 void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) 199 { 200 unsigned sectors_pending = atomic_read(&ctxt->write_sectors); 201 202 move_ctxt_wait_event(ctxt, 203 !atomic_read(&ctxt->write_sectors) || 204 atomic_read(&ctxt->write_sectors) != sectors_pending); 205 } 206 207 void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) 208 { 209 move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); 210 bch2_trans_unlock_long(ctxt->trans); 211 closure_sync(&ctxt->cl); 212 } 213 214 void bch2_moving_ctxt_exit(struct moving_context *ctxt) 215 { 216 struct bch_fs *c = ctxt->trans->c; 217 218 bch2_moving_ctxt_flush_all(ctxt); 219 220 EBUG_ON(atomic_read(&ctxt->write_sectors)); 221 EBUG_ON(atomic_read(&ctxt->write_ios)); 222 EBUG_ON(atomic_read(&ctxt->read_sectors)); 223 EBUG_ON(atomic_read(&ctxt->read_ios)); 224 225 mutex_lock(&c->moving_context_lock); 226 list_del(&ctxt->list); 227 mutex_unlock(&c->moving_context_lock); 228 229 bch2_trans_put(ctxt->trans); 230 memset(ctxt, 0, sizeof(*ctxt)); 231 } 232 233 void bch2_moving_ctxt_init(struct moving_context *ctxt, 234 struct bch_fs *c, 235 struct bch_ratelimit *rate, 236 struct bch_move_stats *stats, 237 struct write_point_specifier wp, 238 bool wait_on_copygc) 239 { 240 memset(ctxt, 0, sizeof(*ctxt)); 241 242 ctxt->trans = bch2_trans_get(c); 243 ctxt->fn = (void *) _RET_IP_; 244 ctxt->rate = rate; 245 ctxt->stats = stats; 246 ctxt->wp = wp; 247 ctxt->wait_on_copygc = wait_on_copygc; 248 249 closure_init_stack(&ctxt->cl); 250 251 mutex_init(&ctxt->lock); 252 INIT_LIST_HEAD(&ctxt->reads); 253 INIT_LIST_HEAD(&ctxt->ios); 254 init_waitqueue_head(&ctxt->wait); 255 256 mutex_lock(&c->moving_context_lock); 257 list_add(&ctxt->list, &c->moving_context_list); 258 mutex_unlock(&c->moving_context_lock); 259 } 260 261 void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) 262 { 263 trace_move_data(c, stats); 264 } 265 266 void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) 267 { 268 memset(stats, 0, sizeof(*stats)); 269 stats->data_type = BCH_DATA_user; 270 scnprintf(stats->name, sizeof(stats->name), "%s", name); 271 } 272 273 int bch2_move_extent(struct moving_context *ctxt, 274 struct move_bucket_in_flight *bucket_in_flight, 275 struct btree_iter *iter, 276 struct bkey_s_c k, 277 struct bch_io_opts io_opts, 278 struct data_update_opts data_opts) 279 { 280 struct btree_trans *trans = ctxt->trans; 281 struct bch_fs *c = trans->c; 282 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 283 struct moving_io *io; 284 const union bch_extent_entry *entry; 285 struct extent_ptr_decoded p; 286 unsigned sectors = k.k->size, pages; 287 int ret = -ENOMEM; 288 289 trace_move_extent2(c, k, &io_opts, &data_opts); 290 291 if (ctxt->stats) 292 ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); 293 294 bch2_data_update_opts_normalize(k, &data_opts); 295 296 if (!data_opts.rewrite_ptrs && 297 !data_opts.extra_replicas) { 298 if (data_opts.kill_ptrs) 299 return bch2_extent_drop_ptrs(trans, iter, k, data_opts); 300 return 0; 301 } 302 303 /* 304 * Before memory allocations & taking nocow locks in 305 * bch2_data_update_init(): 306 */ 307 bch2_trans_unlock(trans); 308 309 /* write path might have to decompress data: */ 310 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) 311 sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); 312 313 pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); 314 io = kzalloc(sizeof(struct moving_io) + 315 sizeof(struct bio_vec) * pages, GFP_KERNEL); 316 if (!io) 317 goto err; 318 319 INIT_LIST_HEAD(&io->io_list); 320 io->write.ctxt = ctxt; 321 io->read_sectors = k.k->size; 322 io->write_sectors = k.k->size; 323 324 bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); 325 bio_set_prio(&io->write.op.wbio.bio, 326 IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 327 328 if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, 329 GFP_KERNEL)) 330 goto err_free; 331 332 io->rbio.c = c; 333 io->rbio.opts = io_opts; 334 bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); 335 io->rbio.bio.bi_vcnt = pages; 336 bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); 337 io->rbio.bio.bi_iter.bi_size = sectors << 9; 338 339 io->rbio.bio.bi_opf = REQ_OP_READ; 340 io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); 341 io->rbio.bio.bi_end_io = move_read_endio; 342 343 ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, 344 io_opts, data_opts, iter->btree_id, k); 345 if (ret) 346 goto err_free_pages; 347 348 io->write.op.end_io = move_write_done; 349 350 if (ctxt->rate) 351 bch2_ratelimit_increment(ctxt->rate, k.k->size); 352 353 if (ctxt->stats) { 354 atomic64_inc(&ctxt->stats->keys_moved); 355 atomic64_add(k.k->size, &ctxt->stats->sectors_moved); 356 } 357 358 if (bucket_in_flight) { 359 io->b = bucket_in_flight; 360 atomic_inc(&io->b->count); 361 } 362 363 this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); 364 this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); 365 trace_move_extent_read2(c, k); 366 367 mutex_lock(&ctxt->lock); 368 atomic_add(io->read_sectors, &ctxt->read_sectors); 369 atomic_inc(&ctxt->read_ios); 370 371 list_add_tail(&io->read_list, &ctxt->reads); 372 list_add_tail(&io->io_list, &ctxt->ios); 373 mutex_unlock(&ctxt->lock); 374 375 /* 376 * dropped by move_read_endio() - guards against use after free of 377 * ctxt when doing wakeup 378 */ 379 closure_get(&ctxt->cl); 380 bch2_read_extent(trans, &io->rbio, 381 bkey_start_pos(k.k), 382 iter->btree_id, k, 0, 383 BCH_READ_NODECODE| 384 BCH_READ_LAST_FRAGMENT); 385 return 0; 386 err_free_pages: 387 bio_free_pages(&io->write.op.wbio.bio); 388 err_free: 389 kfree(io); 390 err: 391 if (ret == -BCH_ERR_data_update_done) 392 return 0; 393 394 if (bch2_err_matches(ret, EROFS) || 395 bch2_err_matches(ret, BCH_ERR_transaction_restart)) 396 return ret; 397 398 count_event(c, move_extent_start_fail); 399 400 if (trace_move_extent_start_fail_enabled()) { 401 struct printbuf buf = PRINTBUF; 402 403 bch2_bkey_val_to_text(&buf, c, k); 404 prt_str(&buf, ": "); 405 prt_str(&buf, bch2_err_str(ret)); 406 trace_move_extent_start_fail(c, buf.buf); 407 printbuf_exit(&buf); 408 } 409 return ret; 410 } 411 412 struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, 413 struct per_snapshot_io_opts *io_opts, 414 struct bkey_s_c extent_k) 415 { 416 struct bch_fs *c = trans->c; 417 u32 restart_count = trans->restart_count; 418 int ret = 0; 419 420 if (io_opts->cur_inum != extent_k.k->p.inode) { 421 io_opts->d.nr = 0; 422 423 ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), 424 BTREE_ITER_ALL_SNAPSHOTS, k, ({ 425 if (k.k->p.offset != extent_k.k->p.inode) 426 break; 427 428 if (!bkey_is_inode(k.k)) 429 continue; 430 431 struct bch_inode_unpacked inode; 432 BUG_ON(bch2_inode_unpack(k, &inode)); 433 434 struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; 435 bch2_inode_opts_get(&e.io_opts, trans->c, &inode); 436 437 darray_push(&io_opts->d, e); 438 })); 439 io_opts->cur_inum = extent_k.k->p.inode; 440 } 441 442 ret = ret ?: trans_was_restarted(trans, restart_count); 443 if (ret) 444 return ERR_PTR(ret); 445 446 if (extent_k.k->p.snapshot) 447 darray_for_each(io_opts->d, i) 448 if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) 449 return &i->io_opts; 450 451 return &io_opts->fs_io_opts; 452 } 453 454 int bch2_move_get_io_opts_one(struct btree_trans *trans, 455 struct bch_io_opts *io_opts, 456 struct bkey_s_c extent_k) 457 { 458 struct btree_iter iter; 459 struct bkey_s_c k; 460 int ret; 461 462 /* reflink btree? */ 463 if (!extent_k.k->p.inode) { 464 *io_opts = bch2_opts_to_inode_opts(trans->c->opts); 465 return 0; 466 } 467 468 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, 469 SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), 470 BTREE_ITER_CACHED); 471 ret = bkey_err(k); 472 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 473 return ret; 474 475 if (!ret && bkey_is_inode(k.k)) { 476 struct bch_inode_unpacked inode; 477 bch2_inode_unpack(k, &inode); 478 bch2_inode_opts_get(io_opts, trans->c, &inode); 479 } else { 480 *io_opts = bch2_opts_to_inode_opts(trans->c->opts); 481 } 482 483 bch2_trans_iter_exit(trans, &iter); 484 return 0; 485 } 486 487 int bch2_move_ratelimit(struct moving_context *ctxt) 488 { 489 struct bch_fs *c = ctxt->trans->c; 490 bool is_kthread = current->flags & PF_KTHREAD; 491 u64 delay; 492 493 if (ctxt->wait_on_copygc && c->copygc_running) { 494 bch2_moving_ctxt_flush_all(ctxt); 495 wait_event_killable(c->copygc_running_wq, 496 !c->copygc_running || 497 (is_kthread && kthread_should_stop())); 498 } 499 500 do { 501 delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; 502 503 if (is_kthread && kthread_should_stop()) 504 return 1; 505 506 if (delay) 507 move_ctxt_wait_event_timeout(ctxt, 508 freezing(current) || 509 (is_kthread && kthread_should_stop()), 510 delay); 511 512 if (unlikely(freezing(current))) { 513 bch2_moving_ctxt_flush_all(ctxt); 514 try_to_freeze(); 515 } 516 } while (delay); 517 518 /* 519 * XXX: these limits really ought to be per device, SSDs and hard drives 520 * will want different limits 521 */ 522 move_ctxt_wait_event(ctxt, 523 atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && 524 atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && 525 atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && 526 atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); 527 528 return 0; 529 } 530 531 static int bch2_move_data_btree(struct moving_context *ctxt, 532 struct bpos start, 533 struct bpos end, 534 move_pred_fn pred, void *arg, 535 enum btree_id btree_id) 536 { 537 struct btree_trans *trans = ctxt->trans; 538 struct bch_fs *c = trans->c; 539 struct per_snapshot_io_opts snapshot_io_opts; 540 struct bch_io_opts *io_opts; 541 struct bkey_buf sk; 542 struct btree_iter iter; 543 struct bkey_s_c k; 544 struct data_update_opts data_opts; 545 int ret = 0, ret2; 546 547 per_snapshot_io_opts_init(&snapshot_io_opts, c); 548 bch2_bkey_buf_init(&sk); 549 550 if (ctxt->stats) { 551 ctxt->stats->data_type = BCH_DATA_user; 552 ctxt->stats->pos = BBPOS(btree_id, start); 553 } 554 555 bch2_trans_iter_init(trans, &iter, btree_id, start, 556 BTREE_ITER_PREFETCH| 557 BTREE_ITER_ALL_SNAPSHOTS); 558 559 if (ctxt->rate) 560 bch2_ratelimit_reset(ctxt->rate); 561 562 while (!bch2_move_ratelimit(ctxt)) { 563 bch2_trans_begin(trans); 564 565 k = bch2_btree_iter_peek(&iter); 566 if (!k.k) 567 break; 568 569 ret = bkey_err(k); 570 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 571 continue; 572 if (ret) 573 break; 574 575 if (bkey_ge(bkey_start_pos(k.k), end)) 576 break; 577 578 if (ctxt->stats) 579 ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); 580 581 if (!bkey_extent_is_direct_data(k.k)) 582 goto next_nondata; 583 584 io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); 585 ret = PTR_ERR_OR_ZERO(io_opts); 586 if (ret) 587 continue; 588 589 memset(&data_opts, 0, sizeof(data_opts)); 590 if (!pred(c, arg, k, io_opts, &data_opts)) 591 goto next; 592 593 /* 594 * The iterator gets unlocked by __bch2_read_extent - need to 595 * save a copy of @k elsewhere: 596 */ 597 bch2_bkey_buf_reassemble(&sk, c, k); 598 k = bkey_i_to_s_c(sk.k); 599 600 ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); 601 if (ret2) { 602 if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) 603 continue; 604 605 if (ret2 == -ENOMEM) { 606 /* memory allocation failure, wait for some IO to finish */ 607 bch2_move_ctxt_wait_for_io(ctxt); 608 continue; 609 } 610 611 /* XXX signal failure */ 612 goto next; 613 } 614 next: 615 if (ctxt->stats) 616 atomic64_add(k.k->size, &ctxt->stats->sectors_seen); 617 next_nondata: 618 bch2_btree_iter_advance(&iter); 619 } 620 621 bch2_trans_iter_exit(trans, &iter); 622 bch2_bkey_buf_exit(&sk, c); 623 per_snapshot_io_opts_exit(&snapshot_io_opts); 624 625 return ret; 626 } 627 628 int __bch2_move_data(struct moving_context *ctxt, 629 struct bbpos start, 630 struct bbpos end, 631 move_pred_fn pred, void *arg) 632 { 633 struct bch_fs *c = ctxt->trans->c; 634 enum btree_id id; 635 int ret = 0; 636 637 for (id = start.btree; 638 id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); 639 id++) { 640 ctxt->stats->pos = BBPOS(id, POS_MIN); 641 642 if (!btree_type_has_ptrs(id) || 643 !bch2_btree_id_root(c, id)->b) 644 continue; 645 646 ret = bch2_move_data_btree(ctxt, 647 id == start.btree ? start.pos : POS_MIN, 648 id == end.btree ? end.pos : POS_MAX, 649 pred, arg, id); 650 if (ret) 651 break; 652 } 653 654 return ret; 655 } 656 657 int bch2_move_data(struct bch_fs *c, 658 struct bbpos start, 659 struct bbpos end, 660 struct bch_ratelimit *rate, 661 struct bch_move_stats *stats, 662 struct write_point_specifier wp, 663 bool wait_on_copygc, 664 move_pred_fn pred, void *arg) 665 { 666 667 struct moving_context ctxt; 668 int ret; 669 670 bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); 671 ret = __bch2_move_data(&ctxt, start, end, pred, arg); 672 bch2_moving_ctxt_exit(&ctxt); 673 674 return ret; 675 } 676 677 int bch2_evacuate_bucket(struct moving_context *ctxt, 678 struct move_bucket_in_flight *bucket_in_flight, 679 struct bpos bucket, int gen, 680 struct data_update_opts _data_opts) 681 { 682 struct btree_trans *trans = ctxt->trans; 683 struct bch_fs *c = trans->c; 684 bool is_kthread = current->flags & PF_KTHREAD; 685 struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); 686 struct btree_iter iter; 687 struct bkey_buf sk; 688 struct bch_backpointer bp; 689 struct bch_alloc_v4 a_convert; 690 const struct bch_alloc_v4 *a; 691 struct bkey_s_c k; 692 struct data_update_opts data_opts; 693 unsigned dirty_sectors, bucket_size; 694 u64 fragmentation; 695 struct bpos bp_pos = POS_MIN; 696 int ret = 0; 697 698 trace_bucket_evacuate(c, &bucket); 699 700 bch2_bkey_buf_init(&sk); 701 702 /* 703 * We're not run in a context that handles transaction restarts: 704 */ 705 bch2_trans_begin(trans); 706 707 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, 708 bucket, BTREE_ITER_CACHED); 709 ret = lockrestart_do(trans, 710 bkey_err(k = bch2_btree_iter_peek_slot(&iter))); 711 bch2_trans_iter_exit(trans, &iter); 712 713 bch_err_msg(c, ret, "looking up alloc key"); 714 if (ret) 715 goto err; 716 717 a = bch2_alloc_to_v4(k, &a_convert); 718 dirty_sectors = bch2_bucket_sectors_dirty(*a); 719 bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; 720 fragmentation = a->fragmentation_lru; 721 722 ret = bch2_btree_write_buffer_tryflush(trans); 723 bch_err_msg(c, ret, "flushing btree write buffer"); 724 if (ret) 725 goto err; 726 727 while (!(ret = bch2_move_ratelimit(ctxt))) { 728 if (is_kthread && kthread_should_stop()) 729 break; 730 731 bch2_trans_begin(trans); 732 733 ret = bch2_get_next_backpointer(trans, bucket, gen, 734 &bp_pos, &bp, 735 BTREE_ITER_CACHED); 736 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 737 continue; 738 if (ret) 739 goto err; 740 if (bkey_eq(bp_pos, POS_MAX)) 741 break; 742 743 if (!bp.level) { 744 k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); 745 ret = bkey_err(k); 746 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 747 continue; 748 if (ret) 749 goto err; 750 if (!k.k) 751 goto next; 752 753 bch2_bkey_buf_reassemble(&sk, c, k); 754 k = bkey_i_to_s_c(sk.k); 755 756 ret = bch2_move_get_io_opts_one(trans, &io_opts, k); 757 if (ret) { 758 bch2_trans_iter_exit(trans, &iter); 759 continue; 760 } 761 762 data_opts = _data_opts; 763 data_opts.target = io_opts.background_target; 764 data_opts.rewrite_ptrs = 0; 765 766 unsigned i = 0; 767 bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { 768 if (ptr->dev == bucket.inode) { 769 data_opts.rewrite_ptrs |= 1U << i; 770 if (ptr->cached) { 771 bch2_trans_iter_exit(trans, &iter); 772 goto next; 773 } 774 } 775 i++; 776 } 777 778 ret = bch2_move_extent(ctxt, bucket_in_flight, 779 &iter, k, io_opts, data_opts); 780 bch2_trans_iter_exit(trans, &iter); 781 782 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 783 continue; 784 if (ret == -ENOMEM) { 785 /* memory allocation failure, wait for some IO to finish */ 786 bch2_move_ctxt_wait_for_io(ctxt); 787 continue; 788 } 789 if (ret) 790 goto err; 791 792 if (ctxt->stats) 793 atomic64_add(k.k->size, &ctxt->stats->sectors_seen); 794 } else { 795 struct btree *b; 796 797 b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); 798 ret = PTR_ERR_OR_ZERO(b); 799 if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) 800 continue; 801 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 802 continue; 803 if (ret) 804 goto err; 805 if (!b) 806 goto next; 807 808 unsigned sectors = btree_ptr_sectors_written(&b->key); 809 810 ret = bch2_btree_node_rewrite(trans, &iter, b, 0); 811 bch2_trans_iter_exit(trans, &iter); 812 813 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 814 continue; 815 if (ret) 816 goto err; 817 818 if (ctxt->rate) 819 bch2_ratelimit_increment(ctxt->rate, sectors); 820 if (ctxt->stats) { 821 atomic64_add(sectors, &ctxt->stats->sectors_seen); 822 atomic64_add(sectors, &ctxt->stats->sectors_moved); 823 } 824 } 825 next: 826 bp_pos = bpos_nosnap_successor(bp_pos); 827 } 828 829 trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); 830 err: 831 bch2_bkey_buf_exit(&sk, c); 832 return ret; 833 } 834 835 typedef bool (*move_btree_pred)(struct bch_fs *, void *, 836 struct btree *, struct bch_io_opts *, 837 struct data_update_opts *); 838 839 static int bch2_move_btree(struct bch_fs *c, 840 struct bbpos start, 841 struct bbpos end, 842 move_btree_pred pred, void *arg, 843 struct bch_move_stats *stats) 844 { 845 bool kthread = (current->flags & PF_KTHREAD) != 0; 846 struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); 847 struct moving_context ctxt; 848 struct btree_trans *trans; 849 struct btree_iter iter; 850 struct btree *b; 851 enum btree_id btree; 852 struct data_update_opts data_opts; 853 int ret = 0; 854 855 bch2_moving_ctxt_init(&ctxt, c, NULL, stats, 856 writepoint_ptr(&c->btree_write_point), 857 true); 858 trans = ctxt.trans; 859 860 stats->data_type = BCH_DATA_btree; 861 862 for (btree = start.btree; 863 btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); 864 btree ++) { 865 stats->pos = BBPOS(btree, POS_MIN); 866 867 if (!bch2_btree_id_root(c, btree)->b) 868 continue; 869 870 bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0, 871 BTREE_ITER_PREFETCH); 872 retry: 873 ret = 0; 874 while (bch2_trans_begin(trans), 875 (b = bch2_btree_iter_peek_node(&iter)) && 876 !(ret = PTR_ERR_OR_ZERO(b))) { 877 if (kthread && kthread_should_stop()) 878 break; 879 880 if ((cmp_int(btree, end.btree) ?: 881 bpos_cmp(b->key.k.p, end.pos)) > 0) 882 break; 883 884 stats->pos = BBPOS(iter.btree_id, iter.pos); 885 886 if (!pred(c, arg, b, &io_opts, &data_opts)) 887 goto next; 888 889 ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; 890 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 891 continue; 892 if (ret) 893 break; 894 next: 895 bch2_btree_iter_next_node(&iter); 896 } 897 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 898 goto retry; 899 900 bch2_trans_iter_exit(trans, &iter); 901 902 if (kthread && kthread_should_stop()) 903 break; 904 } 905 906 bch_err_fn(c, ret); 907 bch2_moving_ctxt_exit(&ctxt); 908 bch2_btree_interior_updates_flush(c); 909 910 return ret; 911 } 912 913 static bool rereplicate_pred(struct bch_fs *c, void *arg, 914 struct bkey_s_c k, 915 struct bch_io_opts *io_opts, 916 struct data_update_opts *data_opts) 917 { 918 unsigned nr_good = bch2_bkey_durability(c, k); 919 unsigned replicas = bkey_is_btree_ptr(k.k) 920 ? c->opts.metadata_replicas 921 : io_opts->data_replicas; 922 923 if (!nr_good || nr_good >= replicas) 924 return false; 925 926 data_opts->target = 0; 927 data_opts->extra_replicas = replicas - nr_good; 928 data_opts->btree_insert_flags = 0; 929 return true; 930 } 931 932 static bool migrate_pred(struct bch_fs *c, void *arg, 933 struct bkey_s_c k, 934 struct bch_io_opts *io_opts, 935 struct data_update_opts *data_opts) 936 { 937 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 938 struct bch_ioctl_data *op = arg; 939 unsigned i = 0; 940 941 data_opts->rewrite_ptrs = 0; 942 data_opts->target = 0; 943 data_opts->extra_replicas = 0; 944 data_opts->btree_insert_flags = 0; 945 946 bkey_for_each_ptr(ptrs, ptr) { 947 if (ptr->dev == op->migrate.dev) 948 data_opts->rewrite_ptrs |= 1U << i; 949 i++; 950 } 951 952 return data_opts->rewrite_ptrs != 0; 953 } 954 955 static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, 956 struct btree *b, 957 struct bch_io_opts *io_opts, 958 struct data_update_opts *data_opts) 959 { 960 return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); 961 } 962 963 static bool migrate_btree_pred(struct bch_fs *c, void *arg, 964 struct btree *b, 965 struct bch_io_opts *io_opts, 966 struct data_update_opts *data_opts) 967 { 968 return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); 969 } 970 971 /* 972 * Ancient versions of bcachefs produced packed formats which could represent 973 * keys that the in memory format cannot represent; this checks for those 974 * formats so we can get rid of them. 975 */ 976 static bool bformat_needs_redo(struct bkey_format *f) 977 { 978 for (unsigned i = 0; i < f->nr_fields; i++) { 979 unsigned f_bits = f->bits_per_field[i]; 980 unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; 981 u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); 982 u64 field_offset = le64_to_cpu(f->field_offset[i]); 983 984 if (f_bits > unpacked_bits) 985 return true; 986 987 if ((f_bits == unpacked_bits) && field_offset) 988 return true; 989 990 u64 f_mask = f_bits 991 ? ~((~0ULL << (f_bits - 1)) << 1) 992 : 0; 993 994 if (((field_offset + f_mask) & unpacked_mask) < field_offset) 995 return true; 996 } 997 998 return false; 999 } 1000 1001 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, 1002 struct btree *b, 1003 struct bch_io_opts *io_opts, 1004 struct data_update_opts *data_opts) 1005 { 1006 if (b->version_ondisk != c->sb.version || 1007 btree_node_need_rewrite(b) || 1008 bformat_needs_redo(&b->format)) { 1009 data_opts->target = 0; 1010 data_opts->extra_replicas = 0; 1011 data_opts->btree_insert_flags = 0; 1012 return true; 1013 } 1014 1015 return false; 1016 } 1017 1018 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) 1019 { 1020 int ret; 1021 1022 ret = bch2_move_btree(c, 1023 BBPOS_MIN, 1024 BBPOS_MAX, 1025 rewrite_old_nodes_pred, c, stats); 1026 if (!ret) { 1027 mutex_lock(&c->sb_lock); 1028 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); 1029 c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); 1030 c->disk_sb.sb->version_min = c->disk_sb.sb->version; 1031 bch2_write_super(c); 1032 mutex_unlock(&c->sb_lock); 1033 } 1034 1035 bch_err_fn(c, ret); 1036 return ret; 1037 } 1038 1039 static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, 1040 struct bkey_s_c k, 1041 struct bch_io_opts *io_opts, 1042 struct data_update_opts *data_opts) 1043 { 1044 unsigned durability = bch2_bkey_durability(c, k); 1045 unsigned replicas = bkey_is_btree_ptr(k.k) 1046 ? c->opts.metadata_replicas 1047 : io_opts->data_replicas; 1048 const union bch_extent_entry *entry; 1049 struct extent_ptr_decoded p; 1050 unsigned i = 0; 1051 1052 bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { 1053 unsigned d = bch2_extent_ptr_durability(c, &p); 1054 1055 if (d && durability - d >= replicas) { 1056 data_opts->kill_ptrs |= BIT(i); 1057 durability -= d; 1058 } 1059 1060 i++; 1061 } 1062 1063 return data_opts->kill_ptrs != 0; 1064 } 1065 1066 static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, 1067 struct btree *b, 1068 struct bch_io_opts *io_opts, 1069 struct data_update_opts *data_opts) 1070 { 1071 return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); 1072 } 1073 1074 int bch2_data_job(struct bch_fs *c, 1075 struct bch_move_stats *stats, 1076 struct bch_ioctl_data op) 1077 { 1078 struct bbpos start = BBPOS(op.start_btree, op.start_pos); 1079 struct bbpos end = BBPOS(op.end_btree, op.end_pos); 1080 int ret = 0; 1081 1082 if (op.op >= BCH_DATA_OP_NR) 1083 return -EINVAL; 1084 1085 bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); 1086 1087 switch (op.op) { 1088 case BCH_DATA_OP_rereplicate: 1089 stats->data_type = BCH_DATA_journal; 1090 ret = bch2_journal_flush_device_pins(&c->journal, -1); 1091 ret = bch2_move_btree(c, start, end, 1092 rereplicate_btree_pred, c, stats) ?: ret; 1093 ret = bch2_move_data(c, start, end, 1094 NULL, 1095 stats, 1096 writepoint_hashed((unsigned long) current), 1097 true, 1098 rereplicate_pred, c) ?: ret; 1099 ret = bch2_replicas_gc2(c) ?: ret; 1100 break; 1101 case BCH_DATA_OP_migrate: 1102 if (op.migrate.dev >= c->sb.nr_devices) 1103 return -EINVAL; 1104 1105 stats->data_type = BCH_DATA_journal; 1106 ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); 1107 ret = bch2_move_btree(c, start, end, 1108 migrate_btree_pred, &op, stats) ?: ret; 1109 ret = bch2_move_data(c, start, end, 1110 NULL, 1111 stats, 1112 writepoint_hashed((unsigned long) current), 1113 true, 1114 migrate_pred, &op) ?: ret; 1115 ret = bch2_replicas_gc2(c) ?: ret; 1116 break; 1117 case BCH_DATA_OP_rewrite_old_nodes: 1118 ret = bch2_scan_old_btree_nodes(c, stats); 1119 break; 1120 case BCH_DATA_OP_drop_extra_replicas: 1121 ret = bch2_move_btree(c, start, end, 1122 drop_extra_replicas_btree_pred, c, stats) ?: ret; 1123 ret = bch2_move_data(c, start, end, NULL, stats, 1124 writepoint_hashed((unsigned long) current), 1125 true, 1126 drop_extra_replicas_pred, c) ?: ret; 1127 ret = bch2_replicas_gc2(c) ?: ret; 1128 break; 1129 default: 1130 ret = -EINVAL; 1131 } 1132 1133 bch2_move_stats_exit(stats, c); 1134 return ret; 1135 } 1136 1137 void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) 1138 { 1139 prt_printf(out, "%s: data type==", stats->name); 1140 bch2_prt_data_type(out, stats->data_type); 1141 prt_str(out, " pos="); 1142 bch2_bbpos_to_text(out, stats->pos); 1143 prt_newline(out); 1144 printbuf_indent_add(out, 2); 1145 1146 prt_str(out, "keys moved: "); 1147 prt_u64(out, atomic64_read(&stats->keys_moved)); 1148 prt_newline(out); 1149 1150 prt_str(out, "keys raced: "); 1151 prt_u64(out, atomic64_read(&stats->keys_raced)); 1152 prt_newline(out); 1153 1154 prt_str(out, "bytes seen: "); 1155 prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); 1156 prt_newline(out); 1157 1158 prt_str(out, "bytes moved: "); 1159 prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); 1160 prt_newline(out); 1161 1162 prt_str(out, "bytes raced: "); 1163 prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); 1164 prt_newline(out); 1165 1166 printbuf_indent_sub(out, 2); 1167 } 1168 1169 static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) 1170 { 1171 struct moving_io *io; 1172 1173 bch2_move_stats_to_text(out, ctxt->stats); 1174 printbuf_indent_add(out, 2); 1175 1176 prt_printf(out, "reads: ios %u/%u sectors %u/%u", 1177 atomic_read(&ctxt->read_ios), 1178 c->opts.move_ios_in_flight, 1179 atomic_read(&ctxt->read_sectors), 1180 c->opts.move_bytes_in_flight >> 9); 1181 prt_newline(out); 1182 1183 prt_printf(out, "writes: ios %u/%u sectors %u/%u", 1184 atomic_read(&ctxt->write_ios), 1185 c->opts.move_ios_in_flight, 1186 atomic_read(&ctxt->write_sectors), 1187 c->opts.move_bytes_in_flight >> 9); 1188 prt_newline(out); 1189 1190 printbuf_indent_add(out, 2); 1191 1192 mutex_lock(&ctxt->lock); 1193 list_for_each_entry(io, &ctxt->ios, io_list) 1194 bch2_write_op_to_text(out, &io->write.op); 1195 mutex_unlock(&ctxt->lock); 1196 1197 printbuf_indent_sub(out, 4); 1198 } 1199 1200 void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) 1201 { 1202 struct moving_context *ctxt; 1203 1204 mutex_lock(&c->moving_context_lock); 1205 list_for_each_entry(ctxt, &c->moving_context_list, list) 1206 bch2_moving_ctxt_to_text(out, c, ctxt); 1207 mutex_unlock(&c->moving_context_lock); 1208 } 1209 1210 void bch2_fs_move_init(struct bch_fs *c) 1211 { 1212 INIT_LIST_HEAD(&c->moving_context_list); 1213 mutex_init(&c->moving_context_lock); 1214 } 1215