// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent(c, buf.buf);
		printbuf_exit(&buf);
	}
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_read_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent_read(c, buf.buf);
		printbuf_exit(&buf);
	}
}

static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_alloc_mem_fail_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent_alloc_mem_fail(c, buf.buf);
		printbuf_exit(&buf);
	}
}

static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
{
	mutex_lock(&c->data_progress_lock);
	list_add(&stats->list, &c->data_progress_list);
	mutex_unlock(&c->data_progress_lock);
}

static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
{
	mutex_lock(&c->data_progress_lock);
	list_del(&stats->list);
	mutex_unlock(&c->data_progress_lock);
}

struct moving_io {
	struct list_head		read_list;
	struct list_head		io_list;
	struct move_bucket_in_flight	*b;
	struct closure			cl;
	bool				read_completed;

	unsigned			read_sectors;
	unsigned			write_sectors;

	struct bch_read_bio		rbio;

	struct data_update		write;
	/* Must be last since it is variable size */
	struct bio_vec			bi_inline_vecs[0];
};

static void move_free(struct moving_io *io)
{
	struct moving_context *ctxt = io->write.ctxt;

	if (io->b)
		atomic_dec(&io->b->count);

	bch2_data_update_exit(&io->write);

	mutex_lock(&ctxt->lock);
	list_del(&io->io_list);
	wake_up(&ctxt->wait);
	mutex_unlock(&ctxt->lock);

	kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
	struct moving_io *io = container_of(op, struct moving_io, write.op);
	struct moving_context *ctxt = io->write.ctxt;

	if (io->write.op.error)
		ctxt->write_error = true;

	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_dec(&io->write.ctxt->write_ios);
	move_free(io);
	closure_put(&ctxt->cl);
}

static void move_write(struct moving_io *io)
{
	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
		move_free(io);
		return;
	}

	closure_get(&io->write.ctxt->cl);
	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_inc(&io->write.ctxt->write_ios);

	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

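/*
 * Reads are issued in order and tracked on ctxt->reads; return the oldest
 * read iff its read has completed, so that writes are also issued in order.
 */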
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
	struct moving_io *io =
		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

	return io && io->read_completed ? io : NULL;
}

static void move_read_endio(struct bio *bio)
{
	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
	struct moving_context *ctxt = io->write.ctxt;

	atomic_sub(io->read_sectors, &ctxt->read_sectors);
	atomic_dec(&ctxt->read_ios);
	io->read_completed = true;

	wake_up(&ctxt->wait);
	closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
					struct btree_trans *trans)
{
	struct moving_io *io;

	if (trans)
		bch2_trans_unlock(trans);

	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
		list_del(&io->read_list);
		move_write(io);
	}
}

static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
				       struct btree_trans *trans)
{
	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

	move_ctxt_wait_event(ctxt, trans,
		!atomic_read(&ctxt->write_sectors) ||
		atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->c;

	move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
	closure_sync(&ctxt->cl);

	EBUG_ON(atomic_read(&ctxt->write_sectors));
	EBUG_ON(atomic_read(&ctxt->write_ios));
	EBUG_ON(atomic_read(&ctxt->read_sectors));
	EBUG_ON(atomic_read(&ctxt->read_ios));

	if (ctxt->stats) {
		progress_list_del(c, ctxt->stats);
		trace_move_data(c,
				atomic64_read(&ctxt->stats->sectors_moved),
				atomic64_read(&ctxt->stats->keys_moved));
	}

	mutex_lock(&c->moving_context_lock);
	list_del(&ctxt->list);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
			   struct bch_fs *c,
			   struct bch_ratelimit *rate,
			   struct bch_move_stats *stats,
			   struct write_point_specifier wp,
			   bool wait_on_copygc)
{
	memset(ctxt, 0, sizeof(*ctxt));

	ctxt->c			= c;
	ctxt->fn		= (void *) _RET_IP_;
	ctxt->rate		= rate;
	ctxt->stats		= stats;
	ctxt->wp		= wp;
	ctxt->wait_on_copygc	= wait_on_copygc;

	closure_init_stack(&ctxt->cl);

	mutex_init(&ctxt->lock);
	INIT_LIST_HEAD(&ctxt->reads);
	INIT_LIST_HEAD(&ctxt->ios);
	init_waitqueue_head(&ctxt->wait);

	mutex_lock(&c->moving_context_lock);
	list_add(&ctxt->list, &c->moving_context_list);
	mutex_unlock(&c->moving_context_lock);

	if (stats) {
		progress_list_add(c, stats);
		stats->data_type = BCH_DATA_user;
	}
}

void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
	memset(stats, 0, sizeof(*stats));
	scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

static int bch2_extent_drop_ptrs(struct btree_trans *trans,
				 struct btree_iter *iter,
				 struct bkey_s_c k,
				 struct data_update_opts data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_i *n;
	int ret;

	n = bch2_bkey_make_mut_noupdate(trans, k);
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	while (data_opts.kill_ptrs) {
		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
		struct bch_extent_ptr *ptr;

		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
		data_opts.kill_ptrs ^= 1U << drop;
	}

	/*
	 * If the new extent no longer has any pointers, bch2_extent_normalize()
	 * will do the appropriate thing with it (turning it into a
	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
	 */
	bch2_extent_normalize(c, bkey_i_to_s(n));

	/*
	 * Since we're not inserting through an extent iterator
	 * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
	 * we aren't using the extent overwrite path to delete, we're
	 * just using the normal key deletion path:
	 */
	if (bkey_deleted(&n->k))
		n->k.size = 0;

	return bch2_trans_relock(trans) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}

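/*
 * Read the extent @k and queue it for rewriting according to @data_opts:
 * allocate a moving_io, issue the read, and arrange for the write (a
 * bch2_data_update) to be submitted once the read completes.
 */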
static int bch2_move_extent(struct btree_trans *trans,
			    struct btree_iter *iter,
			    struct moving_context *ctxt,
			    struct move_bucket_in_flight *bucket_in_flight,
			    struct bch_io_opts io_opts,
			    enum btree_id btree_id,
			    struct bkey_s_c k,
			    struct data_update_opts data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct moving_io *io;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned sectors = k.k->size, pages;
	int ret = -ENOMEM;

	trace_move_extent2(c, k);

	bch2_data_update_opts_normalize(k, &data_opts);

	if (!data_opts.rewrite_ptrs &&
	    !data_opts.extra_replicas) {
		if (data_opts.kill_ptrs)
			return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
		return 0;
	}

	/*
	 * Before memory allocations & taking nocow locks in
	 * bch2_data_update_init():
	 */
	bch2_trans_unlock(trans);

	/* write path might have to decompress data: */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	io = kzalloc(sizeof(struct moving_io) +
		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
	if (!io)
		goto err;

	INIT_LIST_HEAD(&io->io_list);
	io->write.ctxt		= ctxt;
	io->read_sectors	= k.k->size;
	io->write_sectors	= k.k->size;

	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	bio_set_prio(&io->write.op.wbio.bio,
		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
				 GFP_KERNEL))
		goto err_free;

	io->rbio.c		= c;
	io->rbio.opts		= io_opts;
	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	io->rbio.bio.bi_vcnt = pages;
	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
	io->rbio.bio.bi_iter.bi_size = sectors << 9;

	io->rbio.bio.bi_opf		= REQ_OP_READ;
	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
	io->rbio.bio.bi_end_io		= move_read_endio;

	ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
				    io_opts, data_opts, btree_id, k);
	if (ret && ret != -BCH_ERR_unwritten_extent_update)
		goto err_free_pages;

	if (ret == -BCH_ERR_unwritten_extent_update) {
		bch2_update_unwritten_extent(trans, &io->write);
		move_free(io);
		return 0;
	}

	BUG_ON(ret);

	io->write.ctxt = ctxt;
	io->write.op.end_io = move_write_done;

	if (ctxt->stats) {
		atomic64_inc(&ctxt->stats->keys_moved);
		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
	}

	if (bucket_in_flight) {
		io->b = bucket_in_flight;
		atomic_inc(&io->b->count);
	}

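	/*
	 * Setup is done: account the move, add this io to the context's
	 * lists, and kick off the read - the write is submitted later, once
	 * bch2_moving_ctxt_do_pending_writes() sees that the read completed.
	 */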
	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
	trace_move_extent_read2(c, k);

	mutex_lock(&ctxt->lock);
	atomic_add(io->read_sectors, &ctxt->read_sectors);
	atomic_inc(&ctxt->read_ios);

	list_add_tail(&io->read_list, &ctxt->reads);
	list_add_tail(&io->io_list, &ctxt->ios);
	mutex_unlock(&ctxt->lock);

	/*
	 * dropped by move_read_endio() - guards against use after free of
	 * ctxt when doing wakeup
	 */
	closure_get(&ctxt->cl);
	bch2_read_extent(trans, &io->rbio,
			 bkey_start_pos(k.k),
			 btree_id, k, 0,
			 BCH_READ_NODECODE|
			 BCH_READ_LAST_FRAGMENT);
	return 0;
err_free_pages:
	bio_free_pages(&io->write.op.wbio.bio);
err_free:
	kfree(io);
err:
	this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
	trace_move_extent_alloc_mem_fail2(c, k);
	return ret;
}

static int lookup_inode(struct btree_trans *trans, struct bpos pos,
			struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
			     BTREE_ITER_ALL_SNAPSHOTS);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!k.k || !bkey_eq(k.k->p, pos)) {
		ret = -BCH_ERR_ENOENT_inode;
		goto err;
	}

	ret = bkey_is_inode(k.k) ? 0 : -EIO;
	if (ret)
		goto err;

	ret = bch2_inode_unpack(k, inode);
	if (ret)
		goto err;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

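/*
 * Throttle the move path: optionally wait for copygc to finish, honour
 * ctxt->rate if set, and block until in-flight reads and writes drop below
 * the move_bytes_in_flight/move_ios_in_flight limits. Returns nonzero if the
 * kthread running us should stop.
 */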
static int move_ratelimit(struct btree_trans *trans,
			  struct moving_context *ctxt)
{
	struct bch_fs *c = trans->c;
	u64 delay;

	if (ctxt->wait_on_copygc) {
		bch2_trans_unlock(trans);
		wait_event_killable(c->copygc_running_wq,
				    !c->copygc_running ||
				    kthread_should_stop());
	}

	do {
		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

		if (delay) {
			bch2_trans_unlock(trans);
			set_current_state(TASK_INTERRUPTIBLE);
		}

		if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			return 1;
		}

		if (delay)
			schedule_timeout(delay);

		if (unlikely(freezing(current))) {
			move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
			try_to_freeze();
		}
	} while (delay);

	/*
	 * XXX: these limits really ought to be per device, SSDs and hard drives
	 * will want different limits
	 */
	move_ctxt_wait_event(ctxt, trans,
		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

	return 0;
}

static int move_get_io_opts(struct btree_trans *trans,
			    struct bch_io_opts *io_opts,
			    struct bkey_s_c k, u64 *cur_inum)
{
	struct bch_inode_unpacked inode;
	int ret;

	if (*cur_inum == k.k->p.inode)
		return 0;

	ret = lookup_inode(trans,
			   SPOS(0, k.k->p.inode, k.k->p.snapshot),
			   &inode);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	if (!ret)
		bch2_inode_opts_get(io_opts, trans->c, &inode);
	else
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
	*cur_inum = k.k->p.inode;
	return 0;
}

static int __bch2_move_data(struct moving_context *ctxt,
			    struct bpos start,
			    struct bpos end,
			    move_pred_fn pred, void *arg,
			    enum btree_id btree_id)
{
	struct bch_fs *c = ctxt->c;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct bkey_buf sk;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	u64 cur_inum = U64_MAX;
	int ret = 0, ret2;

	bch2_bkey_buf_init(&sk);

	if (ctxt->stats) {
		ctxt->stats->data_type	= BCH_DATA_user;
		ctxt->stats->btree_id	= btree_id;
		ctxt->stats->pos	= start;
	}

	bch2_trans_iter_init(trans, &iter, btree_id, start,
			     BTREE_ITER_PREFETCH|
			     BTREE_ITER_ALL_SNAPSHOTS);

	if (ctxt->rate)
		bch2_ratelimit_reset(ctxt->rate);

	while (!move_ratelimit(trans, ctxt)) {
		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_ge(bkey_start_pos(k.k), end))
			break;

		if (ctxt->stats)
			ctxt->stats->pos = iter.pos;

		if (!bkey_extent_is_direct_data(k.k))
			goto next_nondata;

		ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
		if (ret)
			continue;

		memset(&data_opts, 0, sizeof(data_opts));
		if (!pred(c, arg, k, &io_opts, &data_opts))
			goto next;

		/*
		 * The iterator gets unlocked by __bch2_read_extent - need to
		 * save a copy of @k elsewhere:
		 */
		bch2_bkey_buf_reassemble(&sk, c, k);
		k = bkey_i_to_s_c(sk.k);

		ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
					io_opts, btree_id, k, data_opts);
		if (ret2) {
			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
				continue;

			if (ret2 == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt, trans);
				continue;
			}

			/* XXX signal failure */
			goto next;
		}

		if (ctxt->rate)
			bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
		if (ctxt->stats)
			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
		bch2_btree_iter_advance(&iter);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	return ret;
}

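/*
 * Walk each btree in [@start_btree_id, @end_btree_id] that carries user data
 * (extents and reflink), from @start_pos in the first btree to @end_pos in
 * the last, moving the extents that @pred selects.
 */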
int bch2_move_data(struct bch_fs *c,
		   enum btree_id start_btree_id, struct bpos start_pos,
		   enum btree_id end_btree_id, struct bpos end_pos,
		   struct bch_ratelimit *rate,
		   struct bch_move_stats *stats,
		   struct write_point_specifier wp,
		   bool wait_on_copygc,
		   move_pred_fn pred, void *arg)
{
	struct moving_context ctxt;
	enum btree_id id;
	int ret = 0;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);

	for (id = start_btree_id;
	     id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
	     id++) {
		stats->btree_id = id;

		if (id != BTREE_ID_extents &&
		    id != BTREE_ID_reflink)
			continue;

		if (!bch2_btree_id_root(c, id)->b)
			continue;

		ret = __bch2_move_data(&ctxt,
				       id == start_btree_id ? start_pos : POS_MIN,
				       id == end_btree_id ? end_pos : POS_MAX,
				       pred, arg, id);
		if (ret)
			break;
	}

	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

int __bch2_evacuate_bucket(struct btree_trans *trans,
			   struct moving_context *ctxt,
			   struct move_bucket_in_flight *bucket_in_flight,
			   struct bpos bucket, int gen,
			   struct data_update_opts _data_opts)
{
	struct bch_fs *c = ctxt->c;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bch_backpointer bp;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	unsigned dirty_sectors, bucket_size;
	u64 fragmentation;
	u64 cur_inum = U64_MAX;
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	trace_bucket_evacuate(c, &bucket);

	bch2_bkey_buf_init(&sk);

	/*
	 * We're not run in a context that handles transaction restarts:
	 */
	bch2_trans_begin(trans);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     bucket, BTREE_ITER_CACHED);
	ret = lockrestart_do(trans,
			bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	bch2_trans_iter_exit(trans, &iter);

	if (ret) {
		bch_err_msg(c, ret, "looking up alloc key");
		goto err;
	}

	a = bch2_alloc_to_v4(k, &a_convert);
	dirty_sectors = a->dirty_sectors;
	bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
	fragmentation = a->fragmentation_lru;

	ret = bch2_btree_write_buffer_flush(trans);
	if (ret) {
		bch_err_msg(c, ret, "flushing btree write buffer");
		goto err;
	}

	while (!(ret = move_ratelimit(trans, ctxt))) {
		bch2_trans_begin(trans);

		ret = bch2_get_next_backpointer(trans, bucket, gen,
						&bp_pos, &bp,
						BTREE_ITER_CACHED);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			goto err;
		if (bkey_eq(bp_pos, POS_MAX))
			break;

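		/*
		 * Leaf backpointers (level 0) point at extents, which we move
		 * with bch2_move_extent(); higher level backpointers point at
		 * btree nodes, which we evacuate by rewriting the node.
		 */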
		if (!bp.level) {
			const struct bch_extent_ptr *ptr;
			unsigned i = 0;

			k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
			ret = bkey_err(k);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!k.k)
				goto next;

			bch2_bkey_buf_reassemble(&sk, c, k);
			k = bkey_i_to_s_c(sk.k);

			ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
			if (ret) {
				bch2_trans_iter_exit(trans, &iter);
				continue;
			}

			data_opts = _data_opts;
			data_opts.target = io_opts.background_target;
			data_opts.rewrite_ptrs = 0;

			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
				if (ptr->dev == bucket.inode) {
					data_opts.rewrite_ptrs |= 1U << i;
					if (ptr->cached) {
						bch2_trans_iter_exit(trans, &iter);
						goto next;
					}
				}
				i++;
			}

			ret = bch2_move_extent(trans, &iter, ctxt,
					       bucket_in_flight,
					       io_opts, bp.btree_id, k, data_opts);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt, trans);
				continue;
			}
			if (ret)
				goto err;

			if (ctxt->rate)
				bch2_ratelimit_increment(ctxt->rate, k.k->size);
			if (ctxt->stats)
				atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
		} else {
			struct btree *b;

			b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
			ret = PTR_ERR_OR_ZERO(b);
			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!b)
				goto next;

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;

			if (ctxt->rate)
				bch2_ratelimit_increment(ctxt->rate,
							 c->opts.btree_node_size >> 9);
			if (ctxt->stats) {
				atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
				atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
			}
		}
next:
		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

int bch2_evacuate_bucket(struct bch_fs *c,
			 struct bpos bucket, int gen,
			 struct data_update_opts data_opts,
			 struct bch_ratelimit *rate,
			 struct bch_move_stats *stats,
			 struct write_point_specifier wp,
			 bool wait_on_copygc)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct moving_context ctxt;
	int ret;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
	ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
	bch2_moving_ctxt_exit(&ctxt);
	bch2_trans_put(trans);

	return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
				struct btree *, struct bch_io_opts *,
				struct data_update_opts *);

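/*
 * Like bch2_move_data(), but for btree nodes: walk the nodes of each btree in
 * the given range and rewrite those that @pred selects.
 */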
static int bch2_move_btree(struct bch_fs *c,
			   enum btree_id start_btree_id, struct bpos start_pos,
			   enum btree_id end_btree_id, struct bpos end_pos,
			   move_btree_pred pred, void *arg,
			   struct bch_move_stats *stats)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct btree *b;
	enum btree_id id;
	struct data_update_opts data_opts;
	int ret = 0;

	progress_list_add(c, stats);

	stats->data_type = BCH_DATA_btree;

	for (id = start_btree_id;
	     id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
	     id++) {
		stats->btree_id = id;

		if (!bch2_btree_id_root(c, id)->b)
			continue;

		bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
					  BTREE_ITER_PREFETCH);
retry:
		ret = 0;
		while (bch2_trans_begin(trans),
		       (b = bch2_btree_iter_peek_node(&iter)) &&
		       !(ret = PTR_ERR_OR_ZERO(b))) {
			if (kthread && kthread_should_stop())
				break;

			if ((cmp_int(id, end_btree_id) ?:
			     bpos_cmp(b->key.k.p, end_pos)) > 0)
				break;

			stats->pos = iter.pos;

			if (!pred(c, arg, b, &io_opts, &data_opts))
				goto next;

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				break;
next:
			bch2_btree_iter_next_node(&iter);
		}
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;

		bch2_trans_iter_exit(trans, &iter);

		if (kthread && kthread_should_stop())
			break;
	}

	bch2_trans_put(trans);

	if (ret)
		bch_err_fn(c, ret);

	bch2_btree_interior_updates_flush(c);

	progress_list_del(c, stats);
	return ret;
}

static bool rereplicate_pred(struct bch_fs *c, void *arg,
			     struct bkey_s_c k,
			     struct bch_io_opts *io_opts,
			     struct data_update_opts *data_opts)
{
	unsigned nr_good = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;

	if (!nr_good || nr_good >= replicas)
		return false;

	data_opts->target		= 0;
	data_opts->extra_replicas	= replicas - nr_good;
	data_opts->btree_insert_flags	= 0;
	return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
			 struct bkey_s_c k,
			 struct bch_io_opts *io_opts,
			 struct data_update_opts *data_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	struct bch_ioctl_data *op = arg;
	unsigned i = 0;

	data_opts->rewrite_ptrs		= 0;
	data_opts->target		= 0;
	data_opts->extra_replicas	= 0;
	data_opts->btree_insert_flags	= 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == op->migrate.dev)
			data_opts->rewrite_ptrs |= 1U << i;
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
			       struct btree *b,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
{
	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool bformat_needs_redo(struct bkey_format *f)
{
	unsigned i;

	for (i = 0; i < f->nr_fields; i++) {
		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
		u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
		u64 field_offset = le64_to_cpu(f->field_offset[i]);

		if (f->bits_per_field[i] > unpacked_bits)
			return true;

		if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
			return true;

		if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
		     unpacked_mask) <
		    field_offset)
			return true;
	}

	return false;
}

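/*
 * Select btree nodes that need rewriting for compatibility: written with an
 * old on-disk version, explicitly flagged for rewrite, or using a key format
 * that can overflow the current unpacked format (see bformat_needs_redo()).
 */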
static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	if (b->version_ondisk != c->sb.version ||
	    btree_node_need_rewrite(b) ||
	    bformat_needs_redo(&b->format)) {
		data_opts->target		= 0;
		data_opts->extra_replicas	= 0;
		data_opts->btree_insert_flags	= 0;
		return true;
	}

	return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
	int ret;

	ret = bch2_move_btree(c,
			      0, POS_MIN,
			      BTREE_ID_NR, SPOS_MAX,
			      rewrite_old_nodes_pred, c, stats);
	if (!ret) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

int bch2_data_job(struct bch_fs *c,
		  struct bch_move_stats *stats,
		  struct bch_ioctl_data op)
{
	int ret = 0;

	switch (op.op) {
	case BCH_DATA_OP_REREPLICATE:
		bch2_move_stats_init(stats, "rereplicate");
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, -1);

		ret = bch2_move_btree(c,
				      op.start_btree, op.start_pos,
				      op.end_btree, op.end_pos,
				      rereplicate_btree_pred, c, stats) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;

		ret = bch2_move_data(c,
				     op.start_btree, op.start_pos,
				     op.end_btree, op.end_pos,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     rereplicate_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_MIGRATE:
		if (op.migrate.dev >= c->sb.nr_devices)
			return -EINVAL;

		bch2_move_stats_init(stats, "migrate");
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);

		ret = bch2_move_btree(c,
				      op.start_btree, op.start_pos,
				      op.end_btree, op.end_pos,
				      migrate_btree_pred, &op, stats) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;

		ret = bch2_move_data(c,
				     op.start_btree, op.start_pos,
				     op.end_btree, op.end_pos,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     migrate_pred, &op) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_REWRITE_OLD_NODES:
		bch2_move_stats_init(stats, "rewrite_old_nodes");
		ret = bch2_scan_old_btree_nodes(c, stats);
		break;
	default:
		ret = -EINVAL;
	}

	return ret;
}

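/*
 * Debug reporting: format the state of a moving_context - position, and
 * in-flight reads and writes with their write ops - into @out.
 */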
static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
	struct bch_move_stats *stats = ctxt->stats;
	struct moving_io *io;

	prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
	prt_newline(out);

	prt_printf(out, " data type %s btree_id %s position: ",
		   bch2_data_types[stats->data_type],
		   bch2_btree_ids[stats->btree_id]);
	bch2_bpos_to_text(out, stats->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "reads: ios %u/%u sectors %u/%u",
		   atomic_read(&ctxt->read_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->read_sectors),
		   c->opts.move_bytes_in_flight >> 9);
	prt_newline(out);

	prt_printf(out, "writes: ios %u/%u sectors %u/%u",
		   atomic_read(&ctxt->write_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->write_sectors),
		   c->opts.move_bytes_in_flight >> 9);
	prt_newline(out);

	printbuf_indent_add(out, 2);

	mutex_lock(&ctxt->lock);
	list_for_each_entry(io, &ctxt->ios, io_list)
		bch2_write_op_to_text(out, &io->write.op);
	mutex_unlock(&ctxt->lock);

	printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct moving_context *ctxt;

	mutex_lock(&c->moving_context_lock);
	list_for_each_entry(ctxt, &c->moving_context_list, list)
		bch2_moving_ctxt_to_text(out, c, ctxt);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
	INIT_LIST_HEAD(&c->moving_context_list);
	mutex_init(&c->moving_context_lock);

	INIT_LIST_HEAD(&c->data_progress_list);
	mutex_init(&c->data_progress_lock);
}