// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent(c, buf.buf);
		printbuf_exit(&buf);
	}
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_read_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent_read(c, buf.buf);
		printbuf_exit(&buf);
	}
}

static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_alloc_mem_fail_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent_alloc_mem_fail(c, buf.buf);
		printbuf_exit(&buf);
	}
}

struct moving_io {
	struct list_head		read_list;
	struct list_head		io_list;
	struct move_bucket_in_flight	*b;
	struct closure			cl;
	bool				read_completed;

	unsigned			read_sectors;
	unsigned			write_sectors;

	struct bch_read_bio		rbio;

	struct data_update		write;
	/* Must be last since it is variable size */
	struct bio_vec			bi_inline_vecs[0];
};

static void move_free(struct moving_io *io)
{
	struct moving_context *ctxt = io->write.ctxt;

	if (io->b)
		atomic_dec(&io->b->count);

	bch2_data_update_exit(&io->write);

	mutex_lock(&ctxt->lock);
	list_del(&io->io_list);
	wake_up(&ctxt->wait);
	mutex_unlock(&ctxt->lock);

	kfree(io);
}

static void move_write_done(struct bch_write_op *op)
{
	struct moving_io *io = container_of(op, struct moving_io, write.op);
	struct moving_context *ctxt = io->write.ctxt;

	if (io->write.op.error)
		ctxt->write_error = true;

	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_dec(&io->write.ctxt->write_ios);
	move_free(io);
	closure_put(&ctxt->cl);
}

static void move_write(struct moving_io *io)
{
	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
		move_free(io);
		return;
	}

	closure_get(&io->write.ctxt->cl);
	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_inc(&io->write.ctxt->write_ios);

	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
	struct moving_io *io =
		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

	return io && io->read_completed ?
		io : NULL;
}

static void move_read_endio(struct bio *bio)
{
	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
	struct moving_context *ctxt = io->write.ctxt;

	atomic_sub(io->read_sectors, &ctxt->read_sectors);
	atomic_dec(&ctxt->read_ios);
	io->read_completed = true;

	wake_up(&ctxt->wait);
	closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
	struct moving_io *io;

	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
		bch2_trans_unlock_long(ctxt->trans);
		list_del(&io->read_list);
		move_write(io);
	}
}

void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

	move_ctxt_wait_event(ctxt,
		!atomic_read(&ctxt->write_sectors) ||
		atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;

	move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
	closure_sync(&ctxt->cl);

	EBUG_ON(atomic_read(&ctxt->write_sectors));
	EBUG_ON(atomic_read(&ctxt->write_ios));
	EBUG_ON(atomic_read(&ctxt->read_sectors));
	EBUG_ON(atomic_read(&ctxt->read_ios));

	mutex_lock(&c->moving_context_lock);
	list_del(&ctxt->list);
	mutex_unlock(&c->moving_context_lock);

	bch2_trans_put(ctxt->trans);
	memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
			   struct bch_fs *c,
			   struct bch_ratelimit *rate,
			   struct bch_move_stats *stats,
			   struct write_point_specifier wp,
			   bool wait_on_copygc)
{
	memset(ctxt, 0, sizeof(*ctxt));

	ctxt->trans	= bch2_trans_get(c);
	ctxt->fn	= (void *) _RET_IP_;
	ctxt->rate	= rate;
	ctxt->stats	= stats;
	ctxt->wp	= wp;
	ctxt->wait_on_copygc = wait_on_copygc;

	closure_init_stack(&ctxt->cl);

	mutex_init(&ctxt->lock);
	INIT_LIST_HEAD(&ctxt->reads);
	INIT_LIST_HEAD(&ctxt->ios);
	init_waitqueue_head(&ctxt->wait);

	mutex_lock(&c->moving_context_lock);
	list_add(&ctxt->list, &c->moving_context_list);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
	trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
	memset(stats, 0, sizeof(*stats));
	stats->data_type = BCH_DATA_user;
	scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

static int bch2_extent_drop_ptrs(struct btree_trans *trans,
				 struct btree_iter *iter,
				 struct bkey_s_c k,
				 struct data_update_opts data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_i *n;
	int ret;

	n = bch2_bkey_make_mut_noupdate(trans, k);
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	while (data_opts.kill_ptrs) {
		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
		struct bch_extent_ptr *ptr;

		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
		data_opts.kill_ptrs ^= 1U << drop;
	}

	/*
	 * If the new extent no longer has any pointers, bch2_extent_normalize()
	 * will do the appropriate thing with it (turning it into a
	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
	 */
	bch2_extent_normalize(c, bkey_i_to_s(n));

	/*
	 * Since we're not inserting through an extent
	 * iterator (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
	 * we aren't using the extent overwrite path to delete, we're
	 * just using the normal key deletion path:
	 */
	if (bkey_deleted(&n->k))
		n->k.size = 0;

	return bch2_trans_relock(trans) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}

int bch2_move_extent(struct moving_context *ctxt,
		     struct move_bucket_in_flight *bucket_in_flight,
		     struct btree_iter *iter,
		     struct bkey_s_c k,
		     struct bch_io_opts io_opts,
		     struct data_update_opts data_opts)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct moving_io *io;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned sectors = k.k->size, pages;
	int ret = -ENOMEM;

	if (ctxt->stats)
		ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
	trace_move_extent2(c, k);

	bch2_data_update_opts_normalize(k, &data_opts);

	if (!data_opts.rewrite_ptrs &&
	    !data_opts.extra_replicas) {
		if (data_opts.kill_ptrs)
			return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
		return 0;
	}

	/*
	 * Before memory allocations & taking nocow locks in
	 * bch2_data_update_init():
	 */
	bch2_trans_unlock(trans);

	/* write path might have to decompress data: */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	io = kzalloc(sizeof(struct moving_io) +
		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
	if (!io)
		goto err;

	INIT_LIST_HEAD(&io->io_list);
	io->write.ctxt		= ctxt;
	io->read_sectors	= k.k->size;
	io->write_sectors	= k.k->size;

	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	bio_set_prio(&io->write.op.wbio.bio,
		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
				 GFP_KERNEL))
		goto err_free;

	io->rbio.c	= c;
	io->rbio.opts	= io_opts;
	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	io->rbio.bio.bi_vcnt = pages;
	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
	io->rbio.bio.bi_iter.bi_size = sectors << 9;

	io->rbio.bio.bi_opf		= REQ_OP_READ;
	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
	io->rbio.bio.bi_end_io		= move_read_endio;

	ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
				    io_opts, data_opts, iter->btree_id, k);
	if (ret && ret != -BCH_ERR_unwritten_extent_update)
		goto err_free_pages;

	if (ret == -BCH_ERR_unwritten_extent_update) {
		bch2_update_unwritten_extent(trans, &io->write);
		move_free(io);
		return 0;
	}

	BUG_ON(ret);

	io->write.op.end_io = move_write_done;

	if (ctxt->rate)
		bch2_ratelimit_increment(ctxt->rate, k.k->size);

	if (ctxt->stats) {
		atomic64_inc(&ctxt->stats->keys_moved);
		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
	}

	if (bucket_in_flight) {
		io->b = bucket_in_flight;
		atomic_inc(&io->b->count);
	}

	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
	trace_move_extent_read2(c, k);

	mutex_lock(&ctxt->lock);
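	/*
	 * Account this read against the in-flight limits that
	 * bch2_move_ratelimit() waits on; move_read_endio() drops these
	 * counters again when the read completes:
	 */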
	atomic_add(io->read_sectors, &ctxt->read_sectors);
	atomic_inc(&ctxt->read_ios);

	list_add_tail(&io->read_list, &ctxt->reads);
	list_add_tail(&io->io_list, &ctxt->ios);
	mutex_unlock(&ctxt->lock);

	/*
	 * dropped by move_read_endio() - guards against use after free of
	 * ctxt when doing wakeup
	 */
	closure_get(&ctxt->cl);
	bch2_read_extent(trans, &io->rbio,
			 bkey_start_pos(k.k),
			 iter->btree_id, k, 0,
			 BCH_READ_NODECODE|
			 BCH_READ_LAST_FRAGMENT);
	return 0;
err_free_pages:
	bio_free_pages(&io->write.op.wbio.bio);
err_free:
	kfree(io);
err:
	this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
	trace_move_extent_alloc_mem_fail2(c, k);
	return ret;
}

struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
					  struct per_snapshot_io_opts *io_opts,
					  struct bkey_s_c extent_k)
{
	struct bch_fs *c = trans->c;
	u32 restart_count = trans->restart_count;
	int ret = 0;

	if (io_opts->cur_inum != extent_k.k->p.inode) {
		struct btree_iter iter;
		struct bkey_s_c k;

		io_opts->d.nr = 0;

		for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
				   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
			if (k.k->p.offset != extent_k.k->p.inode)
				break;

			if (!bkey_is_inode(k.k))
				continue;

			struct bch_inode_unpacked inode;
			BUG_ON(bch2_inode_unpack(k, &inode));

			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
			bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

			ret = darray_push(&io_opts->d, e);
			if (ret)
				break;
		}
		bch2_trans_iter_exit(trans, &iter);
		io_opts->cur_inum = extent_k.k->p.inode;
	}

	ret = ret ?: trans_was_restarted(trans, restart_count);
	if (ret)
		return ERR_PTR(ret);

	if (extent_k.k->p.snapshot) {
		struct snapshot_io_opts_entry *i;
		darray_for_each(io_opts->d, i)
			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
				return &i->io_opts;
	}

	return &io_opts->fs_io_opts;
}

int bch2_move_get_io_opts_one(struct btree_trans *trans,
			      struct bch_io_opts *io_opts,
			      struct bkey_s_c extent_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/* reflink btree? */
	if (!extent_k.k->p.inode) {
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
		return 0;
	}

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
			       BTREE_ITER_CACHED);
	ret = bkey_err(k);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	if (!ret && bkey_is_inode(k.k)) {
		struct bch_inode_unpacked inode;
		bch2_inode_unpack(k, &inode);
		bch2_inode_opts_get(io_opts, trans->c, &inode);
	} else {
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
	}

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

int bch2_move_ratelimit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;
	u64 delay;

	if (ctxt->wait_on_copygc && !c->copygc_running) {
		bch2_trans_unlock_long(ctxt->trans);
		wait_event_killable(c->copygc_running_wq,
				    !c->copygc_running ||
				    kthread_should_stop());
	}

	do {
		delay = ctxt->rate ?
			bch2_ratelimit_delay(ctxt->rate) : 0;

		if (delay) {
			if (delay > HZ / 10)
				bch2_trans_unlock_long(ctxt->trans);
			else
				bch2_trans_unlock(ctxt->trans);
			set_current_state(TASK_INTERRUPTIBLE);
		}

		if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			return 1;
		}

		if (delay)
			schedule_timeout(delay);

		if (unlikely(freezing(current))) {
			move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
			try_to_freeze();
		}
	} while (delay);

	/*
	 * XXX: these limits really ought to be per device, SSDs and hard drives
	 * will want different limits
	 */
	move_ctxt_wait_event(ctxt,
		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

	return 0;
}

static int bch2_move_data_btree(struct moving_context *ctxt,
				struct bpos start,
				struct bpos end,
				move_pred_fn pred, void *arg,
				enum btree_id btree_id)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct per_snapshot_io_opts snapshot_io_opts;
	struct bch_io_opts *io_opts;
	struct bkey_buf sk;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	int ret = 0, ret2;

	per_snapshot_io_opts_init(&snapshot_io_opts, c);
	bch2_bkey_buf_init(&sk);

	if (ctxt->stats) {
		ctxt->stats->data_type	= BCH_DATA_user;
		ctxt->stats->pos	= BBPOS(btree_id, start);
	}

	bch2_trans_iter_init(trans, &iter, btree_id, start,
			     BTREE_ITER_PREFETCH|
			     BTREE_ITER_ALL_SNAPSHOTS);

	if (ctxt->rate)
		bch2_ratelimit_reset(ctxt->rate);

	while (!bch2_move_ratelimit(ctxt)) {
		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_ge(bkey_start_pos(k.k), end))
			break;

		if (ctxt->stats)
			ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

		if (!bkey_extent_is_direct_data(k.k))
			goto next_nondata;

		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
		ret = PTR_ERR_OR_ZERO(io_opts);
		if (ret)
			continue;

		memset(&data_opts, 0, sizeof(data_opts));
		if (!pred(c, arg, k, io_opts, &data_opts))
			goto next;

		/*
		 * The iterator gets unlocked by __bch2_read_extent - need to
		 * save a copy of @k elsewhere:
		 */
		bch2_bkey_buf_reassemble(&sk, c, k);
		k = bkey_i_to_s_c(sk.k);

		ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
		if (ret2) {
			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
				continue;

			if (ret2 == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}

			/* XXX signal failure */
			goto next;
		}
next:
		if (ctxt->stats)
			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
		bch2_btree_iter_advance(&iter);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&sk, c);
	per_snapshot_io_opts_exit(&snapshot_io_opts);

	return ret;
}

int __bch2_move_data(struct moving_context *ctxt,
		     struct bbpos start,
		     struct bbpos end,
		     move_pred_fn pred, void *arg)
{
	struct bch_fs *c = ctxt->trans->c;
	enum btree_id id;
	int ret = 0;

	for (id = start.btree;
	     id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     id++) {
		ctxt->stats->pos = BBPOS(id, POS_MIN);

		if (!btree_type_has_ptrs(id) ||
		    !bch2_btree_id_root(c, id)->b)
			continue;

		ret = bch2_move_data_btree(ctxt,
					   id == start.btree ? start.pos : POS_MIN,
					   id == end.btree ? end.pos : POS_MAX,
					   pred, arg, id);
		if (ret)
			break;
	}

	return ret;
}

int bch2_move_data(struct bch_fs *c,
		   struct bbpos start,
		   struct bbpos end,
		   struct bch_ratelimit *rate,
		   struct bch_move_stats *stats,
		   struct write_point_specifier wp,
		   bool wait_on_copygc,
		   move_pred_fn pred, void *arg)
{
	struct moving_context ctxt;
	int ret;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
	ret = __bch2_move_data(&ctxt, start, end, pred, arg);
	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

int __bch2_evacuate_bucket(struct moving_context *ctxt,
			   struct move_bucket_in_flight *bucket_in_flight,
			   struct bpos bucket, int gen,
			   struct data_update_opts _data_opts)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bch_backpointer bp;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	unsigned dirty_sectors, bucket_size;
	u64 fragmentation;
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	trace_bucket_evacuate(c, &bucket);

	bch2_bkey_buf_init(&sk);

	/*
	 * We're not run in a context that handles transaction restarts:
	 */
	bch2_trans_begin(trans);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     bucket, BTREE_ITER_CACHED);
	ret = lockrestart_do(trans,
			bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	bch2_trans_iter_exit(trans, &iter);

	if (ret) {
		bch_err_msg(c, ret, "looking up alloc key");
		goto err;
	}

	a = bch2_alloc_to_v4(k, &a_convert);
	dirty_sectors = a->dirty_sectors;
	bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
	fragmentation = a->fragmentation_lru;

	ret = bch2_btree_write_buffer_flush(trans);
	if (ret) {
		bch_err_msg(c, ret, "flushing btree write buffer");
		goto err;
	}

	while (!(ret = bch2_move_ratelimit(ctxt))) {
		bch2_trans_begin(trans);

		ret = bch2_get_next_backpointer(trans, bucket, gen,
						&bp_pos, &bp,
						BTREE_ITER_CACHED);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			goto err;
		if (bkey_eq(bp_pos, POS_MAX))
			break;

		if (!bp.level) {
			const struct bch_extent_ptr *ptr;
			unsigned i = 0;

			k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
			ret = bkey_err(k);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!k.k)
				goto next;

			bch2_bkey_buf_reassemble(&sk, c, k);
			k = bkey_i_to_s_c(sk.k);

			ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
			if (ret) {
				bch2_trans_iter_exit(trans, &iter);
				continue;
			}

			data_opts = _data_opts;
			data_opts.target	= io_opts.background_target;
			data_opts.rewrite_ptrs	= 0;

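			/*
			 * Only rewrite the pointers that live in the bucket
			 * being evacuated; if one of them is cached, the
			 * cached data doesn't need to be moved and the whole
			 * extent is skipped:
			 */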
			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
				if (ptr->dev == bucket.inode) {
					data_opts.rewrite_ptrs |= 1U << i;
					if (ptr->cached) {
						bch2_trans_iter_exit(trans, &iter);
						goto next;
					}
				}
				i++;
			}

			ret = bch2_move_extent(ctxt, bucket_in_flight,
					       &iter, k, io_opts, data_opts);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}
			if (ret)
				goto err;

			if (ctxt->stats)
				atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
		} else {
			struct btree *b;

			b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
			ret = PTR_ERR_OR_ZERO(b);
			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!b)
				goto next;

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;

			if (ctxt->rate)
				bch2_ratelimit_increment(ctxt->rate,
							 c->opts.btree_node_size >> 9);
			if (ctxt->stats) {
				atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
				atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
			}
		}
next:
		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

int bch2_evacuate_bucket(struct bch_fs *c,
			 struct bpos bucket, int gen,
			 struct data_update_opts data_opts,
			 struct bch_ratelimit *rate,
			 struct bch_move_stats *stats,
			 struct write_point_specifier wp,
			 bool wait_on_copygc)
{
	struct moving_context ctxt;
	int ret;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
	ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
				struct btree *, struct bch_io_opts *,
				struct data_update_opts *);

static int bch2_move_btree(struct bch_fs *c,
			   enum btree_id start_btree_id, struct bpos start_pos,
			   enum btree_id end_btree_id, struct bpos end_pos,
			   move_btree_pred pred, void *arg,
			   struct bch_move_stats *stats)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct moving_context ctxt;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct btree *b;
	enum btree_id id;
	struct data_update_opts data_opts;
	int ret = 0;

	bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
			      writepoint_ptr(&c->btree_write_point),
			      true);
	trans = ctxt.trans;

	stats->data_type = BCH_DATA_btree;

	for (id = start_btree_id;
	     id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
	     id++) {
		stats->pos = BBPOS(id, POS_MIN);

		if (!bch2_btree_id_root(c, id)->b)
			continue;

		bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
					  BTREE_ITER_PREFETCH);
retry:
		ret = 0;
		while (bch2_trans_begin(trans),
		       (b = bch2_btree_iter_peek_node(&iter)) &&
		       !(ret = PTR_ERR_OR_ZERO(b))) {
			if (kthread && kthread_should_stop())
				break;

			if ((cmp_int(id, end_btree_id) ?:
			     bpos_cmp(b->key.k.p, end_pos)) > 0)
				break;

			stats->pos = BBPOS(iter.btree_id, iter.pos);

			if (!pred(c, arg, b, &io_opts, &data_opts))
				goto next;

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				break;
next:
			bch2_btree_iter_next_node(&iter);
		}
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;

		bch2_trans_iter_exit(trans, &iter);

		if (kthread && kthread_should_stop())
			break;
	}

	bch_err_fn(c, ret);
	bch2_moving_ctxt_exit(&ctxt);
	bch2_btree_interior_updates_flush(c);

	return ret;
}

static bool rereplicate_pred(struct bch_fs *c, void *arg,
			     struct bkey_s_c k,
			     struct bch_io_opts *io_opts,
			     struct data_update_opts *data_opts)
{
	unsigned nr_good = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;

	if (!nr_good || nr_good >= replicas)
		return false;

	data_opts->target		= 0;
	data_opts->extra_replicas	= replicas - nr_good;
	data_opts->btree_insert_flags	= 0;
	return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
			 struct bkey_s_c k,
			 struct bch_io_opts *io_opts,
			 struct data_update_opts *data_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const struct bch_extent_ptr *ptr;
	struct bch_ioctl_data *op = arg;
	unsigned i = 0;

	data_opts->rewrite_ptrs		= 0;
	data_opts->target		= 0;
	data_opts->extra_replicas	= 0;
	data_opts->btree_insert_flags	= 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == op->migrate.dev)
			data_opts->rewrite_ptrs |= 1U << i;
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
			       struct btree *b,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
{
	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool bformat_needs_redo(struct bkey_format *f)
{
	unsigned i;

	for (i = 0; i < f->nr_fields; i++) {
		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
		u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
		u64 field_offset = le64_to_cpu(f->field_offset[i]);

		if (f->bits_per_field[i] > unpacked_bits)
			return true;

		if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
			return true;

		if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
		     unpacked_mask) <
		    field_offset)
			return true;
	}

	return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	if (b->version_ondisk != c->sb.version ||
	    btree_node_need_rewrite(b) ||
	    bformat_needs_redo(&b->format)) {
		data_opts->target		= 0;
		data_opts->extra_replicas	= 0;
		data_opts->btree_insert_flags	= 0;
		return true;
	}

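	/* Node is already in the current on-disk format; nothing to rewrite */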
	return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
	int ret;

	ret = bch2_move_btree(c,
			      0, POS_MIN,
			      BTREE_ID_NR, SPOS_MAX,
			      rewrite_old_nodes_pred, c, stats);
	if (!ret) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	bch_err_fn(c, ret);
	return ret;
}

int bch2_data_job(struct bch_fs *c,
		  struct bch_move_stats *stats,
		  struct bch_ioctl_data op)
{
	int ret = 0;

	switch (op.op) {
	case BCH_DATA_OP_REREPLICATE:
		bch2_move_stats_init(stats, "rereplicate");
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, -1);

		ret = bch2_move_btree(c,
				      op.start_btree, op.start_pos,
				      op.end_btree, op.end_pos,
				      rereplicate_btree_pred, c, stats) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;

		ret = bch2_move_data(c,
				     (struct bbpos) { op.start_btree, op.start_pos },
				     (struct bbpos) { op.end_btree, op.end_pos },
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     rereplicate_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;

		bch2_move_stats_exit(stats, c);
		break;
	case BCH_DATA_OP_MIGRATE:
		if (op.migrate.dev >= c->sb.nr_devices)
			return -EINVAL;

		bch2_move_stats_init(stats, "migrate");
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);

		ret = bch2_move_btree(c,
				      op.start_btree, op.start_pos,
				      op.end_btree, op.end_pos,
				      migrate_btree_pred, &op, stats) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;

		ret = bch2_move_data(c,
				     (struct bbpos) { op.start_btree, op.start_pos },
				     (struct bbpos) { op.end_btree, op.end_pos },
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     migrate_pred, &op) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;

		bch2_move_stats_exit(stats, c);
		break;
	case BCH_DATA_OP_REWRITE_OLD_NODES:
		bch2_move_stats_init(stats, "rewrite_old_nodes");
		ret = bch2_scan_old_btree_nodes(c, stats);
		bch2_move_stats_exit(stats, c);
		break;
	default:
		ret = -EINVAL;
	}

	return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
	prt_printf(out, "%s: data type=%s pos=",
		   stats->name,
		   bch2_data_types[stats->data_type]);
	bch2_bbpos_to_text(out, stats->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_str(out, "keys moved: ");
	prt_u64(out, atomic64_read(&stats->keys_moved));
	prt_newline(out);

	prt_str(out, "keys raced: ");
	prt_u64(out, atomic64_read(&stats->keys_raced));
	prt_newline(out);

	prt_str(out, "bytes seen: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
	prt_newline(out);

	prt_str(out, "bytes moved: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
	prt_newline(out);

	prt_str(out, "bytes raced: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
	struct moving_io *io;

	bch2_move_stats_to_text(out, ctxt->stats);
	printbuf_indent_add(out, 2);

	prt_printf(out, "reads: ios %u/%u sectors %u/%u",
		   atomic_read(&ctxt->read_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->read_sectors),
		   c->opts.move_bytes_in_flight >> 9);
	prt_newline(out);

	prt_printf(out, "writes: ios %u/%u sectors %u/%u",
		   atomic_read(&ctxt->write_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->write_sectors),
		   c->opts.move_bytes_in_flight >> 9);
	prt_newline(out);

	printbuf_indent_add(out, 2);

	mutex_lock(&ctxt->lock);
	list_for_each_entry(io, &ctxt->ios, io_list)
		bch2_write_op_to_text(out, &io->write.op);
	mutex_unlock(&ctxt->lock);

	printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct moving_context *ctxt;

	mutex_lock(&c->moving_context_lock);
	list_for_each_entry(ctxt, &c->moving_context_list, list)
		bch2_moving_ctxt_to_text(out, c, ctxt);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
	INIT_LIST_HEAD(&c->moving_context_list);
	mutex_init(&c->moving_context_lock);
}