xref: /linux/fs/bcachefs/move.c (revision ff0905bbf991f4337b5ebc19c0d43525ebb0d96b)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "alloc_foreground.h"
6 #include "backpointers.h"
7 #include "bkey_buf.h"
8 #include "btree_gc.h"
9 #include "btree_io.h"
10 #include "btree_update.h"
11 #include "btree_update_interior.h"
12 #include "btree_write_buffer.h"
13 #include "compress.h"
14 #include "disk_groups.h"
15 #include "ec.h"
16 #include "errcode.h"
17 #include "error.h"
18 #include "inode.h"
19 #include "io_read.h"
20 #include "io_write.h"
21 #include "journal_reclaim.h"
22 #include "keylist.h"
23 #include "move.h"
24 #include "rebalance.h"
25 #include "reflink.h"
26 #include "replicas.h"
27 #include "snapshot.h"
28 #include "super-io.h"
29 #include "trace.h"
30 
31 #include <linux/ioprio.h>
32 #include <linux/kthread.h>
33 
34 const char * const bch2_data_ops_strs[] = {
35 #define x(t, n, ...) [n] = #t,
36 	BCH_DATA_OPS()
37 #undef x
38 	NULL
39 };
40 
41 struct evacuate_bucket_arg {
42 	struct bpos		bucket;
43 	int			gen;
44 	struct data_update_opts	data_opts;
45 };
46 
47 static bool evacuate_bucket_pred(struct bch_fs *, void *,
48 				 enum btree_id, struct bkey_s_c,
49 				 struct bch_io_opts *,
50 				 struct data_update_opts *);
51 
52 static noinline void
53 trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
54 	       struct bch_io_opts *io_opts,
55 	       struct data_update_opts *data_opts)
56 {
57 	struct printbuf buf = PRINTBUF;
58 
59 	bch2_bkey_val_to_text(&buf, c, k);
60 	prt_newline(&buf);
61 	bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
62 	trace_io_move(c, buf.buf);
63 	printbuf_exit(&buf);
64 }
65 
66 static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
67 {
68 	struct printbuf buf = PRINTBUF;
69 
70 	bch2_bkey_val_to_text(&buf, c, k);
71 	trace_io_move_read(c, buf.buf);
72 	printbuf_exit(&buf);
73 }
74 
75 static noinline void
76 trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
77 		    struct bch_io_opts *io_opts,
78 		    struct data_update_opts *data_opts,
79 		    move_pred_fn pred, void *_arg, bool p)
80 {
81 	struct printbuf buf = PRINTBUF;
82 
83 	prt_printf(&buf, "%ps: %u", pred, p);
84 
85 	if (pred == evacuate_bucket_pred) {
86 		struct evacuate_bucket_arg *arg = _arg;
87 		prt_printf(&buf, " gen=%u", arg->gen);
88 	}
89 
90 	prt_newline(&buf);
91 	bch2_bkey_val_to_text(&buf, c, k);
92 	prt_newline(&buf);
93 	bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
94 	trace_io_move_pred(c, buf.buf);
95 	printbuf_exit(&buf);
96 }
97 
98 static noinline void
99 trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
100 {
101 	struct printbuf buf = PRINTBUF;
102 
103 	prt_printf(&buf, "bucket: ");
104 	bch2_bpos_to_text(&buf, bucket);
105 	prt_printf(&buf, " gen: %i\n", gen);
106 
107 	trace_io_move_evacuate_bucket(c, buf.buf);
108 	printbuf_exit(&buf);
109 }
110 
111 struct moving_io {
112 	struct list_head		read_list;
113 	struct list_head		io_list;
114 	struct move_bucket		*b;
115 	struct closure			cl;
116 	bool				read_completed;
117 
118 	unsigned			read_sectors;
119 	unsigned			write_sectors;
120 
121 	struct data_update		write;
122 };
123 
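/*
 * Tear down a moving_io: drop the in-flight bucket reference, unlink it
 * from the context's io list (waking anyone waiting on ctxt->wait), then
 * release the data update - or, for scrub, only the bio pages and bvecs.
 */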
124 static void move_free(struct moving_io *io)
125 {
126 	struct moving_context *ctxt = io->write.ctxt;
127 
128 	if (io->b)
129 		atomic_dec(&io->b->count);
130 
131 	mutex_lock(&ctxt->lock);
132 	list_del(&io->io_list);
133 	wake_up(&ctxt->wait);
134 	mutex_unlock(&ctxt->lock);
135 
136 	if (!io->write.data_opts.scrub) {
137 		bch2_data_update_exit(&io->write);
138 	} else {
139 		bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
140 		kfree(io->write.bvecs);
141 	}
142 	kfree(io);
143 }
144 
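/*
 * Write completion for a data update issued from move_write(): note any
 * write error on the context, drop the in-flight sector/io counts, free
 * the moving_io and release the closure ref taken when the write was
 * issued.
 */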
145 static void move_write_done(struct bch_write_op *op)
146 {
147 	struct moving_io *io = container_of(op, struct moving_io, write.op);
148 	struct bch_fs *c = op->c;
149 	struct moving_context *ctxt = io->write.ctxt;
150 
151 	if (op->error) {
152 		if (trace_io_move_write_fail_enabled()) {
153 			struct printbuf buf = PRINTBUF;
154 
155 			bch2_write_op_to_text(&buf, op);
156 			trace_io_move_write_fail(c, buf.buf);
157 			printbuf_exit(&buf);
158 		}
159 		this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
160 
161 		ctxt->write_error = true;
162 	}
163 
164 	atomic_sub(io->write_sectors, &ctxt->write_sectors);
165 	atomic_dec(&ctxt->write_ios);
166 	move_free(io);
167 	closure_put(&ctxt->cl);
168 }
169 
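/*
 * Called once the read half of a move completes: account corrected and
 * uncorrected read errors, regenerate the checksum for poisoned extents
 * that hit a checksum error, bail out on other read errors or if this is
 * a scrub-only pass, otherwise hand the buffer to
 * bch2_data_update_read_done() to issue the write.
 */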
170 static void move_write(struct moving_io *io)
171 {
172 	struct bch_fs *c = io->write.op.c;
173 	struct moving_context *ctxt = io->write.ctxt;
174 	struct bch_read_bio *rbio = &io->write.rbio;
175 
176 	if (ctxt->stats) {
177 		if (rbio->bio.bi_status)
178 			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
179 				     &ctxt->stats->sectors_error_uncorrected);
180 		else if (rbio->saw_error)
181 			atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
182 				     &ctxt->stats->sectors_error_corrected);
183 	}
184 
185 	/*
186 	 * If the extent has been bitrotted, we're going to have to give it a
187 	 * new checksum in order to move it - but the poison bit will ensure
188 	 * that userspace still gets the appropriate error.
189 	 */
190 	if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
191 		     (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
192 		struct bch_extent_crc_unpacked crc = rbio->pick.crc;
193 		struct nonce nonce = extent_nonce(rbio->version, crc);
194 
195 		rbio->pick.crc.csum	= bch2_checksum_bio(c, rbio->pick.crc.csum_type,
196 							    nonce, &rbio->bio);
197 		rbio->ret		= 0;
198 	}
199 
200 	if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
201 		move_free(io);
202 		return;
203 	}
204 
205 	if (trace_io_move_write_enabled()) {
206 		struct printbuf buf = PRINTBUF;
207 
208 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
209 		trace_io_move_write(c, buf.buf);
210 		printbuf_exit(&buf);
211 	}
212 
213 	closure_get(&io->write.ctxt->cl);
214 	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
215 	atomic_inc(&io->write.ctxt->write_ios);
216 
217 	bch2_data_update_read_done(&io->write);
218 }
219 
220 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
221 {
222 	struct moving_io *io =
223 		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
224 
225 	return io && io->read_completed ? io : NULL;
226 }
227 
228 static void move_read_endio(struct bio *bio)
229 {
230 	struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
231 	struct moving_context *ctxt = io->write.ctxt;
232 
233 	atomic_sub(io->read_sectors, &ctxt->read_sectors);
234 	atomic_dec(&ctxt->read_ios);
235 	io->read_completed = true;
236 
237 	wake_up(&ctxt->wait);
238 	closure_put(&ctxt->cl);
239 }
240 
241 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
242 {
243 	struct moving_io *io;
244 
245 	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
246 		bch2_trans_unlock_long(ctxt->trans);
247 		list_del(&io->read_list);
248 		move_write(io);
249 	}
250 }
251 
252 void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
253 {
254 	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
255 
256 	move_ctxt_wait_event(ctxt,
257 		!atomic_read(&ctxt->write_sectors) ||
258 		atomic_read(&ctxt->write_sectors) != sectors_pending);
259 }
260 
261 void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
262 {
263 	move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
264 	bch2_trans_unlock_long(ctxt->trans);
265 	closure_sync(&ctxt->cl);
266 }
267 
268 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
269 {
270 	struct bch_fs *c = ctxt->trans->c;
271 
272 	bch2_moving_ctxt_flush_all(ctxt);
273 
274 	EBUG_ON(atomic_read(&ctxt->write_sectors));
275 	EBUG_ON(atomic_read(&ctxt->write_ios));
276 	EBUG_ON(atomic_read(&ctxt->read_sectors));
277 	EBUG_ON(atomic_read(&ctxt->read_ios));
278 
279 	mutex_lock(&c->moving_context_lock);
280 	list_del(&ctxt->list);
281 	mutex_unlock(&c->moving_context_lock);
282 
283 	/*
284 	 * Generally, releasing a transaction within a transaction restart means
285 	 * an unhandled transaction restart: but this can happen legitimately
286 	 * within the move code, e.g. when bch2_move_ratelimit() tells us to
287 	 * exit before we've retried
288 	 */
289 	bch2_trans_begin(ctxt->trans);
290 	bch2_trans_put(ctxt->trans);
291 	memset(ctxt, 0, sizeof(*ctxt));
292 }
293 
294 void bch2_moving_ctxt_init(struct moving_context *ctxt,
295 			   struct bch_fs *c,
296 			   struct bch_ratelimit *rate,
297 			   struct bch_move_stats *stats,
298 			   struct write_point_specifier wp,
299 			   bool wait_on_copygc)
300 {
301 	memset(ctxt, 0, sizeof(*ctxt));
302 
303 	ctxt->trans	= bch2_trans_get(c);
304 	ctxt->fn	= (void *) _RET_IP_;
305 	ctxt->rate	= rate;
306 	ctxt->stats	= stats;
307 	ctxt->wp	= wp;
308 	ctxt->wait_on_copygc = wait_on_copygc;
309 
310 	closure_init_stack(&ctxt->cl);
311 
312 	mutex_init(&ctxt->lock);
313 	INIT_LIST_HEAD(&ctxt->reads);
314 	INIT_LIST_HEAD(&ctxt->ios);
315 	init_waitqueue_head(&ctxt->wait);
316 
317 	mutex_lock(&c->moving_context_lock);
318 	list_add(&ctxt->list, &c->moving_context_list);
319 	mutex_unlock(&c->moving_context_lock);
320 }
321 
322 void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
323 {
324 	trace_move_data(c, stats);
325 }
326 
327 void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
328 {
329 	memset(stats, 0, sizeof(*stats));
330 	stats->data_type = BCH_DATA_user;
331 	scnprintf(stats->name, sizeof(stats->name), "%s", name);
332 }
333 
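/*
 * Queue up a single extent to be moved: allocate a moving_io, set up the
 * data update (or just the read bios, for scrub), account it against the
 * context's in-flight limits and kick off the read; the write is issued
 * later from move_write() once the read completes.
 */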
334 int bch2_move_extent(struct moving_context *ctxt,
335 		     struct move_bucket *bucket_in_flight,
336 		     struct btree_iter *iter,
337 		     struct bkey_s_c k,
338 		     struct bch_io_opts io_opts,
339 		     struct data_update_opts data_opts)
340 {
341 	struct btree_trans *trans = ctxt->trans;
342 	struct bch_fs *c = trans->c;
343 	int ret = -ENOMEM;
344 
345 	if (trace_io_move_enabled())
346 		trace_io_move2(c, k, &io_opts, &data_opts);
347 	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
348 
349 	if (ctxt->stats)
350 		ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
351 
352 	bch2_data_update_opts_normalize(k, &data_opts);
353 
354 	if (!data_opts.rewrite_ptrs &&
355 	    !data_opts.extra_replicas &&
356 	    !data_opts.scrub) {
357 		if (data_opts.kill_ptrs)
358 			return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
359 		return 0;
360 	}
361 
362 	struct moving_io *io = allocate_dropping_locks(trans, ret,
363 				kzalloc(sizeof(struct moving_io), _gfp));
364 	if (!io)
365 		goto err;
366 
367 	if (ret)
368 		goto err_free;
369 
370 	INIT_LIST_HEAD(&io->io_list);
371 	io->write.ctxt		= ctxt;
372 	io->read_sectors	= k.k->size;
373 	io->write_sectors	= k.k->size;
374 
375 	if (!data_opts.scrub) {
376 		ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
377 					    &io_opts, data_opts, iter->btree_id, k);
378 		if (ret)
379 			goto err_free;
380 
381 		io->write.op.end_io	= move_write_done;
382 	} else {
383 		bch2_bkey_buf_init(&io->write.k);
384 		bch2_bkey_buf_reassemble(&io->write.k, c, k);
385 
386 		io->write.op.c		= c;
387 		io->write.data_opts	= data_opts;
388 
389 		bch2_trans_unlock(trans);
390 
391 		ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
392 		if (ret)
393 			goto err_free;
394 	}
395 
396 	io->write.rbio.bio.bi_end_io = move_read_endio;
397 	io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
398 
399 	if (ctxt->rate)
400 		bch2_ratelimit_increment(ctxt->rate, k.k->size);
401 
402 	if (ctxt->stats) {
403 		atomic64_inc(&ctxt->stats->keys_moved);
404 		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
405 	}
406 
407 	if (bucket_in_flight) {
408 		io->b = bucket_in_flight;
409 		atomic_inc(&io->b->count);
410 	}
411 
412 	if (trace_io_move_read_enabled())
413 		trace_io_move_read2(c, k);
414 
415 	mutex_lock(&ctxt->lock);
416 	atomic_add(io->read_sectors, &ctxt->read_sectors);
417 	atomic_inc(&ctxt->read_ios);
418 
419 	list_add_tail(&io->read_list, &ctxt->reads);
420 	list_add_tail(&io->io_list, &ctxt->ios);
421 	mutex_unlock(&ctxt->lock);
422 
423 	/*
424 	 * dropped by move_read_endio() - guards against use after free of
425 	 * ctxt when doing wakeup
426 	 */
427 	closure_get(&ctxt->cl);
428 	__bch2_read_extent(trans, &io->write.rbio,
429 			   io->write.rbio.bio.bi_iter,
430 			   bkey_start_pos(k.k),
431 			   iter->btree_id, k, 0,
432 			   NULL,
433 			   BCH_READ_last_fragment,
434 			   data_opts.scrub ?  data_opts.read_dev : -1);
435 	return 0;
436 err_free:
437 	kfree(io);
438 err:
439 	if (bch2_err_matches(ret, EROFS) ||
440 	    bch2_err_matches(ret, BCH_ERR_transaction_restart))
441 		return ret;
442 
443 	count_event(c, io_move_start_fail);
444 
445 	if (trace_io_move_start_fail_enabled()) {
446 		struct printbuf buf = PRINTBUF;
447 
448 		bch2_bkey_val_to_text(&buf, c, k);
449 		prt_str(&buf, ": ");
450 		prt_str(&buf, bch2_err_str(ret));
451 		trace_io_move_start_fail(c, buf.buf);
452 		printbuf_exit(&buf);
453 	}
454 
455 	if (bch2_err_matches(ret, BCH_ERR_data_update_done))
456 		return 0;
457 	return ret;
458 }
459 
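/*
 * Return the io options in effect for @extent_k: per-inode options are
 * cached per snapshot in @io_opts and rebuilt whenever we cross into a
 * new inode number; the cached entry matching the extent's snapshot (via
 * bch2_snapshot_is_ancestor()) is used, falling back to the
 * filesystem-wide options. The extent's rebalance options are also kept
 * in sync via bch2_get_update_rebalance_opts().
 */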
460 struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
461 			  struct per_snapshot_io_opts *io_opts,
462 			  struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
463 			  struct btree_iter *extent_iter,
464 			  struct bkey_s_c extent_k)
465 {
466 	struct bch_fs *c = trans->c;
467 	u32 restart_count = trans->restart_count;
468 	struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
469 	int ret = 0;
470 
471 	if (extent_iter->min_depth)
472 		return opts_ret;
473 
474 	if (extent_k.k->type == KEY_TYPE_reflink_v)
475 		goto out;
476 
477 	if (io_opts->cur_inum != extent_pos.inode) {
478 		io_opts->d.nr = 0;
479 
480 		ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode),
481 					 BTREE_ITER_all_snapshots, k, ({
482 			if (k.k->p.offset != extent_pos.inode)
483 				break;
484 
485 			if (!bkey_is_inode(k.k))
486 				continue;
487 
488 			struct bch_inode_unpacked inode;
489 			_ret3 = bch2_inode_unpack(k, &inode);
490 			if (_ret3)
491 				break;
492 
493 			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
494 			bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
495 
496 			darray_push(&io_opts->d, e);
497 		}));
498 		io_opts->cur_inum = extent_pos.inode;
499 	}
500 
501 	ret = ret ?: trans_was_restarted(trans, restart_count);
502 	if (ret)
503 		return ERR_PTR(ret);
504 
505 	if (extent_k.k->p.snapshot)
506 		darray_for_each(io_opts->d, i)
507 			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) {
508 				opts_ret = &i->io_opts;
509 				break;
510 			}
511 out:
512 	ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k);
513 	if (ret)
514 		return ERR_PTR(ret);
515 	return opts_ret;
516 }
517 
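/*
 * Single-extent variant of the above: look up the owning inode (unless
 * the key lives in the reflink btree) and derive io options from it,
 * falling back to the filesystem defaults.
 */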
518 int bch2_move_get_io_opts_one(struct btree_trans *trans,
519 			      struct bch_io_opts *io_opts,
520 			      struct btree_iter *extent_iter,
521 			      struct bkey_s_c extent_k)
522 {
523 	struct bch_fs *c = trans->c;
524 
525 	*io_opts = bch2_opts_to_inode_opts(c->opts);
526 
527 	/* reflink btree? */
528 	if (!extent_k.k->p.inode)
529 		goto out;
530 
531 	struct btree_iter inode_iter;
532 	struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
533 			       SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
534 			       BTREE_ITER_cached);
535 	int ret = bkey_err(inode_k);
536 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
537 		return ret;
538 
539 	if (!ret && bkey_is_inode(inode_k.k)) {
540 		struct bch_inode_unpacked inode;
541 		bch2_inode_unpack(inode_k, &inode);
542 		bch2_inode_opts_get(io_opts, c, &inode);
543 	}
544 	bch2_trans_iter_exit(trans, &inode_iter);
545 	/* seem to be spinning here? */
546 out:
547 	return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
548 }
549 
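/*
 * Throttle the move path: optionally wait for copygc to finish, honour
 * the context's rate limiter, and wait until in-flight reads and writes
 * drop below the move_bytes_in_flight and move_ios_in_flight limits.
 * Returns nonzero if the calling kthread should stop.
 */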
550 int bch2_move_ratelimit(struct moving_context *ctxt)
551 {
552 	struct bch_fs *c = ctxt->trans->c;
553 	bool is_kthread = current->flags & PF_KTHREAD;
554 	u64 delay;
555 
556 	if (ctxt->wait_on_copygc && c->copygc_running) {
557 		bch2_moving_ctxt_flush_all(ctxt);
558 		wait_event_killable(c->copygc_running_wq,
559 				    !c->copygc_running ||
560 				    (is_kthread && kthread_should_stop()));
561 	}
562 
563 	do {
564 		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
565 
566 		if (is_kthread && kthread_should_stop())
567 			return 1;
568 
569 		if (delay)
570 			move_ctxt_wait_event_timeout(ctxt,
571 					freezing(current) ||
572 					(is_kthread && kthread_should_stop()),
573 					delay);
574 
575 		if (unlikely(freezing(current))) {
576 			bch2_moving_ctxt_flush_all(ctxt);
577 			try_to_freeze();
578 		}
579 	} while (delay);
580 
581 	/*
582 	 * XXX: these limits really ought to be per device, SSDs and hard drives
583 	 * will want different limits
584 	 */
585 	move_ctxt_wait_event(ctxt,
586 		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
587 		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
588 		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
589 		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
590 
591 	return 0;
592 }
593 
594 /*
595  * Move requires non-extents iterators, and there's also no need for it to
596  * signal indirect_extent_missing_error:
597  */
598 static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
599 					    struct btree_iter *iter,
600 					    struct bkey_s_c_reflink_p p)
601 {
602 	if (unlikely(REFLINK_P_ERROR(p.v)))
603 		return bkey_s_c_null;
604 
605 	struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
606 
607 	bch2_trans_iter_init(trans, iter,
608 			     BTREE_ID_reflink, reflink_pos,
609 			     BTREE_ITER_not_extents);
610 
611 	struct bkey_s_c k = bch2_btree_iter_peek(trans, iter);
612 	if (!k.k || bkey_err(k)) {
613 		bch2_trans_iter_exit(trans, iter);
614 		return k;
615 	}
616 
617 	if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
618 		bch2_trans_iter_exit(trans, iter);
619 		return bkey_s_c_null;
620 	}
621 
622 	return k;
623 }
624 
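/*
 * Walk one btree from @start to @end, calling @pred on each key and
 * moving the extents (or rewriting the btree nodes, for nonzero @level)
 * that it selects. When the range covers a single inode, indirect
 * extents behind its reflink pointers are processed as well.
 */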
625 int bch2_move_data_btree(struct moving_context *ctxt,
626 			 struct bpos start,
627 			 struct bpos end,
628 			 move_pred_fn pred, void *arg,
629 			 enum btree_id btree_id, unsigned level)
630 {
631 	struct btree_trans *trans = ctxt->trans;
632 	struct bch_fs *c = trans->c;
633 	struct per_snapshot_io_opts snapshot_io_opts;
634 	struct bch_io_opts *io_opts;
635 	struct bkey_buf sk;
636 	struct btree_iter iter, reflink_iter = {};
637 	struct bkey_s_c k;
638 	struct data_update_opts data_opts;
639 	/*
640 	 * If we're moving a single file, also process reflinked data it points
641 	 * to (this includes propagating changed io_opts from the inode to the
642 	 * extent):
643 	 */
644 	bool walk_indirect = start.inode == end.inode;
645 	int ret = 0, ret2;
646 
647 	per_snapshot_io_opts_init(&snapshot_io_opts, c);
648 	bch2_bkey_buf_init(&sk);
649 
650 	if (ctxt->stats) {
651 		ctxt->stats->data_type	= BCH_DATA_user;
652 		ctxt->stats->pos	= BBPOS(btree_id, start);
653 	}
654 
655 retry_root:
656 	bch2_trans_begin(trans);
657 
658 	if (level == bch2_btree_id_root(c, btree_id)->level + 1) {
659 		bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1,
660 					  BTREE_ITER_prefetch|
661 					  BTREE_ITER_not_extents|
662 					  BTREE_ITER_all_snapshots);
663 		struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
664 		ret = PTR_ERR_OR_ZERO(b);
665 		if (ret)
666 			goto root_err;
667 
668 		if (b != btree_node_root(c, b)) {
669 			bch2_trans_iter_exit(trans, &iter);
670 			goto retry_root;
671 		}
672 
673 		k = bkey_i_to_s_c(&b->key);
674 
675 		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
676 						iter.pos, &iter, k);
677 		ret = PTR_ERR_OR_ZERO(io_opts);
678 		if (ret)
679 			goto root_err;
680 
681 		memset(&data_opts, 0, sizeof(data_opts));
682 		if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
683 			goto out;
684 
685 
686 		if (!data_opts.scrub)
687 			ret = bch2_btree_node_rewrite_pos(trans, btree_id, level,
688 							  k.k->p, data_opts.target, 0);
689 		else
690 			ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
691 
692 root_err:
693 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
694 			bch2_trans_iter_exit(trans, &iter);
695 			goto retry_root;
696 		}
697 
698 		goto out;
699 	}
700 
701 	bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level,
702 				  BTREE_ITER_prefetch|
703 				  BTREE_ITER_not_extents|
704 				  BTREE_ITER_all_snapshots);
705 
706 	if (ctxt->rate)
707 		bch2_ratelimit_reset(ctxt->rate);
708 
709 	while (!bch2_move_ratelimit(ctxt)) {
710 		struct btree_iter *extent_iter = &iter;
711 
712 		bch2_trans_begin(trans);
713 
714 		k = bch2_btree_iter_peek(trans, &iter);
715 		if (!k.k)
716 			break;
717 
718 		ret = bkey_err(k);
719 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
720 			continue;
721 		if (ret)
722 			break;
723 
724 		if (bkey_gt(bkey_start_pos(k.k), end))
725 			break;
726 
727 		if (ctxt->stats)
728 			ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
729 
730 		if (walk_indirect &&
731 		    k.k->type == KEY_TYPE_reflink_p &&
732 		    REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
733 			struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
734 
735 			bch2_trans_iter_exit(trans, &reflink_iter);
736 			k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
737 			ret = bkey_err(k);
738 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
739 				continue;
740 			if (ret)
741 				break;
742 
743 			if (!k.k)
744 				goto next_nondata;
745 
746 			/*
747 			 * XXX: reflink pointers may point to multiple indirect
748 			 * extents, so don't advance past the entire reflink
749 			 * pointer - need to fixup iter->k
750 			 */
751 			extent_iter = &reflink_iter;
752 		}
753 
754 		if (!bkey_extent_is_direct_data(k.k))
755 			goto next_nondata;
756 
757 		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
758 						iter.pos, extent_iter, k);
759 		ret = PTR_ERR_OR_ZERO(io_opts);
760 		if (ret)
761 			continue;
762 
763 		memset(&data_opts, 0, sizeof(data_opts));
764 		if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
765 			goto next;
766 
767 		/*
768 		 * The iterator gets unlocked by __bch2_read_extent - need to
769 		 * save a copy of @k elsewhere:
770 		 */
771 		bch2_bkey_buf_reassemble(&sk, c, k);
772 		k = bkey_i_to_s_c(sk.k);
773 
774 		if (!level)
775 			ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
776 		else if (!data_opts.scrub)
777 			ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
778 							  k.k->p, data_opts.target, 0);
779 		else
780 			ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
781 
782 		if (ret2) {
783 			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
784 				continue;
785 
786 			if (bch2_err_matches(ret2, ENOMEM)) {
787 				/* memory allocation failure, wait for some IO to finish */
788 				bch2_move_ctxt_wait_for_io(ctxt);
789 				continue;
790 			}
791 
792 			/* XXX signal failure */
793 			goto next;
794 		}
795 next:
796 		if (ctxt->stats)
797 			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
798 next_nondata:
799 		if (!bch2_btree_iter_advance(trans, &iter))
800 			break;
801 	}
802 out:
803 	bch2_trans_iter_exit(trans, &reflink_iter);
804 	bch2_trans_iter_exit(trans, &iter);
805 	bch2_bkey_buf_exit(&sk, c);
806 	per_snapshot_io_opts_exit(&snapshot_io_opts);
807 
808 	return ret;
809 }
810 
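/*
 * Move user data in the given bbpos range: walk every btree that can
 * contain data pointers and hand it to bch2_move_data_btree() at level 0.
 */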
811 int __bch2_move_data(struct moving_context *ctxt,
812 		     struct bbpos start,
813 		     struct bbpos end,
814 		     move_pred_fn pred, void *arg)
815 {
816 	struct bch_fs *c = ctxt->trans->c;
817 	enum btree_id id;
818 	int ret = 0;
819 
820 	for (id = start.btree;
821 	     id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
822 	     id++) {
823 		ctxt->stats->pos = BBPOS(id, POS_MIN);
824 
825 		if (!btree_type_has_ptrs(id) ||
826 		    !bch2_btree_id_root(c, id)->b)
827 			continue;
828 
829 		ret = bch2_move_data_btree(ctxt,
830 				       id == start.btree ? start.pos : POS_MIN,
831 				       id == end.btree   ? end.pos   : POS_MAX,
832 				       pred, arg, id, 0);
833 		if (ret)
834 			break;
835 	}
836 
837 	return ret;
838 }
839 
840 int bch2_move_data(struct bch_fs *c,
841 		   struct bbpos start,
842 		   struct bbpos end,
843 		   struct bch_ratelimit *rate,
844 		   struct bch_move_stats *stats,
845 		   struct write_point_specifier wp,
846 		   bool wait_on_copygc,
847 		   move_pred_fn pred, void *arg)
848 {
849 	struct moving_context ctxt;
850 
851 	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
852 	int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
853 	bch2_moving_ctxt_exit(&ctxt);
854 
855 	return ret;
856 }
857 
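/*
 * Physical move within one device: walk the backpointers for buckets in
 * [@bucket_start, @bucket_end), look up the extent or btree node each
 * backpointer refers to, and move whatever @pred selects. Backpointer
 * mismatches are also checked for each bucket we pass over.
 */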
858 static int __bch2_move_data_phys(struct moving_context *ctxt,
859 			struct move_bucket *bucket_in_flight,
860 			unsigned dev,
861 			u64 bucket_start,
862 			u64 bucket_end,
863 			unsigned data_types,
864 			bool copygc,
865 			move_pred_fn pred, void *arg)
866 {
867 	struct btree_trans *trans = ctxt->trans;
868 	struct bch_fs *c = trans->c;
869 	bool is_kthread = current->flags & PF_KTHREAD;
870 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
871 	struct btree_iter iter = {}, bp_iter = {};
872 	struct bkey_buf sk;
873 	struct bkey_s_c k;
874 	struct bkey_buf last_flushed;
875 	u64 check_mismatch_done = bucket_start;
876 	int ret = 0;
877 
878 	struct bch_dev *ca = bch2_dev_tryget(c, dev);
879 	if (!ca)
880 		return 0;
881 
882 	bucket_end = min(bucket_end, ca->mi.nbuckets);
883 
884 	struct bpos bp_start	= bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
885 	struct bpos bp_end	= bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
886 
887 	bch2_bkey_buf_init(&last_flushed);
888 	bkey_init(&last_flushed.k->k);
889 	bch2_bkey_buf_init(&sk);
890 
891 	/*
892 	 * We're not run in a context that handles transaction restarts:
893 	 */
894 	bch2_trans_begin(trans);
895 
896 	bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
897 
898 	ret = bch2_btree_write_buffer_tryflush(trans);
899 	if (!bch2_err_matches(ret, EROFS))
900 		bch_err_msg(c, ret, "flushing btree write buffer");
901 	if (ret)
902 		goto err;
903 
904 	while (!(ret = bch2_move_ratelimit(ctxt))) {
905 		if (is_kthread && kthread_should_stop())
906 			break;
907 
908 		bch2_trans_begin(trans);
909 
910 		k = bch2_btree_iter_peek(trans, &bp_iter);
911 		ret = bkey_err(k);
912 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
913 			continue;
914 		if (ret)
915 			goto err;
916 
917 		if (!k.k || bkey_gt(k.k->p, bp_end))
918 			break;
919 
920 		if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
921 			while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
922 				bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
923 								       copygc, &last_flushed);
924 			}
925 			continue;
926 		}
927 
928 		if (k.k->type != KEY_TYPE_backpointer)
929 			goto next;
930 
931 		struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
932 
933 		if (ctxt->stats)
934 			ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
935 
936 		if (!(data_types & BIT(bp.v->data_type)))
937 			goto next;
938 
939 		if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
940 			goto next;
941 
942 		k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
943 		ret = bkey_err(k);
944 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
945 			continue;
946 		if (ret)
947 			goto err;
948 		if (!k.k)
949 			goto next;
950 
951 		if (!bp.v->level) {
952 			ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
953 			if (ret) {
954 				bch2_trans_iter_exit(trans, &iter);
955 				continue;
956 			}
957 		}
958 
959 		struct data_update_opts data_opts = {};
960 		bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts);
961 
962 		if (trace_io_move_pred_enabled())
963 			trace_io_move_pred2(c, k, &io_opts, &data_opts,
964 					    pred, arg, p);
965 
966 		if (!p) {
967 			bch2_trans_iter_exit(trans, &iter);
968 			goto next;
969 		}
970 
971 		if (data_opts.scrub &&
972 		    !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
973 			bch2_trans_iter_exit(trans, &iter);
974 			ret = bch_err_throw(c, device_offline);
975 			break;
976 		}
977 
978 		bch2_bkey_buf_reassemble(&sk, c, k);
979 		k = bkey_i_to_s_c(sk.k);
980 
981 		/* move_extent will drop locks */
982 		unsigned sectors = bp.v->bucket_len;
983 
984 		if (!bp.v->level)
985 			ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
986 		else if (!data_opts.scrub)
987 			ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level,
988 							  k.k->p, data_opts.target, 0);
989 		else
990 			ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
991 
992 		bch2_trans_iter_exit(trans, &iter);
993 
994 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
995 			continue;
996 		if (ret == -ENOMEM) {
997 			/* memory allocation failure, wait for some IO to finish */
998 			bch2_move_ctxt_wait_for_io(ctxt);
999 			continue;
1000 		}
1001 		if (ret)
1002 			goto err;
1003 
1004 		if (ctxt->stats)
1005 			atomic64_add(sectors, &ctxt->stats->sectors_seen);
1006 next:
1007 		bch2_btree_iter_advance(trans, &bp_iter);
1008 	}
1009 
1010 	while (check_mismatch_done < bucket_end)
1011 		bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
1012 						       copygc, &last_flushed);
1013 err:
1014 	bch2_trans_iter_exit(trans, &bp_iter);
1015 	bch2_bkey_buf_exit(&sk, c);
1016 	bch2_bkey_buf_exit(&last_flushed, c);
1017 	bch2_dev_put(ca);
1018 	return ret;
1019 }
1020 
1021 int bch2_move_data_phys(struct bch_fs *c,
1022 			unsigned dev,
1023 			u64 start,
1024 			u64 end,
1025 			unsigned data_types,
1026 			struct bch_ratelimit *rate,
1027 			struct bch_move_stats *stats,
1028 			struct write_point_specifier wp,
1029 			bool wait_on_copygc,
1030 			move_pred_fn pred, void *arg)
1031 {
1032 	struct moving_context ctxt;
1033 
1034 	bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
1035 
1036 	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
1037 	if (ctxt.stats) {
1038 		ctxt.stats->phys = true;
1039 		ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
1040 	}
1041 
1042 	int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end,
1043 					data_types, false, pred, arg);
1044 	bch2_moving_ctxt_exit(&ctxt);
1045 
1046 	return ret;
1047 }
1048 
1049 static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
1050 				 enum btree_id btree, struct bkey_s_c k,
1051 				 struct bch_io_opts *io_opts,
1052 				 struct data_update_opts *data_opts)
1053 {
1054 	struct evacuate_bucket_arg *arg = _arg;
1055 
1056 	*data_opts = arg->data_opts;
1057 
1058 	unsigned i = 0;
1059 	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
1060 		if (ptr->dev == arg->bucket.inode &&
1061 		    (arg->gen < 0 || arg->gen == ptr->gen) &&
1062 		    !ptr->cached)
1063 			data_opts->rewrite_ptrs |= BIT(i);
1064 		i++;
1065 	}
1066 
1067 	return data_opts->rewrite_ptrs != 0;
1068 }
1069 
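/*
 * Evacuate a single bucket: rewrite every non-cached pointer that points
 * into it (and matches @gen, if @gen is nonnegative) using the physical
 * move path above.
 */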
1070 int bch2_evacuate_bucket(struct moving_context *ctxt,
1071 			 struct move_bucket *bucket_in_flight,
1072 			 struct bpos bucket, int gen,
1073 			 struct data_update_opts data_opts)
1074 {
1075 	struct bch_fs *c = ctxt->trans->c;
1076 	struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
1077 
1078 	count_event(c, io_move_evacuate_bucket);
1079 	if (trace_io_move_evacuate_bucket_enabled())
1080 		trace_io_move_evacuate_bucket2(c, bucket, gen);
1081 
1082 	return __bch2_move_data_phys(ctxt, bucket_in_flight,
1083 				   bucket.inode,
1084 				   bucket.offset,
1085 				   bucket.offset + 1,
1086 				   ~0,
1087 				   true,
1088 				   evacuate_bucket_pred, &arg);
1089 }
1090 
1091 typedef bool (*move_btree_pred)(struct bch_fs *, void *,
1092 				struct btree *, struct bch_io_opts *,
1093 				struct data_update_opts *);
1094 
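/*
 * Walk btree nodes (rather than extents) in the given range and rewrite
 * the ones @pred selects; used for rereplication, dropping extra
 * replicas, and rewriting nodes in old on-disk formats.
 */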
1095 static int bch2_move_btree(struct bch_fs *c,
1096 			   struct bbpos start,
1097 			   struct bbpos end,
1098 			   move_btree_pred pred, void *arg,
1099 			   struct bch_move_stats *stats)
1100 {
1101 	bool kthread = (current->flags & PF_KTHREAD) != 0;
1102 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
1103 	struct moving_context ctxt;
1104 	struct btree_trans *trans;
1105 	struct btree_iter iter;
1106 	struct btree *b;
1107 	enum btree_id btree;
1108 	struct data_update_opts data_opts;
1109 	int ret = 0;
1110 
1111 	bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
1112 			      writepoint_ptr(&c->btree_write_point),
1113 			      true);
1114 	trans = ctxt.trans;
1115 
1116 	stats->data_type = BCH_DATA_btree;
1117 
1118 	for (btree = start.btree;
1119 	     btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
1120 	     btree ++) {
1121 		stats->pos = BBPOS(btree, POS_MIN);
1122 
1123 		if (!bch2_btree_id_root(c, btree)->b)
1124 			continue;
1125 
1126 		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
1127 					  BTREE_ITER_prefetch);
1128 retry:
1129 		ret = 0;
1130 		while (bch2_trans_begin(trans),
1131 		       (b = bch2_btree_iter_peek_node(trans, &iter)) &&
1132 		       !(ret = PTR_ERR_OR_ZERO(b))) {
1133 			if (kthread && kthread_should_stop())
1134 				break;
1135 
1136 			if ((cmp_int(btree, end.btree) ?:
1137 			     bpos_cmp(b->key.k.p, end.pos)) > 0)
1138 				break;
1139 
1140 			stats->pos = BBPOS(iter.btree_id, iter.pos);
1141 
1142 			if (!pred(c, arg, b, &io_opts, &data_opts))
1143 				goto next;
1144 
1145 			ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret;
1146 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1147 				continue;
1148 			if (ret)
1149 				break;
1150 next:
1151 			bch2_btree_iter_next_node(trans, &iter);
1152 		}
1153 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1154 			goto retry;
1155 
1156 		bch2_trans_iter_exit(trans, &iter);
1157 
1158 		if (kthread && kthread_should_stop())
1159 			break;
1160 	}
1161 
1162 	bch_err_fn(c, ret);
1163 	bch2_moving_ctxt_exit(&ctxt);
1164 	bch2_btree_interior_updates_flush(c);
1165 
1166 	return ret;
1167 }
1168 
1169 static bool rereplicate_pred(struct bch_fs *c, void *arg,
1170 			     enum btree_id btree, struct bkey_s_c k,
1171 			     struct bch_io_opts *io_opts,
1172 			     struct data_update_opts *data_opts)
1173 {
1174 	unsigned nr_good = bch2_bkey_durability(c, k);
1175 	unsigned replicas = bkey_is_btree_ptr(k.k)
1176 		? c->opts.metadata_replicas
1177 		: io_opts->data_replicas;
1178 
1179 	guard(rcu)();
1180 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1181 	unsigned i = 0;
1182 	bkey_for_each_ptr(ptrs, ptr) {
1183 		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
1184 		if (!ptr->cached &&
1185 		    (!ca || !ca->mi.durability))
1186 			data_opts->kill_ptrs |= BIT(i);
1187 		i++;
1188 	}
1189 
1190 	if (!data_opts->kill_ptrs &&
1191 	    (!nr_good || nr_good >= replicas))
1192 		return false;
1193 
1194 	data_opts->target		= 0;
1195 	data_opts->extra_replicas	= replicas - nr_good;
1196 	data_opts->btree_insert_flags	= 0;
1197 	return true;
1198 }
1199 
1200 static bool migrate_pred(struct bch_fs *c, void *arg,
1201 			 enum btree_id btree, struct bkey_s_c k,
1202 			 struct bch_io_opts *io_opts,
1203 			 struct data_update_opts *data_opts)
1204 {
1205 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1206 	struct bch_ioctl_data *op = arg;
1207 	unsigned i = 0;
1208 
1209 	data_opts->rewrite_ptrs		= 0;
1210 	data_opts->target		= 0;
1211 	data_opts->extra_replicas	= 0;
1212 	data_opts->btree_insert_flags	= 0;
1213 
1214 	bkey_for_each_ptr(ptrs, ptr) {
1215 		if (ptr->dev == op->migrate.dev)
1216 			data_opts->rewrite_ptrs |= 1U << i;
1217 		i++;
1218 	}
1219 
1220 	return data_opts->rewrite_ptrs != 0;
1221 }
1222 
1223 static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
1224 				   struct btree *b,
1225 				   struct bch_io_opts *io_opts,
1226 				   struct data_update_opts *data_opts)
1227 {
1228 	return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
1229 }
1230 
1231 /*
1232  * Ancient versions of bcachefs produced packed formats which could represent
1233  * keys that the in memory format cannot represent; this checks for those
1234  * formats so we can get rid of them.
1235  */
1236 static bool bformat_needs_redo(struct bkey_format *f)
1237 {
1238 	for (unsigned i = 0; i < f->nr_fields; i++)
1239 		if (bch2_bkey_format_field_overflows(f, i))
1240 			return true;
1241 
1242 	return false;
1243 }
1244 
1245 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
1246 				   struct btree *b,
1247 				   struct bch_io_opts *io_opts,
1248 				   struct data_update_opts *data_opts)
1249 {
1250 	if (b->version_ondisk != c->sb.version ||
1251 	    btree_node_need_rewrite(b) ||
1252 	    bformat_needs_redo(&b->format)) {
1253 		data_opts->target		= 0;
1254 		data_opts->extra_replicas	= 0;
1255 		data_opts->btree_insert_flags	= 0;
1256 		return true;
1257 	}
1258 
1259 	return false;
1260 }
1261 
1262 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
1263 {
1264 	int ret;
1265 
1266 	ret = bch2_move_btree(c,
1267 			      BBPOS_MIN,
1268 			      BBPOS_MAX,
1269 			      rewrite_old_nodes_pred, c, stats);
1270 	if (!ret) {
1271 		mutex_lock(&c->sb_lock);
1272 		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
1273 		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
1274 		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
1275 		bch2_write_super(c);
1276 		mutex_unlock(&c->sb_lock);
1277 	}
1278 
1279 	bch_err_fn(c, ret);
1280 	return ret;
1281 }
1282 
1283 static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
1284 			     enum btree_id btree, struct bkey_s_c k,
1285 			     struct bch_io_opts *io_opts,
1286 			     struct data_update_opts *data_opts)
1287 {
1288 	unsigned durability = bch2_bkey_durability(c, k);
1289 	unsigned replicas = bkey_is_btree_ptr(k.k)
1290 		? c->opts.metadata_replicas
1291 		: io_opts->data_replicas;
1292 	const union bch_extent_entry *entry;
1293 	struct extent_ptr_decoded p;
1294 	unsigned i = 0;
1295 
1296 	guard(rcu)();
1297 	bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
1298 		unsigned d = bch2_extent_ptr_durability(c, &p);
1299 
1300 		if (d && durability - d >= replicas) {
1301 			data_opts->kill_ptrs |= BIT(i);
1302 			durability -= d;
1303 		}
1304 
1305 		i++;
1306 	}
1307 
1308 	return data_opts->kill_ptrs != 0;
1309 }
1310 
1311 static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
1312 				   struct btree *b,
1313 				   struct bch_io_opts *io_opts,
1314 				   struct data_update_opts *data_opts)
1315 {
1316 	return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key),
1317 					io_opts, data_opts);
1318 }
1319 
1320 static bool scrub_pred(struct bch_fs *c, void *_arg,
1321 		       enum btree_id btree, struct bkey_s_c k,
1322 		       struct bch_io_opts *io_opts,
1323 		       struct data_update_opts *data_opts)
1324 {
1325 	struct bch_ioctl_data *arg = _arg;
1326 
1327 	if (k.k->type != KEY_TYPE_btree_ptr_v2) {
1328 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1329 		const union bch_extent_entry *entry;
1330 		struct extent_ptr_decoded p;
1331 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
1332 			if (p.ptr.dev == arg->migrate.dev) {
1333 				if (!p.crc.csum_type)
1334 					return false;
1335 				break;
1336 			}
1337 	}
1338 
1339 	data_opts->scrub	= true;
1340 	data_opts->read_dev	= arg->migrate.dev;
1341 	return true;
1342 }
1343 
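/*
 * Dispatch a userspace data job (struct bch_ioctl_data) to the
 * appropriate operation: scrub, rereplicate, migrate, rewrite_old_nodes
 * or drop_extra_replicas.
 */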
1344 int bch2_data_job(struct bch_fs *c,
1345 		  struct bch_move_stats *stats,
1346 		  struct bch_ioctl_data op)
1347 {
1348 	struct bbpos start	= BBPOS(op.start_btree, op.start_pos);
1349 	struct bbpos end	= BBPOS(op.end_btree, op.end_pos);
1350 	int ret = 0;
1351 
1352 	if (op.op >= BCH_DATA_OP_NR)
1353 		return -EINVAL;
1354 
1355 	bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
1356 
1357 	switch (op.op) {
1358 	case BCH_DATA_OP_scrub:
1359 		/*
1360 		 * prevent tests from spuriously failing, make sure we see all
1361 		 * btree nodes that need to be repaired
1362 		 */
1363 		bch2_btree_interior_updates_flush(c);
1364 
1365 		ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
1366 					  op.scrub.data_types,
1367 					  NULL,
1368 					  stats,
1369 					  writepoint_hashed((unsigned long) current),
1370 					  false,
1371 					  scrub_pred, &op) ?: ret;
1372 		break;
1373 
1374 	case BCH_DATA_OP_rereplicate:
1375 		stats->data_type = BCH_DATA_journal;
1376 		ret = bch2_journal_flush_device_pins(&c->journal, -1);
1377 		ret = bch2_move_btree(c, start, end,
1378 				      rereplicate_btree_pred, c, stats) ?: ret;
1379 		ret = bch2_move_data(c, start, end,
1380 				     NULL,
1381 				     stats,
1382 				     writepoint_hashed((unsigned long) current),
1383 				     true,
1384 				     rereplicate_pred, c) ?: ret;
1385 		ret = bch2_replicas_gc2(c) ?: ret;
1386 		break;
1387 	case BCH_DATA_OP_migrate:
1388 		if (op.migrate.dev >= c->sb.nr_devices)
1389 			return -EINVAL;
1390 
1391 		stats->data_type = BCH_DATA_journal;
1392 		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
1393 		ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
1394 					  ~0,
1395 					  NULL,
1396 					  stats,
1397 					  writepoint_hashed((unsigned long) current),
1398 					  true,
1399 					  migrate_pred, &op) ?: ret;
1400 		bch2_btree_interior_updates_flush(c);
1401 		ret = bch2_replicas_gc2(c) ?: ret;
1402 		break;
1403 	case BCH_DATA_OP_rewrite_old_nodes:
1404 		ret = bch2_scan_old_btree_nodes(c, stats);
1405 		break;
1406 	case BCH_DATA_OP_drop_extra_replicas:
1407 		ret = bch2_move_btree(c, start, end,
1408 				drop_extra_replicas_btree_pred, c, stats) ?: ret;
1409 		ret = bch2_move_data(c, start, end, NULL, stats,
1410 				writepoint_hashed((unsigned long) current),
1411 				true,
1412 				drop_extra_replicas_pred, c) ?: ret;
1413 		ret = bch2_replicas_gc2(c) ?: ret;
1414 		break;
1415 	default:
1416 		ret = -EINVAL;
1417 	}
1418 
1419 	bch2_move_stats_exit(stats, c);
1420 	return ret;
1421 }
1422 
1423 void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
1424 {
1425 	prt_printf(out, "%s: data type=", stats->name);
1426 	bch2_prt_data_type(out, stats->data_type);
1427 	prt_str(out, " pos=");
1428 	bch2_bbpos_to_text(out, stats->pos);
1429 	prt_newline(out);
1430 	printbuf_indent_add(out, 2);
1431 
1432 	prt_printf(out, "keys moved:\t%llu\n",	atomic64_read(&stats->keys_moved));
1433 	prt_printf(out, "keys raced:\t%llu\n",	atomic64_read(&stats->keys_raced));
1434 	prt_printf(out, "bytes seen:\t");
1435 	prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
1436 	prt_newline(out);
1437 
1438 	prt_printf(out, "bytes moved:\t");
1439 	prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
1440 	prt_newline(out);
1441 
1442 	prt_printf(out, "bytes raced:\t");
1443 	prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
1444 	prt_newline(out);
1445 
1446 	printbuf_indent_sub(out, 2);
1447 }
1448 
1449 static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
1450 {
1451 	if (!out->nr_tabstops)
1452 		printbuf_tabstop_push(out, 32);
1453 
1454 	bch2_move_stats_to_text(out, ctxt->stats);
1455 	printbuf_indent_add(out, 2);
1456 
1457 	prt_printf(out, "reads: ios %u/%u sectors %u/%u\n",
1458 		   atomic_read(&ctxt->read_ios),
1459 		   c->opts.move_ios_in_flight,
1460 		   atomic_read(&ctxt->read_sectors),
1461 		   c->opts.move_bytes_in_flight >> 9);
1462 
1463 	prt_printf(out, "writes: ios %u/%u sectors %u/%u\n",
1464 		   atomic_read(&ctxt->write_ios),
1465 		   c->opts.move_ios_in_flight,
1466 		   atomic_read(&ctxt->write_sectors),
1467 		   c->opts.move_bytes_in_flight >> 9);
1468 
1469 	printbuf_indent_add(out, 2);
1470 
1471 	mutex_lock(&ctxt->lock);
1472 	struct moving_io *io;
1473 	list_for_each_entry(io, &ctxt->ios, io_list)
1474 		bch2_data_update_inflight_to_text(out, &io->write);
1475 	mutex_unlock(&ctxt->lock);
1476 
1477 	printbuf_indent_sub(out, 4);
1478 }
1479 
1480 void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
1481 {
1482 	struct moving_context *ctxt;
1483 
1484 	mutex_lock(&c->moving_context_lock);
1485 	list_for_each_entry(ctxt, &c->moving_context_list, list)
1486 		bch2_moving_ctxt_to_text(out, c, ctxt);
1487 	mutex_unlock(&c->moving_context_lock);
1488 }
1489 
1490 void bch2_fs_move_init(struct bch_fs *c)
1491 {
1492 	INIT_LIST_HEAD(&c->moving_context_list);
1493 	mutex_init(&c->moving_context_lock);
1494 }
1495