xref: /linux/fs/bcachefs/move.c (revision f82811e22b480a203a438d8e1f29af9c93ccbb0c)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "alloc_foreground.h"
6 #include "backpointers.h"
7 #include "bkey_buf.h"
8 #include "btree_gc.h"
9 #include "btree_io.h"
10 #include "btree_update.h"
11 #include "btree_update_interior.h"
12 #include "btree_write_buffer.h"
13 #include "compress.h"
14 #include "disk_groups.h"
15 #include "ec.h"
16 #include "errcode.h"
17 #include "error.h"
18 #include "inode.h"
19 #include "io_read.h"
20 #include "io_write.h"
21 #include "journal_reclaim.h"
22 #include "keylist.h"
23 #include "move.h"
24 #include "replicas.h"
25 #include "snapshot.h"
26 #include "super-io.h"
27 #include "trace.h"
28 
29 #include <linux/ioprio.h>
30 #include <linux/kthread.h>
31 
32 const char * const bch2_data_ops_strs[] = {
33 #define x(t, n, ...) [n] = #t,
34 	BCH_DATA_OPS()
35 #undef x
36 	NULL
37 };
38 
39 static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
40 					  struct bch_io_opts *io_opts,
41 					  struct data_update_opts *data_opts)
42 {
43 	printbuf_tabstop_push(out, 20);
44 	prt_str(out, "rewrite ptrs:");
45 	prt_tab(out);
46 	bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
47 	prt_newline(out);
48 
49 	prt_str(out, "kill ptrs: ");
50 	prt_tab(out);
51 	bch2_prt_u64_base2(out, data_opts->kill_ptrs);
52 	prt_newline(out);
53 
54 	prt_str(out, "target: ");
55 	prt_tab(out);
56 	bch2_target_to_text(out, c, data_opts->target);
57 	prt_newline(out);
58 
59 	prt_str(out, "compression: ");
60 	prt_tab(out);
61 	bch2_compression_opt_to_text(out, background_compression(*io_opts));
62 	prt_newline(out);
63 
64 	prt_str(out, "extra replicas: ");
65 	prt_tab(out);
66 	prt_u64(out, data_opts->extra_replicas);
67 }
68 
69 static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
70 			       struct bch_io_opts *io_opts,
71 			       struct data_update_opts *data_opts)
72 {
73 	if (trace_move_extent_enabled()) {
74 		struct printbuf buf = PRINTBUF;
75 
76 		bch2_bkey_val_to_text(&buf, c, k);
77 		prt_newline(&buf);
78 		bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
79 		trace_move_extent(c, buf.buf);
80 		printbuf_exit(&buf);
81 	}
82 }
83 
84 static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
85 {
86 	if (trace_move_extent_read_enabled()) {
87 		struct printbuf buf = PRINTBUF;
88 
89 		bch2_bkey_val_to_text(&buf, c, k);
90 		trace_move_extent_read(c, buf.buf);
91 		printbuf_exit(&buf);
92 	}
93 }
94 
95 struct moving_io {
96 	struct list_head		read_list;
97 	struct list_head		io_list;
98 	struct move_bucket_in_flight	*b;
99 	struct closure			cl;
100 	bool				read_completed;
101 
102 	unsigned			read_sectors;
103 	unsigned			write_sectors;
104 
105 	struct bch_read_bio		rbio;
106 
107 	struct data_update		write;
108 	/* Must be last since it is variable size */
109 	struct bio_vec			bi_inline_vecs[];
110 };
111 
112 static void move_free(struct moving_io *io)
113 {
114 	struct moving_context *ctxt = io->write.ctxt;
115 
116 	if (io->b)
117 		atomic_dec(&io->b->count);
118 
119 	bch2_data_update_exit(&io->write);
120 
121 	mutex_lock(&ctxt->lock);
122 	list_del(&io->io_list);
123 	wake_up(&ctxt->wait);
124 	mutex_unlock(&ctxt->lock);
125 
126 	kfree(io);
127 }
128 
129 static void move_write_done(struct bch_write_op *op)
130 {
131 	struct moving_io *io = container_of(op, struct moving_io, write.op);
132 	struct moving_context *ctxt = io->write.ctxt;
133 
134 	if (io->write.op.error)
135 		ctxt->write_error = true;
136 
137 	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
138 	atomic_dec(&io->write.ctxt->write_ios);
139 	move_free(io);
140 	closure_put(&ctxt->cl);
141 }
142 
143 static void move_write(struct moving_io *io)
144 {
145 	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
146 		move_free(io);
147 		return;
148 	}
149 
150 	if (trace_move_extent_write_enabled()) {
151 		struct bch_fs *c = io->write.op.c;
152 		struct printbuf buf = PRINTBUF;
153 
154 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
155 		trace_move_extent_write(c, buf.buf);
156 		printbuf_exit(&buf);
157 	}
158 
159 	closure_get(&io->write.ctxt->cl);
160 	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
161 	atomic_inc(&io->write.ctxt->write_ios);
162 
163 	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
164 }
165 
166 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
167 {
168 	struct moving_io *io =
169 		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
170 
171 	return io && io->read_completed ? io : NULL;
172 }
173 
174 static void move_read_endio(struct bio *bio)
175 {
176 	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
177 	struct moving_context *ctxt = io->write.ctxt;
178 
179 	atomic_sub(io->read_sectors, &ctxt->read_sectors);
180 	atomic_dec(&ctxt->read_ios);
181 	io->read_completed = true;
182 
183 	wake_up(&ctxt->wait);
184 	closure_put(&ctxt->cl);
185 }
186 
187 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
188 {
189 	struct moving_io *io;
190 
191 	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
192 		bch2_trans_unlock_long(ctxt->trans);
193 		list_del(&io->read_list);
194 		move_write(io);
195 	}
196 }
197 
198 void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
199 {
200 	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
201 
202 	move_ctxt_wait_event(ctxt,
203 		!atomic_read(&ctxt->write_sectors) ||
204 		atomic_read(&ctxt->write_sectors) != sectors_pending);
205 }
206 
207 void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
208 {
209 	move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
210 	bch2_trans_unlock_long(ctxt->trans);
211 	closure_sync(&ctxt->cl);
212 }
213 
214 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
215 {
216 	struct bch_fs *c = ctxt->trans->c;
217 
218 	bch2_moving_ctxt_flush_all(ctxt);
219 
220 	EBUG_ON(atomic_read(&ctxt->write_sectors));
221 	EBUG_ON(atomic_read(&ctxt->write_ios));
222 	EBUG_ON(atomic_read(&ctxt->read_sectors));
223 	EBUG_ON(atomic_read(&ctxt->read_ios));
224 
225 	mutex_lock(&c->moving_context_lock);
226 	list_del(&ctxt->list);
227 	mutex_unlock(&c->moving_context_lock);
228 
229 	bch2_trans_put(ctxt->trans);
230 	memset(ctxt, 0, sizeof(*ctxt));
231 }
232 
233 void bch2_moving_ctxt_init(struct moving_context *ctxt,
234 			   struct bch_fs *c,
235 			   struct bch_ratelimit *rate,
236 			   struct bch_move_stats *stats,
237 			   struct write_point_specifier wp,
238 			   bool wait_on_copygc)
239 {
240 	memset(ctxt, 0, sizeof(*ctxt));
241 
242 	ctxt->trans	= bch2_trans_get(c);
243 	ctxt->fn	= (void *) _RET_IP_;
244 	ctxt->rate	= rate;
245 	ctxt->stats	= stats;
246 	ctxt->wp	= wp;
247 	ctxt->wait_on_copygc = wait_on_copygc;
248 
249 	closure_init_stack(&ctxt->cl);
250 
251 	mutex_init(&ctxt->lock);
252 	INIT_LIST_HEAD(&ctxt->reads);
253 	INIT_LIST_HEAD(&ctxt->ios);
254 	init_waitqueue_head(&ctxt->wait);
255 
256 	mutex_lock(&c->moving_context_lock);
257 	list_add(&ctxt->list, &c->moving_context_list);
258 	mutex_unlock(&c->moving_context_lock);
259 }
260 
/* Finish a move job's statistics: reports them via the move_data tracepoint. */
void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
	trace_move_data(c, stats);
}
265 
266 void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
267 {
268 	memset(stats, 0, sizeof(*stats));
269 	stats->data_type = BCH_DATA_user;
270 	scnprintf(stats->name, sizeof(stats->name), "%s", name);
271 }
272 
273 int bch2_move_extent(struct moving_context *ctxt,
274 		     struct move_bucket_in_flight *bucket_in_flight,
275 		     struct btree_iter *iter,
276 		     struct bkey_s_c k,
277 		     struct bch_io_opts io_opts,
278 		     struct data_update_opts data_opts)
279 {
280 	struct btree_trans *trans = ctxt->trans;
281 	struct bch_fs *c = trans->c;
282 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
283 	struct moving_io *io;
284 	const union bch_extent_entry *entry;
285 	struct extent_ptr_decoded p;
286 	unsigned sectors = k.k->size, pages;
287 	int ret = -ENOMEM;
288 
289 	trace_move_extent2(c, k, &io_opts, &data_opts);
290 
291 	if (ctxt->stats)
292 		ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
293 
294 	bch2_data_update_opts_normalize(k, &data_opts);
295 
296 	if (!data_opts.rewrite_ptrs &&
297 	    !data_opts.extra_replicas) {
298 		if (data_opts.kill_ptrs)
299 			return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
300 		return 0;
301 	}
302 
303 	/*
304 	 * Before memory allocations & taking nocow locks in
305 	 * bch2_data_update_init():
306 	 */
307 	bch2_trans_unlock(trans);
308 
309 	/* write path might have to decompress data: */
310 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
311 		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
312 
313 	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
314 	io = kzalloc(sizeof(struct moving_io) +
315 		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
316 	if (!io)
317 		goto err;
318 
319 	INIT_LIST_HEAD(&io->io_list);
320 	io->write.ctxt		= ctxt;
321 	io->read_sectors	= k.k->size;
322 	io->write_sectors	= k.k->size;
323 
324 	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
325 	bio_set_prio(&io->write.op.wbio.bio,
326 		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
327 
328 	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
329 				 GFP_KERNEL))
330 		goto err_free;
331 
332 	io->rbio.c		= c;
333 	io->rbio.opts		= io_opts;
334 	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
335 	io->rbio.bio.bi_vcnt = pages;
336 	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
337 	io->rbio.bio.bi_iter.bi_size = sectors << 9;
338 
339 	io->rbio.bio.bi_opf		= REQ_OP_READ;
340 	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
341 	io->rbio.bio.bi_end_io		= move_read_endio;
342 
343 	ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
344 				    io_opts, data_opts, iter->btree_id, k);
345 	if (ret)
346 		goto err_free_pages;
347 
348 	io->write.op.end_io = move_write_done;
349 
350 	if (ctxt->rate)
351 		bch2_ratelimit_increment(ctxt->rate, k.k->size);
352 
353 	if (ctxt->stats) {
354 		atomic64_inc(&ctxt->stats->keys_moved);
355 		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
356 	}
357 
358 	if (bucket_in_flight) {
359 		io->b = bucket_in_flight;
360 		atomic_inc(&io->b->count);
361 	}
362 
363 	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
364 	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
365 	trace_move_extent_read2(c, k);
366 
367 	mutex_lock(&ctxt->lock);
368 	atomic_add(io->read_sectors, &ctxt->read_sectors);
369 	atomic_inc(&ctxt->read_ios);
370 
371 	list_add_tail(&io->read_list, &ctxt->reads);
372 	list_add_tail(&io->io_list, &ctxt->ios);
373 	mutex_unlock(&ctxt->lock);
374 
375 	/*
376 	 * dropped by move_read_endio() - guards against use after free of
377 	 * ctxt when doing wakeup
378 	 */
379 	closure_get(&ctxt->cl);
380 	bch2_read_extent(trans, &io->rbio,
381 			 bkey_start_pos(k.k),
382 			 iter->btree_id, k, 0,
383 			 BCH_READ_NODECODE|
384 			 BCH_READ_LAST_FRAGMENT);
385 	return 0;
386 err_free_pages:
387 	bio_free_pages(&io->write.op.wbio.bio);
388 err_free:
389 	kfree(io);
390 err:
391 	if (ret == -BCH_ERR_data_update_done)
392 		return 0;
393 
394 	if (bch2_err_matches(ret, EROFS) ||
395 	    bch2_err_matches(ret, BCH_ERR_transaction_restart))
396 		return ret;
397 
398 	count_event(c, move_extent_start_fail);
399 
400 	if (trace_move_extent_start_fail_enabled()) {
401 		struct printbuf buf = PRINTBUF;
402 
403 		bch2_bkey_val_to_text(&buf, c, k);
404 		prt_str(&buf, ": ");
405 		prt_str(&buf, bch2_err_str(ret));
406 		trace_move_extent_start_fail(c, buf.buf);
407 		printbuf_exit(&buf);
408 	}
409 	return ret;
410 }
411 
412 struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
413 			  struct per_snapshot_io_opts *io_opts,
414 			  struct bkey_s_c extent_k)
415 {
416 	struct bch_fs *c = trans->c;
417 	u32 restart_count = trans->restart_count;
418 	int ret = 0;
419 
420 	if (io_opts->cur_inum != extent_k.k->p.inode) {
421 		io_opts->d.nr = 0;
422 
423 		ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
424 					 BTREE_ITER_ALL_SNAPSHOTS, k, ({
425 			if (k.k->p.offset != extent_k.k->p.inode)
426 				break;
427 
428 			if (!bkey_is_inode(k.k))
429 				continue;
430 
431 			struct bch_inode_unpacked inode;
432 			BUG_ON(bch2_inode_unpack(k, &inode));
433 
434 			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
435 			bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
436 
437 			darray_push(&io_opts->d, e);
438 		}));
439 		io_opts->cur_inum = extent_k.k->p.inode;
440 	}
441 
442 	ret = ret ?: trans_was_restarted(trans, restart_count);
443 	if (ret)
444 		return ERR_PTR(ret);
445 
446 	if (extent_k.k->p.snapshot)
447 		darray_for_each(io_opts->d, i)
448 			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
449 				return &i->io_opts;
450 
451 	return &io_opts->fs_io_opts;
452 }
453 
454 int bch2_move_get_io_opts_one(struct btree_trans *trans,
455 			      struct bch_io_opts *io_opts,
456 			      struct bkey_s_c extent_k)
457 {
458 	struct btree_iter iter;
459 	struct bkey_s_c k;
460 	int ret;
461 
462 	/* reflink btree? */
463 	if (!extent_k.k->p.inode) {
464 		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
465 		return 0;
466 	}
467 
468 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
469 			       SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
470 			       BTREE_ITER_CACHED);
471 	ret = bkey_err(k);
472 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
473 		return ret;
474 
475 	if (!ret && bkey_is_inode(k.k)) {
476 		struct bch_inode_unpacked inode;
477 		bch2_inode_unpack(k, &inode);
478 		bch2_inode_opts_get(io_opts, trans->c, &inode);
479 	} else {
480 		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
481 	}
482 
483 	bch2_trans_iter_exit(trans, &iter);
484 	return 0;
485 }
486 
487 int bch2_move_ratelimit(struct moving_context *ctxt)
488 {
489 	struct bch_fs *c = ctxt->trans->c;
490 	bool is_kthread = current->flags & PF_KTHREAD;
491 	u64 delay;
492 
493 	if (ctxt->wait_on_copygc && c->copygc_running) {
494 		bch2_moving_ctxt_flush_all(ctxt);
495 		wait_event_killable(c->copygc_running_wq,
496 				    !c->copygc_running ||
497 				    (is_kthread && kthread_should_stop()));
498 	}
499 
500 	do {
501 		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
502 
503 		if (is_kthread && kthread_should_stop())
504 			return 1;
505 
506 		if (delay)
507 			move_ctxt_wait_event_timeout(ctxt,
508 					freezing(current) ||
509 					(is_kthread && kthread_should_stop()),
510 					delay);
511 
512 		if (unlikely(freezing(current))) {
513 			bch2_moving_ctxt_flush_all(ctxt);
514 			try_to_freeze();
515 		}
516 	} while (delay);
517 
518 	/*
519 	 * XXX: these limits really ought to be per device, SSDs and hard drives
520 	 * will want different limits
521 	 */
522 	move_ctxt_wait_event(ctxt,
523 		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
524 		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
525 		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
526 		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
527 
528 	return 0;
529 }
530 
531 static int bch2_move_data_btree(struct moving_context *ctxt,
532 				struct bpos start,
533 				struct bpos end,
534 				move_pred_fn pred, void *arg,
535 				enum btree_id btree_id)
536 {
537 	struct btree_trans *trans = ctxt->trans;
538 	struct bch_fs *c = trans->c;
539 	struct per_snapshot_io_opts snapshot_io_opts;
540 	struct bch_io_opts *io_opts;
541 	struct bkey_buf sk;
542 	struct btree_iter iter;
543 	struct bkey_s_c k;
544 	struct data_update_opts data_opts;
545 	int ret = 0, ret2;
546 
547 	per_snapshot_io_opts_init(&snapshot_io_opts, c);
548 	bch2_bkey_buf_init(&sk);
549 
550 	if (ctxt->stats) {
551 		ctxt->stats->data_type	= BCH_DATA_user;
552 		ctxt->stats->pos	= BBPOS(btree_id, start);
553 	}
554 
555 	bch2_trans_iter_init(trans, &iter, btree_id, start,
556 			     BTREE_ITER_PREFETCH|
557 			     BTREE_ITER_ALL_SNAPSHOTS);
558 
559 	if (ctxt->rate)
560 		bch2_ratelimit_reset(ctxt->rate);
561 
562 	while (!bch2_move_ratelimit(ctxt)) {
563 		bch2_trans_begin(trans);
564 
565 		k = bch2_btree_iter_peek(&iter);
566 		if (!k.k)
567 			break;
568 
569 		ret = bkey_err(k);
570 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
571 			continue;
572 		if (ret)
573 			break;
574 
575 		if (bkey_ge(bkey_start_pos(k.k), end))
576 			break;
577 
578 		if (ctxt->stats)
579 			ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
580 
581 		if (!bkey_extent_is_direct_data(k.k))
582 			goto next_nondata;
583 
584 		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
585 		ret = PTR_ERR_OR_ZERO(io_opts);
586 		if (ret)
587 			continue;
588 
589 		memset(&data_opts, 0, sizeof(data_opts));
590 		if (!pred(c, arg, k, io_opts, &data_opts))
591 			goto next;
592 
593 		/*
594 		 * The iterator gets unlocked by __bch2_read_extent - need to
595 		 * save a copy of @k elsewhere:
596 		 */
597 		bch2_bkey_buf_reassemble(&sk, c, k);
598 		k = bkey_i_to_s_c(sk.k);
599 
600 		ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
601 		if (ret2) {
602 			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
603 				continue;
604 
605 			if (ret2 == -ENOMEM) {
606 				/* memory allocation failure, wait for some IO to finish */
607 				bch2_move_ctxt_wait_for_io(ctxt);
608 				continue;
609 			}
610 
611 			/* XXX signal failure */
612 			goto next;
613 		}
614 next:
615 		if (ctxt->stats)
616 			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
617 next_nondata:
618 		bch2_btree_iter_advance(&iter);
619 	}
620 
621 	bch2_trans_iter_exit(trans, &iter);
622 	bch2_bkey_buf_exit(&sk, c);
623 	per_snapshot_io_opts_exit(&snapshot_io_opts);
624 
625 	return ret;
626 }
627 
628 int __bch2_move_data(struct moving_context *ctxt,
629 		     struct bbpos start,
630 		     struct bbpos end,
631 		     move_pred_fn pred, void *arg)
632 {
633 	struct bch_fs *c = ctxt->trans->c;
634 	enum btree_id id;
635 	int ret = 0;
636 
637 	for (id = start.btree;
638 	     id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
639 	     id++) {
640 		ctxt->stats->pos = BBPOS(id, POS_MIN);
641 
642 		if (!btree_type_has_ptrs(id) ||
643 		    !bch2_btree_id_root(c, id)->b)
644 			continue;
645 
646 		ret = bch2_move_data_btree(ctxt,
647 				       id == start.btree ? start.pos : POS_MIN,
648 				       id == end.btree   ? end.pos   : POS_MAX,
649 				       pred, arg, id);
650 		if (ret)
651 			break;
652 	}
653 
654 	return ret;
655 }
656 
657 int bch2_move_data(struct bch_fs *c,
658 		   struct bbpos start,
659 		   struct bbpos end,
660 		   struct bch_ratelimit *rate,
661 		   struct bch_move_stats *stats,
662 		   struct write_point_specifier wp,
663 		   bool wait_on_copygc,
664 		   move_pred_fn pred, void *arg)
665 {
666 
667 	struct moving_context ctxt;
668 	int ret;
669 
670 	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
671 	ret = __bch2_move_data(&ctxt, start, end, pred, arg);
672 	bch2_moving_ctxt_exit(&ctxt);
673 
674 	return ret;
675 }
676 
677 int bch2_evacuate_bucket(struct moving_context *ctxt,
678 			   struct move_bucket_in_flight *bucket_in_flight,
679 			   struct bpos bucket, int gen,
680 			   struct data_update_opts _data_opts)
681 {
682 	struct btree_trans *trans = ctxt->trans;
683 	struct bch_fs *c = trans->c;
684 	bool is_kthread = current->flags & PF_KTHREAD;
685 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
686 	struct btree_iter iter;
687 	struct bkey_buf sk;
688 	struct bch_backpointer bp;
689 	struct bch_alloc_v4 a_convert;
690 	const struct bch_alloc_v4 *a;
691 	struct bkey_s_c k;
692 	struct data_update_opts data_opts;
693 	unsigned dirty_sectors, bucket_size;
694 	u64 fragmentation;
695 	struct bpos bp_pos = POS_MIN;
696 	int ret = 0;
697 
698 	trace_bucket_evacuate(c, &bucket);
699 
700 	bch2_bkey_buf_init(&sk);
701 
702 	/*
703 	 * We're not run in a context that handles transaction restarts:
704 	 */
705 	bch2_trans_begin(trans);
706 
707 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
708 			     bucket, BTREE_ITER_CACHED);
709 	ret = lockrestart_do(trans,
710 			bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
711 	bch2_trans_iter_exit(trans, &iter);
712 
713 	bch_err_msg(c, ret, "looking up alloc key");
714 	if (ret)
715 		goto err;
716 
717 	a = bch2_alloc_to_v4(k, &a_convert);
718 	dirty_sectors = bch2_bucket_sectors_dirty(*a);
719 	bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
720 	fragmentation = a->fragmentation_lru;
721 
722 	ret = bch2_btree_write_buffer_tryflush(trans);
723 	bch_err_msg(c, ret, "flushing btree write buffer");
724 	if (ret)
725 		goto err;
726 
727 	while (!(ret = bch2_move_ratelimit(ctxt))) {
728 		if (is_kthread && kthread_should_stop())
729 			break;
730 
731 		bch2_trans_begin(trans);
732 
733 		ret = bch2_get_next_backpointer(trans, bucket, gen,
734 						&bp_pos, &bp,
735 						BTREE_ITER_CACHED);
736 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
737 			continue;
738 		if (ret)
739 			goto err;
740 		if (bkey_eq(bp_pos, POS_MAX))
741 			break;
742 
743 		if (!bp.level) {
744 			k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
745 			ret = bkey_err(k);
746 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
747 				continue;
748 			if (ret)
749 				goto err;
750 			if (!k.k)
751 				goto next;
752 
753 			bch2_bkey_buf_reassemble(&sk, c, k);
754 			k = bkey_i_to_s_c(sk.k);
755 
756 			ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
757 			if (ret) {
758 				bch2_trans_iter_exit(trans, &iter);
759 				continue;
760 			}
761 
762 			data_opts = _data_opts;
763 			data_opts.target	= io_opts.background_target;
764 			data_opts.rewrite_ptrs = 0;
765 
766 			unsigned i = 0;
767 			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
768 				if (ptr->dev == bucket.inode) {
769 					data_opts.rewrite_ptrs |= 1U << i;
770 					if (ptr->cached) {
771 						bch2_trans_iter_exit(trans, &iter);
772 						goto next;
773 					}
774 				}
775 				i++;
776 			}
777 
778 			ret = bch2_move_extent(ctxt, bucket_in_flight,
779 					       &iter, k, io_opts, data_opts);
780 			bch2_trans_iter_exit(trans, &iter);
781 
782 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
783 				continue;
784 			if (ret == -ENOMEM) {
785 				/* memory allocation failure, wait for some IO to finish */
786 				bch2_move_ctxt_wait_for_io(ctxt);
787 				continue;
788 			}
789 			if (ret)
790 				goto err;
791 
792 			if (ctxt->stats)
793 				atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
794 		} else {
795 			struct btree *b;
796 
797 			b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
798 			ret = PTR_ERR_OR_ZERO(b);
799 			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
800 				continue;
801 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
802 				continue;
803 			if (ret)
804 				goto err;
805 			if (!b)
806 				goto next;
807 
808 			unsigned sectors = btree_ptr_sectors_written(&b->key);
809 
810 			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
811 			bch2_trans_iter_exit(trans, &iter);
812 
813 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
814 				continue;
815 			if (ret)
816 				goto err;
817 
818 			if (ctxt->rate)
819 				bch2_ratelimit_increment(ctxt->rate, sectors);
820 			if (ctxt->stats) {
821 				atomic64_add(sectors, &ctxt->stats->sectors_seen);
822 				atomic64_add(sectors, &ctxt->stats->sectors_moved);
823 			}
824 		}
825 next:
826 		bp_pos = bpos_nosnap_successor(bp_pos);
827 	}
828 
829 	trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
830 err:
831 	bch2_bkey_buf_exit(&sk, c);
832 	return ret;
833 }
834 
835 typedef bool (*move_btree_pred)(struct bch_fs *, void *,
836 				struct btree *, struct bch_io_opts *,
837 				struct data_update_opts *);
838 
839 static int bch2_move_btree(struct bch_fs *c,
840 			   struct bbpos start,
841 			   struct bbpos end,
842 			   move_btree_pred pred, void *arg,
843 			   struct bch_move_stats *stats)
844 {
845 	bool kthread = (current->flags & PF_KTHREAD) != 0;
846 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
847 	struct moving_context ctxt;
848 	struct btree_trans *trans;
849 	struct btree_iter iter;
850 	struct btree *b;
851 	enum btree_id btree;
852 	struct data_update_opts data_opts;
853 	int ret = 0;
854 
855 	bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
856 			      writepoint_ptr(&c->btree_write_point),
857 			      true);
858 	trans = ctxt.trans;
859 
860 	stats->data_type = BCH_DATA_btree;
861 
862 	for (btree = start.btree;
863 	     btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
864 	     btree ++) {
865 		stats->pos = BBPOS(btree, POS_MIN);
866 
867 		if (!bch2_btree_id_root(c, btree)->b)
868 			continue;
869 
870 		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
871 					  BTREE_ITER_PREFETCH);
872 retry:
873 		ret = 0;
874 		while (bch2_trans_begin(trans),
875 		       (b = bch2_btree_iter_peek_node(&iter)) &&
876 		       !(ret = PTR_ERR_OR_ZERO(b))) {
877 			if (kthread && kthread_should_stop())
878 				break;
879 
880 			if ((cmp_int(btree, end.btree) ?:
881 			     bpos_cmp(b->key.k.p, end.pos)) > 0)
882 				break;
883 
884 			stats->pos = BBPOS(iter.btree_id, iter.pos);
885 
886 			if (!pred(c, arg, b, &io_opts, &data_opts))
887 				goto next;
888 
889 			ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
890 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
891 				continue;
892 			if (ret)
893 				break;
894 next:
895 			bch2_btree_iter_next_node(&iter);
896 		}
897 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
898 			goto retry;
899 
900 		bch2_trans_iter_exit(trans, &iter);
901 
902 		if (kthread && kthread_should_stop())
903 			break;
904 	}
905 
906 	bch_err_fn(c, ret);
907 	bch2_moving_ctxt_exit(&ctxt);
908 	bch2_btree_interior_updates_flush(c);
909 
910 	return ret;
911 }
912 
913 static bool rereplicate_pred(struct bch_fs *c, void *arg,
914 			     struct bkey_s_c k,
915 			     struct bch_io_opts *io_opts,
916 			     struct data_update_opts *data_opts)
917 {
918 	unsigned nr_good = bch2_bkey_durability(c, k);
919 	unsigned replicas = bkey_is_btree_ptr(k.k)
920 		? c->opts.metadata_replicas
921 		: io_opts->data_replicas;
922 
923 	if (!nr_good || nr_good >= replicas)
924 		return false;
925 
926 	data_opts->target		= 0;
927 	data_opts->extra_replicas	= replicas - nr_good;
928 	data_opts->btree_insert_flags	= 0;
929 	return true;
930 }
931 
932 static bool migrate_pred(struct bch_fs *c, void *arg,
933 			 struct bkey_s_c k,
934 			 struct bch_io_opts *io_opts,
935 			 struct data_update_opts *data_opts)
936 {
937 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
938 	struct bch_ioctl_data *op = arg;
939 	unsigned i = 0;
940 
941 	data_opts->rewrite_ptrs		= 0;
942 	data_opts->target		= 0;
943 	data_opts->extra_replicas	= 0;
944 	data_opts->btree_insert_flags	= 0;
945 
946 	bkey_for_each_ptr(ptrs, ptr) {
947 		if (ptr->dev == op->migrate.dev)
948 			data_opts->rewrite_ptrs |= 1U << i;
949 		i++;
950 	}
951 
952 	return data_opts->rewrite_ptrs != 0;
953 }
954 
955 static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
956 				   struct btree *b,
957 				   struct bch_io_opts *io_opts,
958 				   struct data_update_opts *data_opts)
959 {
960 	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
961 }
962 
963 static bool migrate_btree_pred(struct bch_fs *c, void *arg,
964 			       struct btree *b,
965 			       struct bch_io_opts *io_opts,
966 			       struct data_update_opts *data_opts)
967 {
968 	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
969 }
970 
971 static bool bformat_needs_redo(struct bkey_format *f)
972 {
973 	unsigned i;
974 
975 	for (i = 0; i < f->nr_fields; i++) {
976 		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
977 		u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
978 		u64 field_offset = le64_to_cpu(f->field_offset[i]);
979 
980 		if (f->bits_per_field[i] > unpacked_bits)
981 			return true;
982 
983 		if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
984 			return true;
985 
986 		if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
987 		     unpacked_mask) <
988 		    field_offset)
989 			return true;
990 	}
991 
992 	return false;
993 }
994 
995 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
996 				   struct btree *b,
997 				   struct bch_io_opts *io_opts,
998 				   struct data_update_opts *data_opts)
999 {
1000 	if (b->version_ondisk != c->sb.version ||
1001 	    btree_node_need_rewrite(b) ||
1002 	    bformat_needs_redo(&b->format)) {
1003 		data_opts->target		= 0;
1004 		data_opts->extra_replicas	= 0;
1005 		data_opts->btree_insert_flags	= 0;
1006 		return true;
1007 	}
1008 
1009 	return false;
1010 }
1011 
1012 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
1013 {
1014 	int ret;
1015 
1016 	ret = bch2_move_btree(c,
1017 			      BBPOS_MIN,
1018 			      BBPOS_MAX,
1019 			      rewrite_old_nodes_pred, c, stats);
1020 	if (!ret) {
1021 		mutex_lock(&c->sb_lock);
1022 		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
1023 		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
1024 		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
1025 		bch2_write_super(c);
1026 		mutex_unlock(&c->sb_lock);
1027 	}
1028 
1029 	bch_err_fn(c, ret);
1030 	return ret;
1031 }
1032 
1033 static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
1034 			     struct bkey_s_c k,
1035 			     struct bch_io_opts *io_opts,
1036 			     struct data_update_opts *data_opts)
1037 {
1038 	unsigned durability = bch2_bkey_durability(c, k);
1039 	unsigned replicas = bkey_is_btree_ptr(k.k)
1040 		? c->opts.metadata_replicas
1041 		: io_opts->data_replicas;
1042 	const union bch_extent_entry *entry;
1043 	struct extent_ptr_decoded p;
1044 	unsigned i = 0;
1045 
1046 	bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
1047 		unsigned d = bch2_extent_ptr_durability(c, &p);
1048 
1049 		if (d && durability - d >= replicas) {
1050 			data_opts->kill_ptrs |= BIT(i);
1051 			durability -= d;
1052 		}
1053 
1054 		i++;
1055 	}
1056 
1057 	return data_opts->kill_ptrs != 0;
1058 }
1059 
1060 static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
1061 				   struct btree *b,
1062 				   struct bch_io_opts *io_opts,
1063 				   struct data_update_opts *data_opts)
1064 {
1065 	return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
1066 }
1067 
1068 int bch2_data_job(struct bch_fs *c,
1069 		  struct bch_move_stats *stats,
1070 		  struct bch_ioctl_data op)
1071 {
1072 	struct bbpos start	= BBPOS(op.start_btree, op.start_pos);
1073 	struct bbpos end	= BBPOS(op.end_btree, op.end_pos);
1074 	int ret = 0;
1075 
1076 	if (op.op >= BCH_DATA_OP_NR)
1077 		return -EINVAL;
1078 
1079 	bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
1080 
1081 	switch (op.op) {
1082 	case BCH_DATA_OP_rereplicate:
1083 		stats->data_type = BCH_DATA_journal;
1084 		ret = bch2_journal_flush_device_pins(&c->journal, -1);
1085 		ret = bch2_move_btree(c, start, end,
1086 				      rereplicate_btree_pred, c, stats) ?: ret;
1087 		ret = bch2_move_data(c, start, end,
1088 				     NULL,
1089 				     stats,
1090 				     writepoint_hashed((unsigned long) current),
1091 				     true,
1092 				     rereplicate_pred, c) ?: ret;
1093 		ret = bch2_replicas_gc2(c) ?: ret;
1094 		break;
1095 	case BCH_DATA_OP_migrate:
1096 		if (op.migrate.dev >= c->sb.nr_devices)
1097 			return -EINVAL;
1098 
1099 		stats->data_type = BCH_DATA_journal;
1100 		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
1101 		ret = bch2_move_btree(c, start, end,
1102 				      migrate_btree_pred, &op, stats) ?: ret;
1103 		ret = bch2_move_data(c, start, end,
1104 				     NULL,
1105 				     stats,
1106 				     writepoint_hashed((unsigned long) current),
1107 				     true,
1108 				     migrate_pred, &op) ?: ret;
1109 		ret = bch2_replicas_gc2(c) ?: ret;
1110 		break;
1111 	case BCH_DATA_OP_rewrite_old_nodes:
1112 		ret = bch2_scan_old_btree_nodes(c, stats);
1113 		break;
1114 	case BCH_DATA_OP_drop_extra_replicas:
1115 		ret = bch2_move_btree(c, start, end,
1116 				drop_extra_replicas_btree_pred, c, stats) ?: ret;
1117 		ret = bch2_move_data(c, start, end, NULL, stats,
1118 				writepoint_hashed((unsigned long) current),
1119 				true,
1120 				drop_extra_replicas_pred, c) ?: ret;
1121 		ret = bch2_replicas_gc2(c) ?: ret;
1122 		break;
1123 	default:
1124 		ret = -EINVAL;
1125 	}
1126 
1127 	bch2_move_stats_exit(stats, c);
1128 	return ret;
1129 }
1130 
1131 void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
1132 {
1133 	prt_printf(out, "%s: data type==", stats->name);
1134 	bch2_prt_data_type(out, stats->data_type);
1135 	prt_str(out, " pos=");
1136 	bch2_bbpos_to_text(out, stats->pos);
1137 	prt_newline(out);
1138 	printbuf_indent_add(out, 2);
1139 
1140 	prt_str(out, "keys moved:  ");
1141 	prt_u64(out, atomic64_read(&stats->keys_moved));
1142 	prt_newline(out);
1143 
1144 	prt_str(out, "keys raced:  ");
1145 	prt_u64(out, atomic64_read(&stats->keys_raced));
1146 	prt_newline(out);
1147 
1148 	prt_str(out, "bytes seen:  ");
1149 	prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
1150 	prt_newline(out);
1151 
1152 	prt_str(out, "bytes moved: ");
1153 	prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
1154 	prt_newline(out);
1155 
1156 	prt_str(out, "bytes raced: ");
1157 	prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
1158 	prt_newline(out);
1159 
1160 	printbuf_indent_sub(out, 2);
1161 }
1162 
1163 static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
1164 {
1165 	struct moving_io *io;
1166 
1167 	bch2_move_stats_to_text(out, ctxt->stats);
1168 	printbuf_indent_add(out, 2);
1169 
1170 	prt_printf(out, "reads: ios %u/%u sectors %u/%u",
1171 		   atomic_read(&ctxt->read_ios),
1172 		   c->opts.move_ios_in_flight,
1173 		   atomic_read(&ctxt->read_sectors),
1174 		   c->opts.move_bytes_in_flight >> 9);
1175 	prt_newline(out);
1176 
1177 	prt_printf(out, "writes: ios %u/%u sectors %u/%u",
1178 		   atomic_read(&ctxt->write_ios),
1179 		   c->opts.move_ios_in_flight,
1180 		   atomic_read(&ctxt->write_sectors),
1181 		   c->opts.move_bytes_in_flight >> 9);
1182 	prt_newline(out);
1183 
1184 	printbuf_indent_add(out, 2);
1185 
1186 	mutex_lock(&ctxt->lock);
1187 	list_for_each_entry(io, &ctxt->ios, io_list)
1188 		bch2_write_op_to_text(out, &io->write.op);
1189 	mutex_unlock(&ctxt->lock);
1190 
1191 	printbuf_indent_sub(out, 4);
1192 }
1193 
1194 void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
1195 {
1196 	struct moving_context *ctxt;
1197 
1198 	mutex_lock(&c->moving_context_lock);
1199 	list_for_each_entry(ctxt, &c->moving_context_list, list)
1200 		bch2_moving_ctxt_to_text(out, c, ctxt);
1201 	mutex_unlock(&c->moving_context_lock);
1202 }
1203 
1204 void bch2_fs_move_init(struct bch_fs *c)
1205 {
1206 	INIT_LIST_HEAD(&c->moving_context_list);
1207 	mutex_init(&c->moving_context_lock);
1208 }
1209