xref: /linux/fs/bcachefs/move.c (revision 031fba65fc202abf1f193e321be7a2c274fd88ba)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "alloc_foreground.h"
6 #include "backpointers.h"
7 #include "bkey_buf.h"
8 #include "btree_gc.h"
9 #include "btree_update.h"
10 #include "btree_update_interior.h"
11 #include "btree_write_buffer.h"
12 #include "disk_groups.h"
13 #include "ec.h"
14 #include "errcode.h"
15 #include "error.h"
16 #include "inode.h"
17 #include "io_read.h"
18 #include "io_write.h"
19 #include "journal_reclaim.h"
20 #include "keylist.h"
21 #include "move.h"
22 #include "replicas.h"
23 #include "super-io.h"
24 #include "trace.h"
25 
26 #include <linux/ioprio.h>
27 #include <linux/kthread.h>
28 
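/*
 * Tracepoint wrappers: only format the bkey (which requires a printbuf
 * allocation) when the corresponding tracepoint is actually enabled.
 */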
29 static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
30 {
31 	if (trace_move_extent_enabled()) {
32 		struct printbuf buf = PRINTBUF;
33 
34 		bch2_bkey_val_to_text(&buf, c, k);
35 		trace_move_extent(c, buf.buf);
36 		printbuf_exit(&buf);
37 	}
38 }
39 
40 static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
41 {
42 	if (trace_move_extent_read_enabled()) {
43 		struct printbuf buf = PRINTBUF;
44 
45 		bch2_bkey_val_to_text(&buf, c, k);
46 		trace_move_extent_read(c, buf.buf);
47 		printbuf_exit(&buf);
48 	}
49 }
50 
51 static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k)
52 {
53 	if (trace_move_extent_alloc_mem_fail_enabled()) {
54 		struct printbuf buf = PRINTBUF;
55 
56 		bch2_bkey_val_to_text(&buf, c, k);
57 		trace_move_extent_alloc_mem_fail(c, buf.buf);
58 		printbuf_exit(&buf);
59 	}
60 }
61 
62 static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
63 {
64 	mutex_lock(&c->data_progress_lock);
65 	list_add(&stats->list, &c->data_progress_list);
66 	mutex_unlock(&c->data_progress_lock);
67 }
68 
69 static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
70 {
71 	mutex_lock(&c->data_progress_lock);
72 	list_del(&stats->list);
73 	mutex_unlock(&c->data_progress_lock);
74 }
75 
76 struct moving_io {
77 	struct list_head		read_list;
78 	struct list_head		io_list;
79 	struct move_bucket_in_flight	*b;
80 	struct closure			cl;
81 	bool				read_completed;
82 
83 	unsigned			read_sectors;
84 	unsigned			write_sectors;
85 
86 	struct bch_read_bio		rbio;
87 
88 	struct data_update		write;
89 	/* Must be last since it is variable size */
90 	struct bio_vec			bi_inline_vecs[];
91 };
92 
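/*
 * Tear down a moving_io once it's no longer needed: drop the in-flight
 * bucket count, release the data update state, unlink the io from the
 * context's list and wake anyone waiting on the context.
 */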
93 static void move_free(struct moving_io *io)
94 {
95 	struct moving_context *ctxt = io->write.ctxt;
96 
97 	if (io->b)
98 		atomic_dec(&io->b->count);
99 
100 	bch2_data_update_exit(&io->write);
101 
102 	mutex_lock(&ctxt->lock);
103 	list_del(&io->io_list);
104 	wake_up(&ctxt->wait);
105 	mutex_unlock(&ctxt->lock);
106 
107 	kfree(io);
108 }
109 
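/*
 * Write completion: record any write error on the context, drop the
 * in-flight write accounting, free the io and release the closure ref
 * taken in move_write().
 */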
110 static void move_write_done(struct bch_write_op *op)
111 {
112 	struct moving_io *io = container_of(op, struct moving_io, write.op);
113 	struct moving_context *ctxt = io->write.ctxt;
114 
115 	if (io->write.op.error)
116 		ctxt->write_error = true;
117 
118 	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
119 	atomic_dec(&io->write.ctxt->write_ios);
120 	move_free(io);
121 	closure_put(&ctxt->cl);
122 }
123 
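/*
 * Issue the write half of a data move. If the read failed or hit a hole
 * there is nothing to rewrite, so just free the io; otherwise account the
 * write as in flight and hand it off to the data update path.
 */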
124 static void move_write(struct moving_io *io)
125 {
126 	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
127 		move_free(io);
128 		return;
129 	}
130 
131 	closure_get(&io->write.ctxt->cl);
132 	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
133 	atomic_inc(&io->write.ctxt->write_ios);
134 
135 	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
136 }
137 
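/*
 * Writes are issued in the order their reads were started: return the
 * oldest io on the reads list, but only once its read has completed.
 */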
138 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
139 {
140 	struct moving_io *io =
141 		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);
142 
143 	return io && io->read_completed ? io : NULL;
144 }
145 
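/*
 * Read completion: drop the in-flight read accounting, mark the io as
 * ready for its write, and wake the context so the pending write can be
 * issued.
 */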
146 static void move_read_endio(struct bio *bio)
147 {
148 	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
149 	struct moving_context *ctxt = io->write.ctxt;
150 
151 	atomic_sub(io->read_sectors, &ctxt->read_sectors);
152 	atomic_dec(&ctxt->read_ios);
153 	io->read_completed = true;
154 
155 	wake_up(&ctxt->wait);
156 	closure_put(&ctxt->cl);
157 }
158 
159 void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
160 					struct btree_trans *trans)
161 {
162 	struct moving_io *io;
163 
164 	if (trans)
165 		bch2_trans_unlock(trans);
166 
167 	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
168 		list_del(&io->read_list);
169 		move_write(io);
170 	}
171 }
172 
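/*
 * Wait for in-flight writes to make progress: returns once all writes
 * have drained or the number of outstanding write sectors has changed.
 */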
173 static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
174 				       struct btree_trans *trans)
175 {
176 	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
177 
178 	move_ctxt_wait_event(ctxt, trans,
179 		!atomic_read(&ctxt->write_sectors) ||
180 		atomic_read(&ctxt->write_sectors) != sectors_pending);
181 }
182 
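/*
 * Flush all outstanding reads and writes, then tear down the context:
 * remove it from the filesystem's lists and, if stats were attached, emit
 * the final move_data tracepoint.
 */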
183 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
184 {
185 	struct bch_fs *c = ctxt->c;
186 
187 	move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
188 	closure_sync(&ctxt->cl);
189 
190 	EBUG_ON(atomic_read(&ctxt->write_sectors));
191 	EBUG_ON(atomic_read(&ctxt->write_ios));
192 	EBUG_ON(atomic_read(&ctxt->read_sectors));
193 	EBUG_ON(atomic_read(&ctxt->read_ios));
194 
195 	if (ctxt->stats) {
196 		progress_list_del(c, ctxt->stats);
197 		trace_move_data(c,
198 				atomic64_read(&ctxt->stats->sectors_moved),
199 				atomic64_read(&ctxt->stats->keys_moved));
200 	}
201 
202 	mutex_lock(&c->moving_context_lock);
203 	list_del(&ctxt->list);
204 	mutex_unlock(&c->moving_context_lock);
205 }
206 
207 void bch2_moving_ctxt_init(struct moving_context *ctxt,
208 			   struct bch_fs *c,
209 			   struct bch_ratelimit *rate,
210 			   struct bch_move_stats *stats,
211 			   struct write_point_specifier wp,
212 			   bool wait_on_copygc)
213 {
214 	memset(ctxt, 0, sizeof(*ctxt));
215 
216 	ctxt->c		= c;
217 	ctxt->fn	= (void *) _RET_IP_;
218 	ctxt->rate	= rate;
219 	ctxt->stats	= stats;
220 	ctxt->wp	= wp;
221 	ctxt->wait_on_copygc = wait_on_copygc;
222 
223 	closure_init_stack(&ctxt->cl);
224 
225 	mutex_init(&ctxt->lock);
226 	INIT_LIST_HEAD(&ctxt->reads);
227 	INIT_LIST_HEAD(&ctxt->ios);
228 	init_waitqueue_head(&ctxt->wait);
229 
230 	mutex_lock(&c->moving_context_lock);
231 	list_add(&ctxt->list, &c->moving_context_list);
232 	mutex_unlock(&c->moving_context_lock);
233 
234 	if (stats) {
235 		progress_list_add(c, stats);
236 		stats->data_type = BCH_DATA_user;
237 	}
238 }
239 
240 void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
241 {
242 	memset(stats, 0, sizeof(*stats));
243 	scnprintf(stats->name, sizeof(stats->name), "%s", name);
244 }
245 
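/*
 * Drop the pointers selected by data_opts.kill_ptrs from @k and write the
 * result back, without rewriting any data.
 */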
246 static int bch2_extent_drop_ptrs(struct btree_trans *trans,
247 				 struct btree_iter *iter,
248 				 struct bkey_s_c k,
249 				 struct data_update_opts data_opts)
250 {
251 	struct bch_fs *c = trans->c;
252 	struct bkey_i *n;
253 	int ret;
254 
255 	n = bch2_bkey_make_mut_noupdate(trans, k);
256 	ret = PTR_ERR_OR_ZERO(n);
257 	if (ret)
258 		return ret;
259 
260 	while (data_opts.kill_ptrs) {
261 		unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
262 		struct bch_extent_ptr *ptr;
263 
264 		bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop);
265 		data_opts.kill_ptrs ^= 1U << drop;
266 	}
267 
268 	/*
269 	 * If the new extent no longer has any pointers, bch2_extent_normalize()
270 	 * will do the appropriate thing with it (turning it into a
271 	 * KEY_TYPE_error key, or just a discard if it was a cached extent)
272 	 */
273 	bch2_extent_normalize(c, bkey_i_to_s(n));
274 
275 	/*
276 	 * Since we're not inserting through an extent iterator
277 	 * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
278 	 * we aren't using the extent overwrite path to delete, we're
279 	 * just using the normal key deletion path:
280 	 */
281 	if (bkey_deleted(&n->k))
282 		n->k.size = 0;
283 
284 	return bch2_trans_relock(trans) ?:
285 		bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
286 		bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
287 }
288 
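/*
 * Kick off the move of a single extent: allocate a moving_io, set up the
 * read and write bios, account the read as in flight and submit it; the
 * write is issued later, from the context's pending writes list, once the
 * read completes. Extents that only need pointers dropped, and unwritten
 * extents, are handled without doing any IO.
 */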
289 static int bch2_move_extent(struct btree_trans *trans,
290 			    struct btree_iter *iter,
291 			    struct moving_context *ctxt,
292 			    struct move_bucket_in_flight *bucket_in_flight,
293 			    struct bch_io_opts io_opts,
294 			    enum btree_id btree_id,
295 			    struct bkey_s_c k,
296 			    struct data_update_opts data_opts)
297 {
298 	struct bch_fs *c = trans->c;
299 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
300 	struct moving_io *io;
301 	const union bch_extent_entry *entry;
302 	struct extent_ptr_decoded p;
303 	unsigned sectors = k.k->size, pages;
304 	int ret = -ENOMEM;
305 
306 	trace_move_extent2(c, k);
307 
308 	bch2_data_update_opts_normalize(k, &data_opts);
309 
310 	if (!data_opts.rewrite_ptrs &&
311 	    !data_opts.extra_replicas) {
312 		if (data_opts.kill_ptrs)
313 			return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
314 		return 0;
315 	}
316 
317 	/*
318 	 * Before memory allocations & taking nocow locks in
319 	 * bch2_data_update_init():
320 	 */
321 	bch2_trans_unlock(trans);
322 
323 	/* write path might have to decompress data: */
324 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
325 		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
326 
327 	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
328 	io = kzalloc(sizeof(struct moving_io) +
329 		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
330 	if (!io)
331 		goto err;
332 
333 	INIT_LIST_HEAD(&io->io_list);
334 	io->write.ctxt		= ctxt;
335 	io->read_sectors	= k.k->size;
336 	io->write_sectors	= k.k->size;
337 
338 	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
339 	bio_set_prio(&io->write.op.wbio.bio,
340 		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
341 
342 	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
343 				 GFP_KERNEL))
344 		goto err_free;
345 
346 	io->rbio.c		= c;
347 	io->rbio.opts		= io_opts;
348 	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
349 	io->rbio.bio.bi_vcnt = pages;
350 	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
351 	io->rbio.bio.bi_iter.bi_size = sectors << 9;
352 
353 	io->rbio.bio.bi_opf		= REQ_OP_READ;
354 	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
355 	io->rbio.bio.bi_end_io		= move_read_endio;
356 
357 	ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
358 				    io_opts, data_opts, btree_id, k);
359 	if (ret && ret != -BCH_ERR_unwritten_extent_update)
360 		goto err_free_pages;
361 
362 	if (ret == -BCH_ERR_unwritten_extent_update) {
363 		bch2_update_unwritten_extent(trans, &io->write);
364 		move_free(io);
365 		return 0;
366 	}
367 
368 	BUG_ON(ret);
369 
370 	io->write.ctxt = ctxt;
371 	io->write.op.end_io = move_write_done;
372 
373 	if (ctxt->stats) {
374 		atomic64_inc(&ctxt->stats->keys_moved);
375 		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
376 	}
377 
378 	if (bucket_in_flight) {
379 		io->b = bucket_in_flight;
380 		atomic_inc(&io->b->count);
381 	}
382 
383 	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
384 	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
385 	trace_move_extent_read2(c, k);
386 
387 	mutex_lock(&ctxt->lock);
388 	atomic_add(io->read_sectors, &ctxt->read_sectors);
389 	atomic_inc(&ctxt->read_ios);
390 
391 	list_add_tail(&io->read_list, &ctxt->reads);
392 	list_add_tail(&io->io_list, &ctxt->ios);
393 	mutex_unlock(&ctxt->lock);
394 
395 	/*
396 	 * dropped by move_read_endio() - guards against use after free of
397 	 * ctxt when doing wakeup
398 	 */
399 	closure_get(&ctxt->cl);
400 	bch2_read_extent(trans, &io->rbio,
401 			 bkey_start_pos(k.k),
402 			 btree_id, k, 0,
403 			 BCH_READ_NODECODE|
404 			 BCH_READ_LAST_FRAGMENT);
405 	return 0;
406 err_free_pages:
407 	bio_free_pages(&io->write.op.wbio.bio);
408 err_free:
409 	kfree(io);
410 err:
411 	this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]);
412 	trace_move_extent_alloc_mem_fail2(c, k);
413 	return ret;
414 }
415 
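/*
 * Look up and unpack the inode at @pos; returns -BCH_ERR_ENOENT_inode if
 * no inode exists there.
 */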
416 static int lookup_inode(struct btree_trans *trans, struct bpos pos,
417 			struct bch_inode_unpacked *inode)
418 {
419 	struct btree_iter iter;
420 	struct bkey_s_c k;
421 	int ret;
422 
423 	bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
424 			     BTREE_ITER_ALL_SNAPSHOTS);
425 	k = bch2_btree_iter_peek(&iter);
426 	ret = bkey_err(k);
427 	if (ret)
428 		goto err;
429 
430 	if (!k.k || !bkey_eq(k.k->p, pos)) {
431 		ret = -BCH_ERR_ENOENT_inode;
432 		goto err;
433 	}
434 
435 	ret = bkey_is_inode(k.k) ? 0 : -EIO;
436 	if (ret)
437 		goto err;
438 
439 	ret = bch2_inode_unpack(k, inode);
440 	if (ret)
441 		goto err;
442 err:
443 	bch2_trans_iter_exit(trans, &iter);
444 	return ret;
445 }
446 
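/*
 * Throttle the move path: optionally wait for copygc to finish, honour the
 * rate limiter, and block until in-flight reads and writes are back under
 * the configured limits. Returns nonzero if the kthread should stop.
 */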
447 static int move_ratelimit(struct btree_trans *trans,
448 			  struct moving_context *ctxt)
449 {
450 	struct bch_fs *c = trans->c;
451 	u64 delay;
452 
453 	if (ctxt->wait_on_copygc) {
454 		bch2_trans_unlock(trans);
455 		wait_event_killable(c->copygc_running_wq,
456 				    !c->copygc_running ||
457 				    kthread_should_stop());
458 	}
459 
460 	do {
461 		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
462 
463 		if (delay) {
464 			bch2_trans_unlock(trans);
465 			set_current_state(TASK_INTERRUPTIBLE);
466 		}
467 
468 		if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
469 			__set_current_state(TASK_RUNNING);
470 			return 1;
471 		}
472 
473 		if (delay)
474 			schedule_timeout(delay);
475 
476 		if (unlikely(freezing(current))) {
477 			move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
478 			try_to_freeze();
479 		}
480 	} while (delay);
481 
482 	/*
483 	 * XXX: these limits really ought to be per device, SSDs and hard drives
484 	 * will want different limits
485 	 */
486 	move_ctxt_wait_event(ctxt, trans,
487 		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
488 		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
489 		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
490 		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);
491 
492 	return 0;
493 }
494 
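/*
 * Refresh @io_opts for the inode that @k belongs to, caching the last
 * inode number looked up in @cur_inum; falls back to the filesystem
 * defaults if the inode can't be found.
 */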
495 static int move_get_io_opts(struct btree_trans *trans,
496 			    struct bch_io_opts *io_opts,
497 			    struct bkey_s_c k, u64 *cur_inum)
498 {
499 	struct bch_inode_unpacked inode;
500 	int ret;
501 
502 	if (*cur_inum == k.k->p.inode)
503 		return 0;
504 
505 	ret = lookup_inode(trans,
506 			   SPOS(0, k.k->p.inode, k.k->p.snapshot),
507 			   &inode);
508 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
509 		return ret;
510 
511 	if (!ret)
512 		bch2_inode_opts_get(io_opts, trans->c, &inode);
513 	else
514 		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
515 	*cur_inum = k.k->p.inode;
516 	return 0;
517 }
518 
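/*
 * Walk keys in @btree_id from @start to @end, calling @pred on each extent
 * to decide whether and how it should be moved, and kicking off moves for
 * the extents it accepts.
 */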
519 static int __bch2_move_data(struct moving_context *ctxt,
520 			    struct bpos start,
521 			    struct bpos end,
522 			    move_pred_fn pred, void *arg,
523 			    enum btree_id btree_id)
524 {
525 	struct bch_fs *c = ctxt->c;
526 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
527 	struct bkey_buf sk;
528 	struct btree_trans *trans = bch2_trans_get(c);
529 	struct btree_iter iter;
530 	struct bkey_s_c k;
531 	struct data_update_opts data_opts;
532 	u64 cur_inum = U64_MAX;
533 	int ret = 0, ret2;
534 
535 	bch2_bkey_buf_init(&sk);
536 
537 	if (ctxt->stats) {
538 		ctxt->stats->data_type	= BCH_DATA_user;
539 		ctxt->stats->btree_id	= btree_id;
540 		ctxt->stats->pos	= start;
541 	}
542 
543 	bch2_trans_iter_init(trans, &iter, btree_id, start,
544 			     BTREE_ITER_PREFETCH|
545 			     BTREE_ITER_ALL_SNAPSHOTS);
546 
547 	if (ctxt->rate)
548 		bch2_ratelimit_reset(ctxt->rate);
549 
550 	while (!move_ratelimit(trans, ctxt)) {
551 		bch2_trans_begin(trans);
552 
553 		k = bch2_btree_iter_peek(&iter);
554 		if (!k.k)
555 			break;
556 
557 		ret = bkey_err(k);
558 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
559 			continue;
560 		if (ret)
561 			break;
562 
563 		if (bkey_ge(bkey_start_pos(k.k), end))
564 			break;
565 
566 		if (ctxt->stats)
567 			ctxt->stats->pos = iter.pos;
568 
569 		if (!bkey_extent_is_direct_data(k.k))
570 			goto next_nondata;
571 
572 		ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
573 		if (ret)
574 			continue;
575 
576 		memset(&data_opts, 0, sizeof(data_opts));
577 		if (!pred(c, arg, k, &io_opts, &data_opts))
578 			goto next;
579 
580 		/*
581 		 * The iterator gets unlocked by __bch2_read_extent - need to
582 		 * save a copy of @k elsewhere:
583 		 */
584 		bch2_bkey_buf_reassemble(&sk, c, k);
585 		k = bkey_i_to_s_c(sk.k);
586 
587 		ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
588 					io_opts, btree_id, k, data_opts);
589 		if (ret2) {
590 			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
591 				continue;
592 
593 			if (ret2 == -ENOMEM) {
594 				/* memory allocation failure, wait for some IO to finish */
595 				bch2_move_ctxt_wait_for_io(ctxt, trans);
596 				continue;
597 			}
598 
599 			/* XXX signal failure */
600 			goto next;
601 		}
602 
603 		if (ctxt->rate)
604 			bch2_ratelimit_increment(ctxt->rate, k.k->size);
605 next:
606 		if (ctxt->stats)
607 			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
608 next_nondata:
609 		bch2_btree_iter_advance(&iter);
610 	}
611 
612 	bch2_trans_iter_exit(trans, &iter);
613 	bch2_trans_put(trans);
614 	bch2_bkey_buf_exit(&sk, c);
615 
616 	return ret;
617 }
618 
619 int bch2_move_data(struct bch_fs *c,
620 		   enum btree_id start_btree_id, struct bpos start_pos,
621 		   enum btree_id end_btree_id,   struct bpos end_pos,
622 		   struct bch_ratelimit *rate,
623 		   struct bch_move_stats *stats,
624 		   struct write_point_specifier wp,
625 		   bool wait_on_copygc,
626 		   move_pred_fn pred, void *arg)
627 {
628 	struct moving_context ctxt;
629 	enum btree_id id;
630 	int ret = 0;
631 
632 	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
633 
634 	for (id = start_btree_id;
635 	     id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
636 	     id++) {
637 		stats->btree_id = id;
638 
639 		if (id != BTREE_ID_extents &&
640 		    id != BTREE_ID_reflink)
641 			continue;
642 
643 		if (!bch2_btree_id_root(c, id)->b)
644 			continue;
645 
646 		ret = __bch2_move_data(&ctxt,
647 				       id == start_btree_id ? start_pos : POS_MIN,
648 				       id == end_btree_id   ? end_pos   : POS_MAX,
649 				       pred, arg, id);
650 		if (ret)
651 			break;
652 	}
653 
654 	bch2_moving_ctxt_exit(&ctxt);
655 
656 	return ret;
657 }
658 
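/*
 * Evacuate everything still referencing @bucket, found by walking its
 * backpointers: extents are moved via bch2_move_extent(), btree nodes are
 * rewritten via bch2_btree_node_rewrite().
 */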
659 int __bch2_evacuate_bucket(struct btree_trans *trans,
660 			   struct moving_context *ctxt,
661 			   struct move_bucket_in_flight *bucket_in_flight,
662 			   struct bpos bucket, int gen,
663 			   struct data_update_opts _data_opts)
664 {
665 	struct bch_fs *c = ctxt->c;
666 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
667 	struct btree_iter iter;
668 	struct bkey_buf sk;
669 	struct bch_backpointer bp;
670 	struct bch_alloc_v4 a_convert;
671 	const struct bch_alloc_v4 *a;
672 	struct bkey_s_c k;
673 	struct data_update_opts data_opts;
674 	unsigned dirty_sectors, bucket_size;
675 	u64 fragmentation;
676 	u64 cur_inum = U64_MAX;
677 	struct bpos bp_pos = POS_MIN;
678 	int ret = 0;
679 
680 	trace_bucket_evacuate(c, &bucket);
681 
682 	bch2_bkey_buf_init(&sk);
683 
684 	/*
685 	 * We're not run in a context that handles transaction restarts:
686 	 */
687 	bch2_trans_begin(trans);
688 
689 	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
690 			     bucket, BTREE_ITER_CACHED);
691 	ret = lockrestart_do(trans,
692 			bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
693 	bch2_trans_iter_exit(trans, &iter);
694 
695 	if (ret) {
696 		bch_err_msg(c, ret, "looking up alloc key");
697 		goto err;
698 	}
699 
700 	a = bch2_alloc_to_v4(k, &a_convert);
701 	dirty_sectors = a->dirty_sectors;
702 	bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
703 	fragmentation = a->fragmentation_lru;
704 
705 	ret = bch2_btree_write_buffer_flush(trans);
706 	if (ret) {
707 		bch_err_msg(c, ret, "flushing btree write buffer");
708 		goto err;
709 	}
710 
711 	while (!(ret = move_ratelimit(trans, ctxt))) {
712 		bch2_trans_begin(trans);
713 
714 		ret = bch2_get_next_backpointer(trans, bucket, gen,
715 						&bp_pos, &bp,
716 						BTREE_ITER_CACHED);
717 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
718 			continue;
719 		if (ret)
720 			goto err;
721 		if (bkey_eq(bp_pos, POS_MAX))
722 			break;
723 
724 		if (!bp.level) {
725 			const struct bch_extent_ptr *ptr;
726 			unsigned i = 0;
727 
728 			k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
729 			ret = bkey_err(k);
730 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
731 				continue;
732 			if (ret)
733 				goto err;
734 			if (!k.k)
735 				goto next;
736 
737 			bch2_bkey_buf_reassemble(&sk, c, k);
738 			k = bkey_i_to_s_c(sk.k);
739 
740 			ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
741 			if (ret) {
742 				bch2_trans_iter_exit(trans, &iter);
743 				continue;
744 			}
745 
746 			data_opts = _data_opts;
747 			data_opts.target	= io_opts.background_target;
748 			data_opts.rewrite_ptrs = 0;
749 
750 			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
751 				if (ptr->dev == bucket.inode) {
752 					data_opts.rewrite_ptrs |= 1U << i;
753 					if (ptr->cached) {
754 						bch2_trans_iter_exit(trans, &iter);
755 						goto next;
756 					}
757 				}
758 				i++;
759 			}
760 
761 			ret = bch2_move_extent(trans, &iter, ctxt,
762 					bucket_in_flight,
763 					io_opts, bp.btree_id, k, data_opts);
764 			bch2_trans_iter_exit(trans, &iter);
765 
766 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
767 				continue;
768 			if (ret == -ENOMEM) {
769 				/* memory allocation failure, wait for some IO to finish */
770 				bch2_move_ctxt_wait_for_io(ctxt, trans);
771 				continue;
772 			}
773 			if (ret)
774 				goto err;
775 
776 			if (ctxt->rate)
777 				bch2_ratelimit_increment(ctxt->rate, k.k->size);
778 			if (ctxt->stats)
779 				atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
780 		} else {
781 			struct btree *b;
782 
783 			b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
784 			ret = PTR_ERR_OR_ZERO(b);
785 			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
786 				continue;
787 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
788 				continue;
789 			if (ret)
790 				goto err;
791 			if (!b)
792 				goto next;
793 
794 			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
795 			bch2_trans_iter_exit(trans, &iter);
796 
797 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
798 				continue;
799 			if (ret)
800 				goto err;
801 
802 			if (ctxt->rate)
803 				bch2_ratelimit_increment(ctxt->rate,
804 							 c->opts.btree_node_size >> 9);
805 			if (ctxt->stats) {
806 				atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
807 				atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
808 			}
809 		}
810 next:
811 		bp_pos = bpos_nosnap_successor(bp_pos);
812 	}
813 
814 	trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
815 err:
816 	bch2_bkey_buf_exit(&sk, c);
817 	return ret;
818 }
819 
820 int bch2_evacuate_bucket(struct bch_fs *c,
821 			 struct bpos bucket, int gen,
822 			 struct data_update_opts data_opts,
823 			 struct bch_ratelimit *rate,
824 			 struct bch_move_stats *stats,
825 			 struct write_point_specifier wp,
826 			 bool wait_on_copygc)
827 {
828 	struct btree_trans *trans = bch2_trans_get(c);
829 	struct moving_context ctxt;
830 	int ret;
831 
832 	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
833 	ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
834 	bch2_moving_ctxt_exit(&ctxt);
835 	bch2_trans_put(trans);
836 
837 	return ret;
838 }
839 
840 typedef bool (*move_btree_pred)(struct bch_fs *, void *,
841 				struct btree *, struct bch_io_opts *,
842 				struct data_update_opts *);
843 
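/*
 * Walk btree nodes in the given range, rewriting those that @pred accepts;
 * used for rereplicating and migrating metadata and for rewriting nodes in
 * old formats.
 */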
844 static int bch2_move_btree(struct bch_fs *c,
845 			   enum btree_id start_btree_id, struct bpos start_pos,
846 			   enum btree_id end_btree_id,   struct bpos end_pos,
847 			   move_btree_pred pred, void *arg,
848 			   struct bch_move_stats *stats)
849 {
850 	bool kthread = (current->flags & PF_KTHREAD) != 0;
851 	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
852 	struct btree_trans *trans = bch2_trans_get(c);
853 	struct btree_iter iter;
854 	struct btree *b;
855 	enum btree_id id;
856 	struct data_update_opts data_opts;
857 	int ret = 0;
858 
859 	progress_list_add(c, stats);
860 
861 	stats->data_type = BCH_DATA_btree;
862 
863 	for (id = start_btree_id;
864 	     id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
865 	     id++) {
866 		stats->btree_id = id;
867 
868 		if (!bch2_btree_id_root(c, id)->b)
869 			continue;
870 
871 		bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
872 					  BTREE_ITER_PREFETCH);
873 retry:
874 		ret = 0;
875 		while (bch2_trans_begin(trans),
876 		       (b = bch2_btree_iter_peek_node(&iter)) &&
877 		       !(ret = PTR_ERR_OR_ZERO(b))) {
878 			if (kthread && kthread_should_stop())
879 				break;
880 
881 			if ((cmp_int(id, end_btree_id) ?:
882 			     bpos_cmp(b->key.k.p, end_pos)) > 0)
883 				break;
884 
885 			stats->pos = iter.pos;
886 
887 			if (!pred(c, arg, b, &io_opts, &data_opts))
888 				goto next;
889 
890 			ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
891 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
892 				continue;
893 			if (ret)
894 				break;
895 next:
896 			bch2_btree_iter_next_node(&iter);
897 		}
898 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
899 			goto retry;
900 
901 		bch2_trans_iter_exit(trans, &iter);
902 
903 		if (kthread && kthread_should_stop())
904 			break;
905 	}
906 
907 	bch2_trans_put(trans);
908 
909 	if (ret)
910 		bch_err_fn(c, ret);
911 
912 	bch2_btree_interior_updates_flush(c);
913 
914 	progress_list_del(c, stats);
915 	return ret;
916 }
917 
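/*
 * Accept keys whose durability is nonzero but below the configured number
 * of replicas, requesting enough extra replicas to make up the difference.
 */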
918 static bool rereplicate_pred(struct bch_fs *c, void *arg,
919 			     struct bkey_s_c k,
920 			     struct bch_io_opts *io_opts,
921 			     struct data_update_opts *data_opts)
922 {
923 	unsigned nr_good = bch2_bkey_durability(c, k);
924 	unsigned replicas = bkey_is_btree_ptr(k.k)
925 		? c->opts.metadata_replicas
926 		: io_opts->data_replicas;
927 
928 	if (!nr_good || nr_good >= replicas)
929 		return false;
930 
931 	data_opts->target		= 0;
932 	data_opts->extra_replicas	= replicas - nr_good;
933 	data_opts->btree_insert_flags	= 0;
934 	return true;
935 }
936 
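/*
 * Accept keys with pointers on the device being migrated from, marking
 * those pointers for rewrite.
 */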
937 static bool migrate_pred(struct bch_fs *c, void *arg,
938 			 struct bkey_s_c k,
939 			 struct bch_io_opts *io_opts,
940 			 struct data_update_opts *data_opts)
941 {
942 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
943 	const struct bch_extent_ptr *ptr;
944 	struct bch_ioctl_data *op = arg;
945 	unsigned i = 0;
946 
947 	data_opts->rewrite_ptrs		= 0;
948 	data_opts->target		= 0;
949 	data_opts->extra_replicas	= 0;
950 	data_opts->btree_insert_flags	= 0;
951 
952 	bkey_for_each_ptr(ptrs, ptr) {
953 		if (ptr->dev == op->migrate.dev)
954 			data_opts->rewrite_ptrs |= 1U << i;
955 		i++;
956 	}
957 
958 	return data_opts->rewrite_ptrs != 0;
959 }
960 
961 static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
962 				   struct btree *b,
963 				   struct bch_io_opts *io_opts,
964 				   struct data_update_opts *data_opts)
965 {
966 	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
967 }
968 
969 static bool migrate_btree_pred(struct bch_fs *c, void *arg,
970 			       struct btree *b,
971 			       struct bch_io_opts *io_opts,
972 			       struct data_update_opts *data_opts)
973 {
974 	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
975 }
976 
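/*
 * Returns true if this node's key format can encode field values that
 * overflow the current unpacked key format, meaning the node needs to be
 * rewritten with a sane format.
 */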
977 static bool bformat_needs_redo(struct bkey_format *f)
978 {
979 	unsigned i;
980 
981 	for (i = 0; i < f->nr_fields; i++) {
982 		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
983 		u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
984 		u64 field_offset = le64_to_cpu(f->field_offset[i]);
985 
986 		if (f->bits_per_field[i] > unpacked_bits)
987 			return true;
988 
989 		if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
990 			return true;
991 
992 		if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
993 		     unpacked_mask) <
994 		    field_offset)
995 			return true;
996 	}
997 
998 	return false;
999 }
1000 
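/*
 * Rewrite btree nodes written by an older on-disk version, flagged as
 * needing a rewrite, or carrying a key format that can overflow.
 */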
1001 static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
1002 				   struct btree *b,
1003 				   struct bch_io_opts *io_opts,
1004 				   struct data_update_opts *data_opts)
1005 {
1006 	if (b->version_ondisk != c->sb.version ||
1007 	    btree_node_need_rewrite(b) ||
1008 	    bformat_needs_redo(&b->format)) {
1009 		data_opts->target		= 0;
1010 		data_opts->extra_replicas	= 0;
1011 		data_opts->btree_insert_flags	= 0;
1012 		return true;
1013 	}
1014 
1015 	return false;
1016 }
1017 
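/*
 * Rewrite all btree nodes in an old on-disk version or problematic key
 * format, then record completion in the superblock compat bits.
 */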
1018 int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
1019 {
1020 	int ret;
1021 
1022 	ret = bch2_move_btree(c,
1023 			      0,		POS_MIN,
1024 			      BTREE_ID_NR,	SPOS_MAX,
1025 			      rewrite_old_nodes_pred, c, stats);
1026 	if (!ret) {
1027 		mutex_lock(&c->sb_lock);
1028 		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
1029 		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
1030 		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
1031 		bch2_write_super(c);
1032 		mutex_unlock(&c->sb_lock);
1033 	}
1034 
1035 	if (ret)
1036 		bch_err_fn(c, ret);
1037 	return ret;
1038 }
1039 
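/*
 * Run a data job described by struct bch_ioctl_data: rereplicate, migrate
 * off a device, or rewrite old btree nodes.
 */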
1040 int bch2_data_job(struct bch_fs *c,
1041 		  struct bch_move_stats *stats,
1042 		  struct bch_ioctl_data op)
1043 {
1044 	int ret = 0;
1045 
1046 	switch (op.op) {
1047 	case BCH_DATA_OP_REREPLICATE:
1048 		bch2_move_stats_init(stats, "rereplicate");
1049 		stats->data_type = BCH_DATA_journal;
1050 		ret = bch2_journal_flush_device_pins(&c->journal, -1);
1051 
1052 		ret = bch2_move_btree(c,
1053 				      op.start_btree,	op.start_pos,
1054 				      op.end_btree,	op.end_pos,
1055 				      rereplicate_btree_pred, c, stats) ?: ret;
1056 		ret = bch2_replicas_gc2(c) ?: ret;
1057 
1058 		ret = bch2_move_data(c,
1059 				     op.start_btree,	op.start_pos,
1060 				     op.end_btree,	op.end_pos,
1061 				     NULL,
1062 				     stats,
1063 				     writepoint_hashed((unsigned long) current),
1064 				     true,
1065 				     rereplicate_pred, c) ?: ret;
1066 		ret = bch2_replicas_gc2(c) ?: ret;
1067 		break;
1068 	case BCH_DATA_OP_MIGRATE:
1069 		if (op.migrate.dev >= c->sb.nr_devices)
1070 			return -EINVAL;
1071 
1072 		bch2_move_stats_init(stats, "migrate");
1073 		stats->data_type = BCH_DATA_journal;
1074 		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
1075 
1076 		ret = bch2_move_btree(c,
1077 				      op.start_btree,	op.start_pos,
1078 				      op.end_btree,	op.end_pos,
1079 				      migrate_btree_pred, &op, stats) ?: ret;
1080 		ret = bch2_replicas_gc2(c) ?: ret;
1081 
1082 		ret = bch2_move_data(c,
1083 				     op.start_btree,	op.start_pos,
1084 				     op.end_btree,	op.end_pos,
1085 				     NULL,
1086 				     stats,
1087 				     writepoint_hashed((unsigned long) current),
1088 				     true,
1089 				     migrate_pred, &op) ?: ret;
1090 		ret = bch2_replicas_gc2(c) ?: ret;
1091 		break;
1092 	case BCH_DATA_OP_REWRITE_OLD_NODES:
1093 		bch2_move_stats_init(stats, "rewrite_old_nodes");
1094 		ret = bch2_scan_old_btree_nodes(c, stats);
1095 		break;
1096 	default:
1097 		ret = -EINVAL;
1098 	}
1099 
1100 	return ret;
1101 }
1102 
1103 static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
1104 {
1105 	struct bch_move_stats *stats = ctxt->stats;
1106 	struct moving_io *io;
1107 
1108 	prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
1109 	prt_newline(out);
1110 
1111 	prt_printf(out, " data type %s btree_id %s position: ",
1112 		   bch2_data_types[stats->data_type],
1113 		   bch2_btree_ids[stats->btree_id]);
1114 	bch2_bpos_to_text(out, stats->pos);
1115 	prt_newline(out);
1116 	printbuf_indent_add(out, 2);
1117 
1118 	prt_printf(out, "reads: ios %u/%u sectors %u/%u",
1119 		   atomic_read(&ctxt->read_ios),
1120 		   c->opts.move_ios_in_flight,
1121 		   atomic_read(&ctxt->read_sectors),
1122 		   c->opts.move_bytes_in_flight >> 9);
1123 	prt_newline(out);
1124 
1125 	prt_printf(out, "writes: ios %u/%u sectors %u/%u",
1126 		   atomic_read(&ctxt->write_ios),
1127 		   c->opts.move_ios_in_flight,
1128 		   atomic_read(&ctxt->write_sectors),
1129 		   c->opts.move_bytes_in_flight >> 9);
1130 	prt_newline(out);
1131 
1132 	printbuf_indent_add(out, 2);
1133 
1134 	mutex_lock(&ctxt->lock);
1135 	list_for_each_entry(io, &ctxt->ios, io_list)
1136 		bch2_write_op_to_text(out, &io->write.op);
1137 	mutex_unlock(&ctxt->lock);
1138 
1139 	printbuf_indent_sub(out, 4);
1140 }
1141 
1142 void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
1143 {
1144 	struct moving_context *ctxt;
1145 
1146 	mutex_lock(&c->moving_context_lock);
1147 	list_for_each_entry(ctxt, &c->moving_context_list, list)
1148 		bch2_moving_ctxt_to_text(out, c, ctxt);
1149 	mutex_unlock(&c->moving_context_lock);
1150 }
1151 
1152 void bch2_fs_move_init(struct bch_fs *c)
1153 {
1154 	INIT_LIST_HEAD(&c->moving_context_list);
1155 	mutex_init(&c->moving_context_lock);
1156 
1157 	INIT_LIST_HEAD(&c->data_progress_list);
1158 	mutex_init(&c->data_progress_lock);
1159 }
1160