xref: /linux/fs/bcachefs/rebalance.c (revision ff0905bbf991f4337b5ebc19c0d43525ebb0d96b)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "alloc_foreground.h"
6 #include "btree_iter.h"
7 #include "btree_update.h"
8 #include "btree_write_buffer.h"
9 #include "buckets.h"
10 #include "clock.h"
11 #include "compress.h"
12 #include "disk_groups.h"
13 #include "errcode.h"
14 #include "error.h"
15 #include "inode.h"
16 #include "io_write.h"
17 #include "move.h"
18 #include "rebalance.h"
19 #include "subvolume.h"
20 #include "super-io.h"
21 #include "trace.h"
22 
23 #include <linux/freezer.h>
24 #include <linux/kthread.h>
25 #include <linux/sched/cputime.h>
26 
27 /* bch_extent_rebalance: */
28 
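/*
 * Find the bch_extent_rebalance entry in an extent's entry list: it holds
 * the background compression/target options that apply to this extent.
 * Returns NULL if the extent has no such entry.
 */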
29 static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
30 {
31 	const union bch_extent_entry *entry;
32 
33 	bkey_extent_entry_for_each(ptrs, entry)
34 		if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
35 			return &entry->rebalance;
36 
37 	return NULL;
38 }
39 
40 static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
41 {
42 	return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
43 }
44 
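/*
 * Returns a bitmask (one bit per pointer) of pointers that need rewriting
 * because they aren't compressed with the requested background_compression
 * type; 0 if the extent is incompressible or has unwritten pointers.
 */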
45 static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
46 					   struct bch_io_opts *opts,
47 					   struct bkey_s_c k,
48 					   struct bkey_ptrs_c ptrs)
49 {
50 	if (!opts->background_compression)
51 		return 0;
52 
53 	unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
54 	const union bch_extent_entry *entry;
55 	struct extent_ptr_decoded p;
56 	unsigned ptr_bit = 1;
57 	unsigned rewrite_ptrs = 0;
58 
59 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
60 		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
61 		    p.ptr.unwritten)
62 			return 0;
63 
64 		if (!p.ptr.cached && p.crc.compression_type != compression_type)
65 			rewrite_ptrs |= ptr_bit;
66 		ptr_bit <<= 1;
67 	}
68 
69 	return rewrite_ptrs;
70 }
71 
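/*
 * Returns a bitmask of non-cached pointers that live on devices outside
 * background_target and therefore need to be moved.
 */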
72 static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
73 				       struct bch_io_opts *opts,
74 				       struct bkey_ptrs_c ptrs)
75 {
76 	if (!opts->background_target ||
77 	    !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
78 		return 0;
79 
80 	unsigned ptr_bit = 1;
81 	unsigned rewrite_ptrs = 0;
82 
83 	guard(rcu)();
84 	bkey_for_each_ptr(ptrs, ptr) {
85 		if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
86 			rewrite_ptrs |= ptr_bit;
87 		ptr_bit <<= 1;
88 	}
89 
90 	return rewrite_ptrs;
91 }
92 
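/*
 * Combined mask of pointers that need compressing and/or moving; poisoned
 * extents are skipped entirely.
 */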
93 static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
94 					      struct bch_io_opts *opts,
95 					      struct bkey_s_c k)
96 {
97 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
98 
99 	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
100 		return 0;
101 
102 	return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
103 		bch2_bkey_ptrs_need_move(c, opts, ptrs);
104 }
105 
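/*
 * Number of compressed sectors in this extent that still need to be
 * recompressed or moved, according to the options stored in its
 * bch_extent_rebalance entry; 0 if there's no entry or the extent is
 * poisoned.
 */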
106 u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
107 {
108 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
109 
110 	const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
111 	if (!opts)
112 		return 0;
113 
114 	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
115 		return 0;
116 
117 	const union bch_extent_entry *entry;
118 	struct extent_ptr_decoded p;
119 	u64 sectors = 0;
120 
121 	if (opts->background_compression) {
122 		unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
123 
124 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
125 			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
126 			    p.ptr.unwritten) {
127 				sectors = 0;
128 				goto incompressible;
129 			}
130 
131 			if (!p.ptr.cached && p.crc.compression_type != compression_type)
132 				sectors += p.crc.compressed_size;
133 		}
134 	}
135 incompressible:
136 	if (opts->background_target) {
137 		guard(rcu)();
138 		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
139 			if (!p.ptr.cached &&
140 			    !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
141 				sectors += p.crc.compressed_size;
142 	}
143 
144 	return sectors;
145 }
146 
147 static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
148 					     struct bkey_s_c k)
149 {
150 	if (!bkey_extent_is_direct_data(k.k))
151 		return 0;
152 
153 	const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
154 
155 	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
156 		struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
157 		return old == NULL || memcmp(old, &new, sizeof(new));
158 	} else {
159 		return old != NULL;
160 	}
161 }
162 
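/*
 * Called on a key about to be written: add or update the
 * bch_extent_rebalance entry if background work will be needed, otherwise
 * drop it. The entry is appended in place, so the key's buffer must have
 * room for it.
 */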
163 int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
164 				  struct bkey_i *_k)
165 {
166 	if (!bkey_extent_is_direct_data(&_k->k))
167 		return 0;
168 
169 	struct bkey_s k = bkey_i_to_s(_k);
170 	struct bch_extent_rebalance *old =
171 		(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
172 
173 	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
174 		if (!old) {
175 			old = bkey_val_end(k);
176 			k.k->u64s += sizeof(*old) / sizeof(u64);
177 		}
178 
179 		*old = io_opts_to_rebalance_opts(c, opts);
180 	} else {
181 		if (old)
182 			extent_entry_drop(k, (union bch_extent_entry *) old);
183 	}
184 
185 	return 0;
186 }
187 
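/*
 * For indirect extents, rebalance options stored in the extent override
 * those from the inode; apply them to @io_opts, then rewrite the extent's
 * bch_extent_rebalance entry if it no longer matches the effective options.
 */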
188 int bch2_get_update_rebalance_opts(struct btree_trans *trans,
189 				   struct bch_io_opts *io_opts,
190 				   struct btree_iter *iter,
191 				   struct bkey_s_c k)
192 {
193 	BUG_ON(iter->flags & BTREE_ITER_is_extents);
194 	BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);
195 
196 	const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
197 		? bch2_bkey_rebalance_opts(k) : NULL;
198 	if (r) {
199 #define x(_name)							\
200 		if (r->_name##_from_inode) {				\
201 			io_opts->_name = r->_name;			\
202 			io_opts->_name##_from_inode = true;		\
203 		}
204 		BCH_REBALANCE_OPTS()
205 #undef x
206 	}
207 
208 	if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
209 		return 0;
210 
211 	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
212 	int ret = PTR_ERR_OR_ZERO(n);
213 	if (ret)
214 		return ret;
215 
216 	bkey_reassemble(n, k);
217 
218 	/* On successful transaction commit, @k was invalidated: */
219 
220 	return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
221 		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
222 		bch2_trans_commit(trans, NULL, NULL, 0) ?:
223 		-BCH_ERR_transaction_restart_nested;
224 }
225 
226 #define REBALANCE_WORK_SCAN_OFFSET	(U64_MAX - 1)
227 
228 static const char * const bch2_rebalance_state_strs[] = {
229 #define x(t) #t,
230 	BCH_REBALANCE_STATES()
231 	NULL
232 #undef x
233 };
234 
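/*
 * Request a rebalance scan of @inum (0 means the whole filesystem) by
 * bumping a cookie at (inum, REBALANCE_WORK_SCAN_OFFSET) in the
 * rebalance_work btree; the rebalance thread picks it up from there.
 */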
235 int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
236 {
237 	struct btree_iter iter;
238 	struct bkey_s_c k;
239 	struct bkey_i_cookie *cookie;
240 	u64 v;
241 	int ret;
242 
243 	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
244 			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
245 			     BTREE_ITER_intent);
246 	k = bch2_btree_iter_peek_slot(trans, &iter);
247 	ret = bkey_err(k);
248 	if (ret)
249 		goto err;
250 
251 	v = k.k->type == KEY_TYPE_cookie
252 		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
253 		: 0;
254 
255 	cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
256 	ret = PTR_ERR_OR_ZERO(cookie);
257 	if (ret)
258 		goto err;
259 
260 	bkey_cookie_init(&cookie->k_i);
261 	cookie->k.p = iter.pos;
262 	cookie->v.cookie = cpu_to_le64(v + 1);
263 
264 	ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
265 err:
266 	bch2_trans_iter_exit(trans, &iter);
267 	return ret;
268 }
269 
270 int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
271 {
272 	int ret = bch2_trans_commit_do(c, NULL, NULL,
273 				       BCH_TRANS_COMMIT_no_enospc,
274 			    bch2_set_rebalance_needs_scan_trans(trans, inum));
275 	bch2_rebalance_wakeup(c);
276 	return ret;
277 }
278 
279 int bch2_set_fs_needs_rebalance(struct bch_fs *c)
280 {
281 	return bch2_set_rebalance_needs_scan(c, 0);
282 }
283 
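/*
 * Drop a scan request, but only if the cookie still matches the value read
 * when the scan started; a new request may have arrived in the meantime.
 */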
284 static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
285 {
286 	struct btree_iter iter;
287 	struct bkey_s_c k;
288 	u64 v;
289 	int ret;
290 
291 	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
292 			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
293 			     BTREE_ITER_intent);
294 	k = bch2_btree_iter_peek_slot(trans, &iter);
295 	ret = bkey_err(k);
296 	if (ret)
297 		goto err;
298 
299 	v = k.k->type == KEY_TYPE_cookie
300 		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
301 		: 0;
302 
303 	if (v == cookie)
304 		ret = bch2_btree_delete_at(trans, &iter, 0);
305 err:
306 	bch2_trans_iter_exit(trans, &iter);
307 	return ret;
308 }
309 
310 static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
311 					    struct btree_iter *work_iter)
312 {
313 	return !kthread_should_stop()
314 		? bch2_btree_iter_peek(trans, work_iter)
315 		: bkey_s_c_null;
316 }
317 
318 static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
319 					   struct btree_iter *iter,
320 					   struct bkey_s_c k)
321 {
322 	if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
323 		return 0;
324 
325 	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
326 	int ret = PTR_ERR_OR_ZERO(n);
327 	if (ret)
328 		return ret;
329 
330 	extent_entry_drop(bkey_i_to_s(n),
331 			  (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
332 	return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
333 }
334 
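/*
 * Look up the extent that a rebalance_work entry points at (extents or
 * reflink btree, depending on work_pos.inode) and work out what to do with
 * it, filling in @io_opts and @data_opts; if nothing needs rewriting any
 * more, clear the extent's rebalance entry and return bkey_s_c_null.
 */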
335 static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
336 			struct bpos work_pos,
337 			struct btree_iter *extent_iter,
338 			struct bch_io_opts *io_opts,
339 			struct data_update_opts *data_opts)
340 {
341 	struct bch_fs *c = trans->c;
342 
343 	bch2_trans_iter_exit(trans, extent_iter);
344 	bch2_trans_iter_init(trans, extent_iter,
345 			     work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
346 			     work_pos,
347 			     BTREE_ITER_all_snapshots);
348 	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
349 	if (bkey_err(k))
350 		return k;
351 
352 	int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
353 	if (ret)
354 		return bkey_s_c_err(ret);
355 
356 	memset(data_opts, 0, sizeof(*data_opts));
357 	data_opts->rewrite_ptrs		= bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
358 	data_opts->target		= io_opts->background_target;
359 	data_opts->write_flags		|= BCH_WRITE_only_specified_devs;
360 
361 	if (!data_opts->rewrite_ptrs) {
362 		/*
363 		 * Is a device we would want to write to offline? Have the
364 		 * devices in the target changed?
365 		 *
366 		 * We'll now need a full scan before this extent is picked up
367 		 * again:
368 		 */
369 		int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
370 		if (ret)
371 			return bkey_s_c_err(ret);
372 		return bkey_s_c_null;
373 	}
374 
375 	if (trace_rebalance_extent_enabled()) {
376 		struct printbuf buf = PRINTBUF;
377 
378 		bch2_bkey_val_to_text(&buf, c, k);
379 		prt_newline(&buf);
380 
381 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
382 
383 		unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
384 		if (p) {
385 			prt_str(&buf, "compression=");
386 			bch2_compression_opt_to_text(&buf, io_opts->background_compression);
387 			prt_str(&buf, " ");
388 			bch2_prt_u64_base2(&buf, p);
389 			prt_newline(&buf);
390 		}
391 
392 		p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
393 		if (p) {
394 			prt_str(&buf, "move=");
395 			bch2_target_to_text(&buf, c, io_opts->background_target);
396 			prt_str(&buf, " ");
397 			bch2_prt_u64_base2(&buf, p);
398 			prt_newline(&buf);
399 		}
400 
401 		trace_rebalance_extent(c, buf.buf);
402 		printbuf_exit(&buf);
403 	}
404 
405 	return k;
406 }
407 
408 noinline_for_stack
409 static int do_rebalance_extent(struct moving_context *ctxt,
410 			       struct bpos work_pos,
411 			       struct btree_iter *extent_iter)
412 {
413 	struct btree_trans *trans = ctxt->trans;
414 	struct bch_fs *c = trans->c;
415 	struct bch_fs_rebalance *r = &trans->c->rebalance;
416 	struct data_update_opts data_opts;
417 	struct bch_io_opts io_opts;
418 	struct bkey_s_c k;
419 	struct bkey_buf sk;
420 	int ret;
421 
422 	ctxt->stats = &r->work_stats;
423 	r->state = BCH_REBALANCE_working;
424 
425 	bch2_bkey_buf_init(&sk);
426 
427 	ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
428 				extent_iter, &io_opts, &data_opts));
429 	if (ret || !k.k)
430 		goto out;
431 
432 	atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
433 
434 	/*
435 	 * The iterator gets unlocked by __bch2_read_extent - need to
436 	 * save a copy of @k elsewhere:
437 	 */
438 	bch2_bkey_buf_reassemble(&sk, c, k);
439 	k = bkey_i_to_s_c(sk.k);
440 
441 	ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
442 	if (ret) {
443 		if (bch2_err_matches(ret, ENOMEM)) {
444 			/* memory allocation failure, wait for some IO to finish */
445 			bch2_move_ctxt_wait_for_io(ctxt);
446 			ret = bch_err_throw(c, transaction_restart_nested);
447 		}
448 
449 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
450 			goto out;
451 
452 		/* skip it and continue, XXX signal failure */
453 		ret = 0;
454 	}
455 out:
456 	bch2_bkey_buf_exit(&sk, c);
457 	return ret;
458 }
459 
460 static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
461 {
462 	struct btree_trans *trans = ctxt->trans;
463 	struct bch_fs *c = trans->c;
464 	struct bch_fs_rebalance *r = &trans->c->rebalance;
465 
466 	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
467 	ctxt->stats = &r->scan_stats;
468 
469 	if (!inum) {
470 		r->scan_start	= BBPOS_MIN;
471 		r->scan_end	= BBPOS_MAX;
472 	} else {
473 		r->scan_start	= BBPOS(BTREE_ID_extents, POS(inum, 0));
474 		r->scan_end	= BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
475 	}
476 
477 	r->state = BCH_REBALANCE_scanning;
478 
479 	struct per_snapshot_io_opts snapshot_io_opts;
480 	per_snapshot_io_opts_init(&snapshot_io_opts, c);
481 
482 	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
483 				      r->scan_start.pos, r->scan_end.pos,
484 				      BTREE_ITER_all_snapshots|
485 				      BTREE_ITER_not_extents|
486 				      BTREE_ITER_prefetch, k, ({
487 		ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
488 
489 		struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
490 					&snapshot_io_opts, iter.pos, &iter, k);
491 		PTR_ERR_OR_ZERO(io_opts);
492 	})) ?:
493 	commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
494 		  bch2_clear_rebalance_needs_scan(trans, inum, cookie));
495 
496 	per_snapshot_io_opts_exit(&snapshot_io_opts);
497 	bch2_move_stats_exit(&r->scan_stats, trans->c);
498 
499 	/*
500 	 * Ensure that the rebalance_work entries we created are seen by the
501 	 * next iteration of do_rebalance(), so we don't end up stuck in
502 	 * rebalance_wait():
503 	 */
504 	atomic64_inc(&r->scan_stats.sectors_seen);
505 	bch2_btree_write_buffer_flush_sync(trans);
506 
507 	return ret;
508 }
509 
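/*
 * No work left: sleep on the write IO clock until roughly 1/64th of the
 * smallest rw member device's capacity has been written.
 */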
510 static void rebalance_wait(struct bch_fs *c)
511 {
512 	struct bch_fs_rebalance *r = &c->rebalance;
513 	struct io_clock *clock = &c->io_clock[WRITE];
514 	u64 now = atomic64_read(&clock->now);
515 	u64 min_member_capacity = bch2_min_rw_member_capacity(c);
516 
517 	if (min_member_capacity == U64_MAX)
518 		min_member_capacity = 128 * 2048;
519 
520 	r->wait_iotime_end		= now + (min_member_capacity >> 6);
521 
522 	if (r->state != BCH_REBALANCE_waiting) {
523 		r->wait_iotime_start	= now;
524 		r->wait_wallclock_start	= ktime_get_real_ns();
525 		r->state		= BCH_REBALANCE_waiting;
526 	}
527 
528 	bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
529 }
530 
531 static bool bch2_rebalance_enabled(struct bch_fs *c)
532 {
533 	return c->opts.rebalance_enabled &&
534 		!(c->opts.rebalance_on_ac_only &&
535 		  c->rebalance.on_battery);
536 }
537 
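/*
 * Main work loop: walk the rebalance_work btree, handling scan cookies via
 * do_rebalance_scan() and extent entries via do_rebalance_extent(); when no
 * work is found, wait in rebalance_wait().
 */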
538 static int do_rebalance(struct moving_context *ctxt)
539 {
540 	struct btree_trans *trans = ctxt->trans;
541 	struct bch_fs *c = trans->c;
542 	struct bch_fs_rebalance *r = &c->rebalance;
543 	struct btree_iter rebalance_work_iter, extent_iter = {};
544 	struct bkey_s_c k;
545 	u32 kick = r->kick;
546 	int ret = 0;
547 
548 	bch2_trans_begin(trans);
549 
550 	bch2_move_stats_init(&r->work_stats, "rebalance_work");
551 	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
552 
553 	bch2_trans_iter_init(trans, &rebalance_work_iter,
554 			     BTREE_ID_rebalance_work, POS_MIN,
555 			     BTREE_ITER_all_snapshots);
556 
557 	while (!bch2_move_ratelimit(ctxt)) {
558 		if (!bch2_rebalance_enabled(c)) {
559 			bch2_moving_ctxt_flush_all(ctxt);
560 			kthread_wait_freezable(bch2_rebalance_enabled(c) ||
561 					       kthread_should_stop());
562 		}
563 
564 		if (kthread_should_stop())
565 			break;
566 
567 		bch2_trans_begin(trans);
568 
569 		ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
570 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
571 			continue;
572 		if (ret || !k.k)
573 			break;
574 
575 		ret = k.k->type == KEY_TYPE_cookie
576 			? do_rebalance_scan(ctxt, k.k->p.inode,
577 					    le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
578 			: do_rebalance_extent(ctxt, k.k->p, &extent_iter);
579 
580 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
581 			continue;
582 		if (ret)
583 			break;
584 
585 		bch2_btree_iter_advance(trans, &rebalance_work_iter);
586 	}
587 
588 	bch2_trans_iter_exit(trans, &extent_iter);
589 	bch2_trans_iter_exit(trans, &rebalance_work_iter);
590 	bch2_move_stats_exit(&r->scan_stats, c);
591 
592 	if (!ret &&
593 	    !kthread_should_stop() &&
594 	    !atomic64_read(&r->work_stats.sectors_seen) &&
595 	    !atomic64_read(&r->scan_stats.sectors_seen) &&
596 	    kick == r->kick) {
597 		bch2_moving_ctxt_flush_all(ctxt);
598 		bch2_trans_unlock_long(trans);
599 		rebalance_wait(c);
600 	}
601 
602 	if (!bch2_err_matches(ret, EROFS))
603 		bch_err_fn(c, ret);
604 	return ret;
605 }
606 
607 static int bch2_rebalance_thread(void *arg)
608 {
609 	struct bch_fs *c = arg;
610 	struct bch_fs_rebalance *r = &c->rebalance;
611 	struct moving_context ctxt;
612 
613 	set_freezable();
614 
615 	/*
616 	 * Data move operations can't run until after check_snapshots has
617 	 * completed, and bch2_snapshot_is_ancestor() is available.
618 	 */
619 	kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
620 			       kthread_should_stop());
621 
622 	bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
623 			      writepoint_ptr(&c->rebalance_write_point),
624 			      true);
625 
626 	while (!kthread_should_stop() && !do_rebalance(&ctxt))
627 		;
628 
629 	bch2_moving_ctxt_exit(&ctxt);
630 
631 	return 0;
632 }
633 
634 void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
635 {
636 	printbuf_tabstop_push(out, 32);
637 
638 	struct bch_fs_rebalance *r = &c->rebalance;
639 
640 	/* print pending work */
641 	struct disk_accounting_pos acc;
642 	disk_accounting_key_init(acc, rebalance_work);
643 	u64 v;
644 	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
645 
646 	prt_printf(out, "pending work:\t");
647 	prt_human_readable_u64(out, v << 9);
648 	prt_printf(out, "\n\n");
649 
650 	prt_str(out, bch2_rebalance_state_strs[r->state]);
651 	prt_newline(out);
652 	printbuf_indent_add(out, 2);
653 
654 	switch (r->state) {
655 	case BCH_REBALANCE_waiting: {
656 		u64 now = atomic64_read(&c->io_clock[WRITE].now);
657 
658 		prt_printf(out, "io wait duration:\t");
659 		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
660 		prt_newline(out);
661 
662 		prt_printf(out, "io wait remaining:\t");
663 		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
664 		prt_newline(out);
665 
666 		prt_printf(out, "duration waited:\t");
667 		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
668 		prt_newline(out);
669 		break;
670 	}
671 	case BCH_REBALANCE_working:
672 		bch2_move_stats_to_text(out, &r->work_stats);
673 		break;
674 	case BCH_REBALANCE_scanning:
675 		bch2_move_stats_to_text(out, &r->scan_stats);
676 		break;
677 	}
678 	prt_newline(out);
679 
680 	struct task_struct *t;
681 	scoped_guard(rcu) {
682 		t = rcu_dereference(c->rebalance.thread);
683 		if (t)
684 			get_task_struct(t);
685 	}
686 
687 	if (t) {
688 		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
689 		put_task_struct(t);
690 	}
691 
692 	printbuf_indent_sub(out, 2);
693 }
694 
695 void bch2_rebalance_stop(struct bch_fs *c)
696 {
697 	struct task_struct *p;
698 
699 	c->rebalance.pd.rate.rate = UINT_MAX;
700 	bch2_ratelimit_reset(&c->rebalance.pd.rate);
701 
702 	p = rcu_dereference_protected(c->rebalance.thread, 1);
703 	c->rebalance.thread = NULL;
704 
705 	if (p) {
706 		/* for synchronizing with bch2_rebalance_wakeup() */
707 		synchronize_rcu();
708 
709 		kthread_stop(p);
710 		put_task_struct(p);
711 	}
712 }
713 
714 int bch2_rebalance_start(struct bch_fs *c)
715 {
716 	struct task_struct *p;
717 	int ret;
718 
719 	if (c->rebalance.thread)
720 		return 0;
721 
722 	if (c->opts.nochanges)
723 		return 0;
724 
725 	p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
726 	ret = PTR_ERR_OR_ZERO(p);
727 	bch_err_msg(c, ret, "creating rebalance thread");
728 	if (ret)
729 		return ret;
730 
731 	get_task_struct(p);
732 	rcu_assign_pointer(c->rebalance.thread, p);
733 	wake_up_process(p);
734 	return 0;
735 }
736 
737 #ifdef CONFIG_POWER_SUPPLY
738 #include <linux/power_supply.h>
739 
740 static int bch2_rebalance_power_notifier(struct notifier_block *nb,
741 					 unsigned long event, void *data)
742 {
743 	struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);
744 
745 	c->rebalance.on_battery = !power_supply_is_system_supplied();
746 	bch2_rebalance_wakeup(c);
747 	return NOTIFY_OK;
748 }
749 #endif
750 
751 void bch2_fs_rebalance_exit(struct bch_fs *c)
752 {
753 #ifdef CONFIG_POWER_SUPPLY
754 	power_supply_unreg_notifier(&c->rebalance.power_notifier);
755 #endif
756 }
757 
758 int bch2_fs_rebalance_init(struct bch_fs *c)
759 {
760 	struct bch_fs_rebalance *r = &c->rebalance;
761 
762 	bch2_pd_controller_init(&r->pd);
763 
764 #ifdef CONFIG_POWER_SUPPLY
765 	r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
766 	int ret = power_supply_reg_notifier(&r->power_notifier);
767 	if (ret)
768 		return ret;
769 
770 	r->on_battery = !power_supply_is_system_supplied();
771 #endif
772 	return 0;
773 }
774 
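/*
 * fsck helper: compare the next extent (reflink btree first, then extents)
 * against the next rebalance_work entry and repair entries that are
 * incorrectly set or unset.
 */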
775 static int check_rebalance_work_one(struct btree_trans *trans,
776 				    struct btree_iter *extent_iter,
777 				    struct btree_iter *rebalance_iter,
778 				    struct bkey_buf *last_flushed)
779 {
780 	struct bch_fs *c = trans->c;
781 	struct bkey_s_c extent_k, rebalance_k;
782 	struct printbuf buf = PRINTBUF;
783 
784 	int ret = bkey_err(extent_k	= bch2_btree_iter_peek(trans, extent_iter)) ?:
785 		  bkey_err(rebalance_k	= bch2_btree_iter_peek(trans, rebalance_iter));
786 	if (ret)
787 		return ret;
788 
789 	if (!extent_k.k &&
790 	    extent_iter->btree_id == BTREE_ID_reflink &&
791 	    (!rebalance_k.k ||
792 	     rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
793 		bch2_trans_iter_exit(trans, extent_iter);
794 		bch2_trans_iter_init(trans, extent_iter,
795 				     BTREE_ID_extents, POS_MIN,
796 				     BTREE_ITER_prefetch|
797 				     BTREE_ITER_all_snapshots);
798 		return bch_err_throw(c, transaction_restart_nested);
799 	}
800 
801 	if (!extent_k.k && !rebalance_k.k)
802 		return 1;
803 
804 	int cmp = bpos_cmp(extent_k.k	 ? extent_k.k->p    : SPOS_MAX,
805 			   rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);
806 
807 	struct bkey deleted;
808 	bkey_init(&deleted);
809 
810 	if (cmp < 0) {
811 		deleted.p = extent_k.k->p;
812 		rebalance_k.k = &deleted;
813 	} else if (cmp > 0) {
814 		deleted.p = rebalance_k.k->p;
815 		extent_k.k = &deleted;
816 	}
817 
818 	bool should_have_rebalance =
819 		bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
820 	bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
821 
822 	if (should_have_rebalance != have_rebalance) {
823 		ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
824 		if (ret)
825 			return ret;
826 
827 		bch2_bkey_val_to_text(&buf, c, extent_k);
828 	}
829 
830 	if (fsck_err_on(!should_have_rebalance && have_rebalance,
831 			trans, rebalance_work_incorrectly_set,
832 			"rebalance work incorrectly set\n%s", buf.buf)) {
833 		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
834 						  extent_k.k->p, false);
835 		if (ret)
836 			goto err;
837 	}
838 
839 	if (fsck_err_on(should_have_rebalance && !have_rebalance,
840 			trans, rebalance_work_incorrectly_unset,
841 			"rebalance work incorrectly unset\n%s", buf.buf)) {
842 		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
843 						  extent_k.k->p, true);
844 		if (ret)
845 			goto err;
846 	}
847 
848 	if (cmp <= 0)
849 		bch2_btree_iter_advance(trans, extent_iter);
850 	if (cmp >= 0)
851 		bch2_btree_iter_advance(trans, rebalance_iter);
852 err:
853 fsck_err:
854 	printbuf_exit(&buf);
855 	return ret;
856 }
857 
858 int bch2_check_rebalance_work(struct bch_fs *c)
859 {
860 	struct btree_trans *trans = bch2_trans_get(c);
861 	struct btree_iter rebalance_iter, extent_iter;
862 	int ret = 0;
863 
864 	bch2_trans_iter_init(trans, &extent_iter,
865 			     BTREE_ID_reflink, POS_MIN,
866 			     BTREE_ITER_prefetch);
867 	bch2_trans_iter_init(trans, &rebalance_iter,
868 			     BTREE_ID_rebalance_work, POS_MIN,
869 			     BTREE_ITER_prefetch);
870 
871 	struct bkey_buf last_flushed;
872 	bch2_bkey_buf_init(&last_flushed);
873 	bkey_init(&last_flushed.k->k);
874 
875 	while (!ret) {
876 		bch2_trans_begin(trans);
877 
878 		ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
879 
880 		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
881 			ret = 0;
882 	}
883 
884 	bch2_bkey_buf_exit(&last_flushed, c);
885 	bch2_trans_iter_exit(trans, &extent_iter);
886 	bch2_trans_iter_exit(trans, &rebalance_iter);
887 	bch2_trans_put(trans);
888 	return ret < 0 ? ret : 0;
889 }
890