// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_write.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super-io.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>

/* bch_extent_rebalance: */

static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
	const union bch_extent_entry *entry;

	bkey_extent_entry_for_each(ptrs, entry)
		if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
			return &entry->rebalance;

	return NULL;
}

static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
	return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
}

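/*
 * Returns a bitmask, in pointer iteration order, of the non-cached pointers
 * whose on-disk compression type doesn't match the background_compression
 * option - or 0 if background compression is disabled, or if the extent is
 * incompressible or unwritten.
 */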
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
						    struct bch_io_opts *opts,
						    struct bkey_s_c k,
						    struct bkey_ptrs_c ptrs)
{
	if (!opts->background_compression)
		return 0;

	unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned ptr_bit = 1;
	unsigned rewrite_ptrs = 0;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
		    p.ptr.unwritten)
			return 0;

		if (!p.ptr.cached && p.crc.compression_type != compression_type)
			rewrite_ptrs |= ptr_bit;
		ptr_bit <<= 1;
	}

	return rewrite_ptrs;
}

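/*
 * Returns a bitmask of the non-cached pointers that don't live on a device in
 * the background_target, or 0 if no usable background target is in effect.
 */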
static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
						struct bch_io_opts *opts,
						struct bkey_ptrs_c ptrs)
{
	if (!opts->background_target ||
	    !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target))
		return 0;

	unsigned ptr_bit = 1;
	unsigned rewrite_ptrs = 0;

	guard(rcu)();
	bkey_for_each_ptr(ptrs, ptr) {
		if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
			rewrite_ptrs |= ptr_bit;
		ptr_bit <<= 1;
	}

	return rewrite_ptrs;
}

static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
					      struct bch_io_opts *opts,
					      struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
		bch2_bkey_ptrs_need_move(c, opts, ptrs);
}

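/*
 * Returns the number of sectors in @k that rebalance still needs to rewrite,
 * according to the bch_extent_rebalance entry stored in the extent itself;
 * used to maintain the rebalance_work accounting counter.
 */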
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

	const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
	if (!opts)
		return 0;

	if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
		return 0;

	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 sectors = 0;

	if (opts->background_compression) {
		unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression);

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
			    p.ptr.unwritten) {
				sectors = 0;
				goto incompressible;
			}

			if (!p.ptr.cached && p.crc.compression_type != compression_type)
				sectors += p.crc.compressed_size;
		}
	}
incompressible:
	if (opts->background_target) {
		guard(rcu)();
		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (!p.ptr.cached &&
			    !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
				sectors += p.crc.compressed_size;
	}

	return sectors;
}

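/*
 * Returns true if the extent's embedded bch_extent_rebalance entry is out of
 * date with respect to @opts - i.e. it needs to be added, removed or
 * rewritten by bch2_bkey_set_needs_rebalance().
 */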
static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts,
					     struct bkey_s_c k)
{
	if (!bkey_extent_is_direct_data(k.k))
		return 0;

	const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);

	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
		struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
		return old == NULL || memcmp(old, &new, sizeof(new));
	} else {
		return old != NULL;
	}
}

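/*
 * Add, update or drop the bch_extent_rebalance entry in @_k so that it
 * matches @opts; assumes the caller has left room for the value to grow by
 * sizeof(struct bch_extent_rebalance).
 */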
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
				  struct bkey_i *_k)
{
	if (!bkey_extent_is_direct_data(&_k->k))
		return 0;

	struct bkey_s k = bkey_i_to_s(_k);
	struct bch_extent_rebalance *old =
		(struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);

	if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) {
		if (!old) {
			old = bkey_val_end(k);
			k.k->u64s += sizeof(*old) / sizeof(u64);
		}

		*old = io_opts_to_rebalance_opts(c, opts);
	} else {
		if (old)
			extent_entry_drop(k, (union bch_extent_entry *) old);
	}

	return 0;
}

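/*
 * Fold per-extent rebalance options stored in an indirect extent into
 * @io_opts, then rewrite the extent's bch_extent_rebalance entry if it's
 * stale. On a successful update this commits and returns
 * -BCH_ERR_transaction_restart_nested, since the commit invalidates @k.
 */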
int bch2_get_update_rebalance_opts(struct btree_trans *trans,
				   struct bch_io_opts *io_opts,
				   struct btree_iter *iter,
				   struct bkey_s_c k)
{
	BUG_ON(iter->flags & BTREE_ITER_is_extents);
	BUG_ON(iter->flags & BTREE_ITER_filter_snapshots);

	const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v
		? bch2_bkey_rebalance_opts(k) : NULL;
	if (r) {
#define x(_name)						\
		if (r->_name##_from_inode) {			\
			io_opts->_name = r->_name;		\
			io_opts->_name##_from_inode = true;	\
		}
		BCH_REBALANCE_OPTS()
#undef x
	}

	if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k))
		return 0;

	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	bkey_reassemble(n, k);

	/* On successful transaction commit, @k was invalidated: */

	return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?:
		bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?:
		bch2_trans_commit(trans, NULL, NULL, 0) ?:
		-BCH_ERR_transaction_restart_nested;
}

#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)

static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
	BCH_REBALANCE_STATES()
	NULL
#undef x
};

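/*
 * Request a rebalance scan of @inum (0 means the whole filesystem) by bumping
 * the cookie stored at (inum, REBALANCE_WORK_SCAN_OFFSET) in the
 * rebalance_work btree; do_rebalance() picks these cookies up.
 */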
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i_cookie *cookie;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
	ret = PTR_ERR_OR_ZERO(cookie);
	if (ret)
		goto err;

	bkey_cookie_init(&cookie->k_i);
	cookie->k.p = iter.pos;
	cookie->v.cookie = cpu_to_le64(v + 1);

	ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
	int ret = bch2_trans_commit_do(c, NULL, NULL,
				       BCH_TRANS_COMMIT_no_enospc,
				       bch2_set_rebalance_needs_scan_trans(trans, inum));
	bch2_rebalance_wakeup(c);
	return ret;
}

int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
	return bch2_set_rebalance_needs_scan(c, 0);
}

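/*
 * Delete the scan cookie for @inum, but only if it still matches @cookie -
 * i.e. no new scan was requested while the scan we just finished was running.
 */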
static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 v;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_intent);
	k = bch2_btree_iter_peek_slot(trans, &iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	v = k.k->type == KEY_TYPE_cookie
		? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
		: 0;

	if (v == cookie)
		ret = bch2_btree_delete_at(trans, &iter, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
					    struct btree_iter *work_iter)
{
	return !kthread_should_stop()
		? bch2_btree_iter_peek(trans, work_iter)
		: bkey_s_c_null;
}

static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
					   struct btree_iter *iter,
					   struct bkey_s_c k)
{
	if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
		return 0;

	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
	int ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		return ret;

	extent_entry_drop(bkey_i_to_s(n),
			  (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
	return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}

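/*
 * Look up the extent a rebalance_work entry refers to (the extents btree for
 * regular inodes, the reflink btree for entries with inode 0) and work out
 * its io and data update options. If the extent no longer has pointers that
 * need rewriting, its stale rebalance hint is cleared and bkey_s_c_null is
 * returned.
 */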
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
					     struct bpos work_pos,
					     struct btree_iter *extent_iter,
					     struct bch_io_opts *io_opts,
					     struct data_update_opts *data_opts)
{
	struct bch_fs *c = trans->c;

	bch2_trans_iter_exit(trans, extent_iter);
	bch2_trans_iter_init(trans, extent_iter,
			     work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
			     work_pos,
			     BTREE_ITER_all_snapshots);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
	if (bkey_err(k))
		return k;

	int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k);
	if (ret)
		return bkey_s_c_err(ret);

	memset(data_opts, 0, sizeof(*data_opts));
	data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
	data_opts->target = io_opts->background_target;
	data_opts->write_flags |= BCH_WRITE_only_specified_devs;

	if (!data_opts->rewrite_ptrs) {
		/*
		 * Is the device we'd want to write to offline? Did the
		 * devices in the target change?
		 *
		 * Either way, we'll need a full scan before this extent is
		 * picked up again:
		 */
		int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
		if (ret)
			return bkey_s_c_err(ret);
		return bkey_s_c_null;
	}

	if (trace_rebalance_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);

		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);

		unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs);
		if (p) {
			prt_str(&buf, "compression=");
			bch2_compression_opt_to_text(&buf, io_opts->background_compression);
			prt_str(&buf, " ");
			bch2_prt_u64_base2(&buf, p);
			prt_newline(&buf);
		}

		p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs);
		if (p) {
			prt_str(&buf, "move=");
			bch2_target_to_text(&buf, c, io_opts->background_target);
			prt_str(&buf, " ");
			bch2_prt_u64_base2(&buf, p);
			prt_newline(&buf);
		}

		trace_rebalance_extent(c, buf.buf);
		printbuf_exit(&buf);
	}

	return k;
}

noinline_for_stack
static int do_rebalance_extent(struct moving_context *ctxt,
			       struct bpos work_pos,
			       struct btree_iter *extent_iter)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &trans->c->rebalance;
	struct data_update_opts data_opts;
	struct bch_io_opts io_opts;
	struct bkey_s_c k;
	struct bkey_buf sk;
	int ret;

	ctxt->stats = &r->work_stats;
	r->state = BCH_REBALANCE_working;

	bch2_bkey_buf_init(&sk);

	ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
						 extent_iter, &io_opts, &data_opts));
	if (ret || !k.k)
		goto out;

	atomic64_add(k.k->size, &ctxt->stats->sectors_seen);

	/*
	 * The iterator gets unlocked by __bch2_read_extent - need to
	 * save a copy of @k elsewhere:
	 */
	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
	if (ret) {
		if (bch2_err_matches(ret, ENOMEM)) {
			/* memory allocation failure, wait for some IO to finish */
			bch2_move_ctxt_wait_for_io(ctxt);
			ret = bch_err_throw(c, transaction_restart_nested);
		}

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto out;

		/* skip it and continue, XXX signal failure */
		ret = 0;
	}
out:
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

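/*
 * Handle a scan cookie: walk @inum's extents (or the whole extents btree when
 * inum is 0), refreshing each extent's io options via bch2_move_get_io_opts(),
 * then clear the cookie and flush the btree write buffer so that any new
 * rebalance_work entries are visible to the next pass of do_rebalance().
 */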
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &trans->c->rebalance;

	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
	ctxt->stats = &r->scan_stats;

	if (!inum) {
		r->scan_start = BBPOS_MIN;
		r->scan_end = BBPOS_MAX;
	} else {
		r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
		r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
	}

	r->state = BCH_REBALANCE_scanning;

	struct per_snapshot_io_opts snapshot_io_opts;
	per_snapshot_io_opts_init(&snapshot_io_opts, c);

	int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
					 r->scan_start.pos, r->scan_end.pos,
					 BTREE_ITER_all_snapshots|
					 BTREE_ITER_not_extents|
					 BTREE_ITER_prefetch, k, ({
		ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

		struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
					&snapshot_io_opts, iter.pos, &iter, k);
		PTR_ERR_OR_ZERO(io_opts);
	})) ?:
	commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		  bch2_clear_rebalance_needs_scan(trans, inum, cookie));

	per_snapshot_io_opts_exit(&snapshot_io_opts);
	bch2_move_stats_exit(&r->scan_stats, trans->c);

	/*
	 * Ensure that the rebalance_work entries we created are seen by the
	 * next iteration of do_rebalance(), so we don't end up stuck in
	 * rebalance_wait():
	 */
	atomic64_inc(&r->scan_stats.sectors_seen);
	bch2_btree_write_buffer_flush_sync(trans);

	return ret;
}

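/*
 * Nothing left to do: sleep on the write IO clock until roughly 1/64th of the
 * smallest rw member device's capacity worth of writes has happened,
 * recording when the wait started for status reporting.
 */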
static void rebalance_wait(struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;
	struct io_clock *clock = &c->io_clock[WRITE];
	u64 now = atomic64_read(&clock->now);
	u64 min_member_capacity = bch2_min_rw_member_capacity(c);

	if (min_member_capacity == U64_MAX)
		min_member_capacity = 128 * 2048;

	r->wait_iotime_end = now + (min_member_capacity >> 6);

	if (r->state != BCH_REBALANCE_waiting) {
		r->wait_iotime_start = now;
		r->wait_wallclock_start = ktime_get_real_ns();
		r->state = BCH_REBALANCE_waiting;
	}

	bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}

static bool bch2_rebalance_enabled(struct bch_fs *c)
{
	return c->opts.rebalance_enabled &&
		!(c->opts.rebalance_on_ac_only &&
		  c->rebalance.on_battery);
}

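/*
 * Main rebalance loop: walk the rebalance_work btree; cookie entries trigger
 * a scan of the corresponding inode (or the whole filesystem), other entries
 * point at individual extents to be rewritten. When no work was found, wait
 * on the write IO clock via rebalance_wait().
 */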
static int do_rebalance(struct moving_context *ctxt)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct btree_iter rebalance_work_iter, extent_iter = {};
	struct bkey_s_c k;
	u32 kick = r->kick;
	int ret = 0;

	bch2_trans_begin(trans);

	bch2_move_stats_init(&r->work_stats, "rebalance_work");
	bch2_move_stats_init(&r->scan_stats, "rebalance_scan");

	bch2_trans_iter_init(trans, &rebalance_work_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_all_snapshots);

	while (!bch2_move_ratelimit(ctxt)) {
		if (!bch2_rebalance_enabled(c)) {
			bch2_moving_ctxt_flush_all(ctxt);
			kthread_wait_freezable(bch2_rebalance_enabled(c) ||
					       kthread_should_stop());
		}

		if (kthread_should_stop())
			break;

		bch2_trans_begin(trans);

		ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret || !k.k)
			break;

		ret = k.k->type == KEY_TYPE_cookie
			? do_rebalance_scan(ctxt, k.k->p.inode,
					    le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
			: do_rebalance_extent(ctxt, k.k->p, &extent_iter);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		bch2_btree_iter_advance(trans, &rebalance_work_iter);
	}

	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_work_iter);
	bch2_move_stats_exit(&r->scan_stats, c);

	if (!ret &&
	    !kthread_should_stop() &&
	    !atomic64_read(&r->work_stats.sectors_seen) &&
	    !atomic64_read(&r->scan_stats.sectors_seen) &&
	    kick == r->kick) {
		bch2_moving_ctxt_flush_all(ctxt);
		bch2_trans_unlock_long(trans);
		rebalance_wait(c);
	}

	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

static int bch2_rebalance_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct moving_context ctxt;

	set_freezable();

	/*
	 * Data move operations can't run until after check_snapshots has
	 * completed, and bch2_snapshot_is_ancestor() is available.
	 */
	kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
			       kthread_should_stop());

	bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
			      writepoint_ptr(&c->rebalance_write_point),
			      true);

	while (!kthread_should_stop() && !do_rebalance(&ctxt))
		;

	bch2_moving_ctxt_exit(&ctxt);

	return 0;
}

void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 32);

	struct bch_fs_rebalance *r = &c->rebalance;

	/* print pending work */
	struct disk_accounting_pos acc;
	disk_accounting_key_init(acc, rebalance_work);
	u64 v;
	bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);

	prt_printf(out, "pending work:\t");
	prt_human_readable_u64(out, v << 9);
	prt_printf(out, "\n\n");

	prt_str(out, bch2_rebalance_state_strs[r->state]);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	switch (r->state) {
	case BCH_REBALANCE_waiting: {
		u64 now = atomic64_read(&c->io_clock[WRITE].now);

		prt_printf(out, "io wait duration:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
		prt_newline(out);

		prt_printf(out, "io wait remaining:\t");
		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
		prt_newline(out);

		prt_printf(out, "duration waited:\t");
		bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
		prt_newline(out);
		break;
	}
	case BCH_REBALANCE_working:
		bch2_move_stats_to_text(out, &r->work_stats);
		break;
	case BCH_REBALANCE_scanning:
		bch2_move_stats_to_text(out, &r->scan_stats);
		break;
	}
	prt_newline(out);

	struct task_struct *t;
	scoped_guard(rcu) {
		t = rcu_dereference(c->rebalance.thread);
		if (t)
			get_task_struct(t);
	}

	if (t) {
		bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
		put_task_struct(t);
	}

	printbuf_indent_sub(out, 2);
}

void bch2_rebalance_stop(struct bch_fs *c)
{
	struct task_struct *p;

	c->rebalance.pd.rate.rate = UINT_MAX;
	bch2_ratelimit_reset(&c->rebalance.pd.rate);

	p = rcu_dereference_protected(c->rebalance.thread, 1);
	c->rebalance.thread = NULL;

	if (p) {
		/* for synchronizing with bch2_rebalance_wakeup() */
		synchronize_rcu();

		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_rebalance_start(struct bch_fs *c)
{
	struct task_struct *p;
	int ret;

	if (c->rebalance.thread)
		return 0;

	if (c->opts.nochanges)
		return 0;

	p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
	ret = PTR_ERR_OR_ZERO(p);
	bch_err_msg(c, ret, "creating rebalance thread");
	if (ret)
		return ret;

	get_task_struct(p);
	rcu_assign_pointer(c->rebalance.thread, p);
	wake_up_process(p);
	return 0;
}

#ifdef CONFIG_POWER_SUPPLY
#include <linux/power_supply.h>

static int bch2_rebalance_power_notifier(struct notifier_block *nb,
					 unsigned long event, void *data)
{
	struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);

	c->rebalance.on_battery = !power_supply_is_system_supplied();
	bch2_rebalance_wakeup(c);
	return NOTIFY_OK;
}
#endif

void bch2_fs_rebalance_exit(struct bch_fs *c)
{
#ifdef CONFIG_POWER_SUPPLY
	power_supply_unreg_notifier(&c->rebalance.power_notifier);
#endif
}

int bch2_fs_rebalance_init(struct bch_fs *c)
{
	struct bch_fs_rebalance *r = &c->rebalance;

	bch2_pd_controller_init(&r->pd);

#ifdef CONFIG_POWER_SUPPLY
	r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
	int ret = power_supply_reg_notifier(&r->power_notifier);
	if (ret)
		return ret;

	r->on_battery = !power_supply_is_system_supplied();
#endif
	return 0;
}

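/*
 * fsck: walk the extents/reflink btrees and the rebalance_work btree in
 * lockstep, checking that every extent with outstanding rebalance work has a
 * rebalance_work entry and vice versa, and repairing any mismatches.
 */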
static int check_rebalance_work_one(struct btree_trans *trans,
				    struct btree_iter *extent_iter,
				    struct btree_iter *rebalance_iter,
				    struct bkey_buf *last_flushed)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c extent_k, rebalance_k;
	struct printbuf buf = PRINTBUF;

	int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?:
		bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter));
	if (ret)
		return ret;

	if (!extent_k.k &&
	    extent_iter->btree_id == BTREE_ID_reflink &&
	    (!rebalance_k.k ||
	     rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
		bch2_trans_iter_exit(trans, extent_iter);
		bch2_trans_iter_init(trans, extent_iter,
				     BTREE_ID_extents, POS_MIN,
				     BTREE_ITER_prefetch|
				     BTREE_ITER_all_snapshots);
		return bch_err_throw(c, transaction_restart_nested);
	}

	if (!extent_k.k && !rebalance_k.k)
		return 1;

	int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX,
			   rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);

	struct bkey deleted;
	bkey_init(&deleted);

	if (cmp < 0) {
		deleted.p = extent_k.k->p;
		rebalance_k.k = &deleted;
	} else if (cmp > 0) {
		deleted.p = rebalance_k.k->p;
		extent_k.k = &deleted;
	}

	bool should_have_rebalance =
		bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
	bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;

	if (should_have_rebalance != have_rebalance) {
		ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
		if (ret)
			return ret;

		bch2_bkey_val_to_text(&buf, c, extent_k);
	}

	if (fsck_err_on(!should_have_rebalance && have_rebalance,
			trans, rebalance_work_incorrectly_set,
			"rebalance work incorrectly set\n%s", buf.buf)) {
		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						  extent_k.k->p, false);
		if (ret)
			goto err;
	}

	if (fsck_err_on(should_have_rebalance && !have_rebalance,
			trans, rebalance_work_incorrectly_unset,
			"rebalance work incorrectly unset\n%s", buf.buf)) {
		ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						  extent_k.k->p, true);
		if (ret)
			goto err;
	}

	if (cmp <= 0)
		bch2_btree_iter_advance(trans, extent_iter);
	if (cmp >= 0)
		bch2_btree_iter_advance(trans, rebalance_iter);
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_rebalance_work(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter rebalance_iter, extent_iter;
	int ret = 0;

	bch2_trans_iter_init(trans, &extent_iter,
			     BTREE_ID_reflink, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &rebalance_iter,
			     BTREE_ID_rebalance_work, POS_MIN,
			     BTREE_ITER_prefetch);

	struct bkey_buf last_flushed;
	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	while (!ret) {
		bch2_trans_begin(trans);

		ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);

		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
	}

	bch2_bkey_buf_exit(&last_flushed, c);
	bch2_trans_iter_exit(trans, &extent_iter);
	bch2_trans_iter_exit(trans, &rebalance_iter);
	bch2_trans_put(trans);
	return ret < 0 ? ret : 0;
}