// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	union {
		/*
		 * This is used for the following op types:
		 *
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING
		 * BTRFS_MOD_LOG_KEY_REMOVE
		 * BTRFS_MOD_LOG_KEY_REPLACE
		 */
		struct {
			struct btrfs_disk_key key;
			u64 blockptr;
		} slot_change;

		/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
		struct {
			int dst_slot;
			int nr_items;
		} move;

		/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
		struct tree_mod_root old_root;
	};
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero before
 * calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

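/*
 * Illustrative usage of the sequence blocker API above (a simplified sketch,
 * not a verbatim call site; backref walking follows this pattern). A task
 * that wants a stable view of the tree mod log does roughly:
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 time_seq;
 *
 *	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	// ... walk trees, passing time_seq to the rewind helpers below ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While the blocker is registered, btrfs_put_tree_mod_seq() calls from other
 * users will not free log elements with a sequence number >= elem.seq.
 */
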
/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

static inline bool skip_eb_logging(const struct extent_buffer *eb)
{
	const u64 owner = btrfs_header_owner(eb);

	if (btrfs_header_level(eb) == 0)
		return true;

	/*
	 * Tree mod logging exists so that there's a consistent view of the
	 * extents and backrefs of inodes even if, while a task is iterating
	 * over them, other tasks are modifying subvolume trees and the extent
	 * tree (including running delayed refs). So we only need to log extent
	 * buffers from the extent tree and subvolume trees.
	 */

	if (owner == BTRFS_EXTENT_TREE_OBJECTID)
		return false;

	if (btrfs_is_fstree(owner))
		return false;

	return true;
}

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && skip_eb_logging(eb))
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

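/*
 * Illustrative caller pattern for tree_mod_dont_log() (a simplified sketch;
 * the real callers are the btrfs_tree_mod_log_insert_*() helpers below):
 *
 *	tm = alloc_tree_mod_elem(eb, slot, op);	// allocate before locking
 *	if (tree_mod_dont_log(fs_info, eb)) {
 *		kfree(tm);			// lock was not taken
 *		return 0;
 *	}
 *	// the lock is held here, record the element(s)
 *	ret = tree_mod_log_insert(fs_info, tm);
 *	write_unlock(&fs_info->tree_mod_log_lock);
 *
 * Allocations are done before taking the lock because tree_mod_log_lock is a
 * non-sleeping rwlock while GFP_NOFS allocations may sleep.
 */
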
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
			      const struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && skip_eb_logging(eb))
		return false;

	return true;
}

static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	/* Can't be one of these types, due to union in struct tree_mod_elem. */
	ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS);
	ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE);

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	btrfs_node_key(eb, &tm->slot_change.key, slot);
	tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot);
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb,
						     int dst_slot, int src_slot,
						     int nr_items)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return ERR_PTR(-ENOMEM);

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
	if (IS_ERR(tm)) {
		ret = PTR_ERR(tm);
		tm = NULL;
		goto lock;
	}

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nr_items; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

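/*
 * Example of what btrfs_tree_mod_log_insert_move() above records (the numbers
 * are illustrative): moving 3 items from slot 5 to slot 2 in a node logs
 * KEY_REMOVE_WHILE_MOVING elements for slots 2, 3 and 4 (the items that get
 * overwritten, since dst_slot < src_slot), plus one MOVE_KEYS element with
 * slot = 5, dst_slot = 2 and nr_items = 3. On rewind, the MOVE_KEYS element
 * is undone by memmoving the items back, and the removal elements then
 * restore the overwritten keys.
 */
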
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				struct tree_mod_elem **tm_list,
				int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
					BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}

int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       const struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
						      dst_offset, dst_move_nr_items);
		if (IS_ERR(dst_move_tm)) {
			ret = PTR_ERR(dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
						      src_offset + nr_items,
						      src_move_nr_items);
		if (IS_ERR(src_move_tm)) {
			ret = PTR_ERR(src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}

	return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we
		 * simply return it. This should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

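/*
 * Illustrative walk done by tree_mod_log_oldest_root() (the blocks named here
 * are made up): if root block C replaced B, and B replaced A, then the search
 * from C's logical finds the ROOT_REPLACE element whose old_root points to B,
 * the next iteration searches from B and finds the one pointing to A, and the
 * search from A either finds non-replace operations (A is the oldest version
 * that can be rewound) or nothing at all (A was a leaf and nothing more was
 * logged for it).
 */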

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the rewind eb at every
	 * step of the rewind. This is in contrast with 'n' which eventually
	 * matches the number of items, but can be wrong during moves or if
	 * removes overlap on already valid slots (which is probably separately
	 * a bug). We do this to validate the offsets of memmoves for rewinding
	 * moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* If a move operation is needed it's in the log. */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

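/*
 * Worked example for the rewind above (illustrative values): assume the log
 * holds, for this node, a KEY_ADD at slot 3 with seq 105 and a MOVE_KEYS with
 * seq 103, and the caller rewinds to time_seq 100. The operations are visited
 * from the newest down: the KEY_ADD is undone first (nritems drops by one),
 * then the move is memmoved back; an element with seq 99, if any, is older
 * than time_seq and stops the loop.
 */
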
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = btrfs_root_id(root);

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

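/*
 * Illustrative use of the two helpers above (a simplified, pseudo-code sketch
 * of what a time_seq based lookup such as btrfs_search_old_slot() does):
 *
 *	eb = btrfs_get_old_root(root, time_seq);
 *	while (btrfs_header_level(eb) > 0) {
 *		// binary search the rewound node for the key, read the child
 *		// block it points to, then rewind that child as well before
 *		// descending into it:
 *		eb = btrfs_tree_mod_log_rewind(fs_info, child, time_seq);
 *	}
 *
 * Every buffer handed back by these helpers is read locked, so the caller is
 * responsible for unlocking and releasing it when done.
 */
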
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there are
 * no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}

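/*
 * Illustrative consumer of btrfs_tree_mod_log_lowest_seq() (a simplified
 * sketch, not a verbatim call site; the delayed ref code performs a check
 * along these lines):
 *
 *	u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
 *
 *	if (min_seq != 0 && seq >= min_seq) {
 *		// a blocker registered before 'seq' was generated still
 *		// exists, so log entries tagged with 'seq' must be preserved
 *	}
 *
 * A return value of 0 means there are currently no tree mod log users at all.
 */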