1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "messages.h"
4 #include "tree-mod-log.h"
5 #include "disk-io.h"
6 #include "fs.h"
7 #include "accessors.h"
8 #include "tree-checker.h"
9
/*
 * Snapshot of a replaced root block: the logical address and level of the old
 * root, recorded by BTRFS_MOD_LOG_ROOT_REPLACE operations (see the union in
 * struct tree_mod_elem).
 */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
14
/*
 * One logged modification of a tree block. Elements live in
 * fs_info->tree_mod_log, an rb tree ordered by (logical, seq).
 */
struct tree_mod_elem {
	struct rb_node node;		/* Anchor in fs_info->tree_mod_log. */
	u64 logical;			/* Logical address of the affected block. */
	u64 seq;			/* Assigned under the log lock at insertion. */
	enum btrfs_mod_log_op op;	/* Which kind of modification was logged. */

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	union {
		/*
		 * This is used for the following op types:
		 *
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING
		 * BTRFS_MOD_LOG_KEY_REMOVE
		 * BTRFS_MOD_LOG_KEY_REPLACE
		 */
		struct {
			struct btrfs_disk_key key;
			u64 blockptr;
		} slot_change;

		/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
		struct {
			int dst_slot;
			int nr_items;
		} move;

		/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
		struct tree_mod_root old_root;
	};
};
54
55 /*
56 * Pull a new tree mod seq number for our operation.
57 */
btrfs_inc_tree_mod_seq(struct btrfs_fs_info * fs_info)58 static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
59 {
60 return atomic64_inc_return(&fs_info->tree_mod_seq);
61 }
62
63 /*
64 * This adds a new blocker to the tree mod log's blocker list if the @elem
65 * passed does not already have a sequence number set. So when a caller expects
66 * to record tree modifications, it should ensure to set elem->seq to zero
67 * before calling btrfs_get_tree_mod_seq.
68 * Returns a fresh, unused tree log modification sequence number, even if no new
69 * blocker was added.
70 */
btrfs_get_tree_mod_seq(struct btrfs_fs_info * fs_info,struct btrfs_seq_list * elem)71 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
72 struct btrfs_seq_list *elem)
73 {
74 write_lock(&fs_info->tree_mod_log_lock);
75 if (!elem->seq) {
76 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
77 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
78 set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
79 }
80 write_unlock(&fs_info->tree_mod_log_lock);
81
82 return elem->seq;
83 }
84
/*
 * Drop the blocker @elem previously registered with btrfs_get_tree_mod_seq().
 * When no blocker with a lower sequence number remains, every log element
 * older than the new minimum is freed, since nothing can rewind past it
 * anymore.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	/* Never registered (seq still zero), nothing to do. */
	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		/* Last user gone: future modifications need not be logged. */
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
135
136 /*
137 * Key order of the log:
138 * node/leaf start address -> sequence
139 *
140 * The 'start address' is the logical address of the *new* root node for root
141 * replace operations, or the logical address of the affected block for all
142 * other operations.
143 */
tree_mod_log_insert(struct btrfs_fs_info * fs_info,struct tree_mod_elem * tm)144 static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
145 struct tree_mod_elem *tm)
146 {
147 struct rb_root *tm_root;
148 struct rb_node **new;
149 struct rb_node *parent = NULL;
150 struct tree_mod_elem *cur;
151
152 lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
153
154 tm->seq = btrfs_inc_tree_mod_seq(fs_info);
155
156 tm_root = &fs_info->tree_mod_log;
157 new = &tm_root->rb_node;
158 while (*new) {
159 cur = rb_entry(*new, struct tree_mod_elem, node);
160 parent = *new;
161 if (cur->logical < tm->logical)
162 new = &((*new)->rb_left);
163 else if (cur->logical > tm->logical)
164 new = &((*new)->rb_right);
165 else if (cur->seq < tm->seq)
166 new = &((*new)->rb_left);
167 else if (cur->seq > tm->seq)
168 new = &((*new)->rb_right);
169 else
170 return -EEXIST;
171 }
172
173 rb_link_node(&tm->node, parent, new);
174 rb_insert_color(&tm->node, tm_root);
175 return 0;
176 }
177
skip_eb_logging(const struct extent_buffer * eb)178 static inline bool skip_eb_logging(const struct extent_buffer *eb)
179 {
180 const u64 owner = btrfs_header_owner(eb);
181
182 if (btrfs_header_level(eb) == 0)
183 return true;
184
185 /*
186 * Tree mod logging exists so that there's a consistent view of the
187 * extents and backrefs of inodes even if while a task is iterating over
188 * them other tasks are modifying subvolume trees and the extent tree
189 * (including running delayed refs). So we only need to log extent
190 * buffers from the extent tree and subvolume trees.
191 */
192
193 if (owner == BTRFS_EXTENT_TREE_OBJECTID)
194 return false;
195
196 if (btrfs_is_fstree(owner))
197 return false;
198
199 return true;
200 }
201
202 /*
203 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
204 * returns false with the tree_mod_log_lock acquired. The caller must hold
205 * this until all tree mod log insertions are recorded in the rb tree and then
206 * write unlock fs_info::tree_mod_log_lock.
207 */
tree_mod_dont_log(struct btrfs_fs_info * fs_info,const struct extent_buffer * eb)208 static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb)
209 {
210 if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
211 return true;
212 if (eb && skip_eb_logging(eb))
213 return true;
214
215 write_lock(&fs_info->tree_mod_log_lock);
216 if (list_empty(&(fs_info)->tree_mod_seq_list)) {
217 write_unlock(&fs_info->tree_mod_log_lock);
218 return true;
219 }
220
221 return false;
222 }
223
224 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
tree_mod_need_log(const struct btrfs_fs_info * fs_info,const struct extent_buffer * eb)225 static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
226 const struct extent_buffer *eb)
227 {
228 if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
229 return false;
230 if (eb && skip_eb_logging(eb))
231 return false;
232
233 return true;
234 }
235
alloc_tree_mod_elem(const struct extent_buffer * eb,int slot,enum btrfs_mod_log_op op)236 static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
237 int slot,
238 enum btrfs_mod_log_op op)
239 {
240 struct tree_mod_elem *tm;
241
242 /* Can't be one of these types, due to union in struct tree_mod_elem. */
243 ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS);
244 ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE);
245
246 tm = kzalloc_obj(*tm, GFP_NOFS);
247 if (!tm)
248 return NULL;
249
250 tm->logical = eb->start;
251 btrfs_node_key(eb, &tm->slot_change.key, slot);
252 tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot);
253 tm->op = op;
254 tm->slot = slot;
255 tm->generation = btrfs_node_ptr_generation(eb, slot);
256 RB_CLEAR_NODE(&tm->node);
257
258 return tm;
259 }
260
/*
 * Log a single key operation (one of the BTRFS_MOD_LOG_KEY_* ops) for slot
 * @slot of node @eb.
 *
 * The element is allocated before taking tree_mod_log_lock so the allocation
 * doesn't happen under the lock; whether logging is actually required is
 * decided afterwards by tree_mod_dont_log(), which returns with the lock held
 * on the "must log" path.
 *
 * Returns 0 on success or when logging isn't needed, negative errno on error.
 */
int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/* kfree(NULL) is a no-op, so this is safe after alloc failure. */
		kfree(tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
297
tree_mod_log_alloc_move(const struct extent_buffer * eb,int dst_slot,int src_slot,int nr_items)298 static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb,
299 int dst_slot, int src_slot,
300 int nr_items)
301 {
302 struct tree_mod_elem *tm;
303
304 tm = kzalloc_obj(*tm, GFP_NOFS);
305 if (!tm)
306 return ERR_PTR(-ENOMEM);
307
308 tm->logical = eb->start;
309 tm->slot = src_slot;
310 tm->move.dst_slot = dst_slot;
311 tm->move.nr_items = nr_items;
312 tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
313 RB_CLEAR_NODE(&tm->node);
314
315 return tm;
316 }
317
/*
 * Log the move of @nr_items key pointers from @src_slot to @dst_slot within
 * node @eb. Slots that get overwritten by a move towards the start of the
 * buffer are logged as removals first, so a rewind can restore them.
 *
 * All allocations happen before taking tree_mod_log_lock; see
 * tree_mod_dont_log() for the locking contract.
 *
 * Returns 0 on success or when logging isn't needed, negative errno on error.
 */
int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kzalloc_objs(struct tree_mod_elem *, nr_items, GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
	if (IS_ERR(tm)) {
		ret = PTR_ERR(tm);
		tm = NULL;
		goto lock;
	}

	/* Pre-allocate removal elements for slots the move will overwrite. */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		/* Undo any elements already linked into the rb tree. */
		for (i = 0; i < nr_items; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
405
tree_mod_log_free_eb(struct btrfs_fs_info * fs_info,struct tree_mod_elem ** tm_list,int nritems)406 static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
407 struct tree_mod_elem **tm_list,
408 int nritems)
409 {
410 int i, j;
411 int ret;
412
413 for (i = nritems - 1; i >= 0; i--) {
414 ret = tree_mod_log_insert(fs_info, tm_list[i]);
415 if (ret) {
416 for (j = nritems - 1; j > i; j--)
417 rb_erase(&tm_list[j]->node,
418 &fs_info->tree_mod_log);
419 return ret;
420 }
421 }
422
423 return 0;
424 }
425
/*
 * Log the replacement of root node @old_root by @new_root. If @log_removal is
 * true and the old root is a node (level > 0), all its key pointers are
 * logged as removals first so a rewind can reconstruct the old root's
 * content.
 *
 * The replacement element itself is keyed on the *new* root's logical address
 * (see the comment above tree_mod_log_insert()).
 *
 * Returns 0 on success or when logging isn't needed, negative errno on error.
 */
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kzalloc_objs(struct tree_mod_elem *, nritems,
				       GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc_obj(*tm, GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
509
/*
 * Search the log for an element of block @start with seq >= @min_seq. When
 * @smallest is true the element with the smallest such seq is returned,
 * otherwise the one with the largest seq. Returns NULL when no matching
 * element exists.
 *
 * Takes fs_info->tree_mod_log_lock for reading; callers must not hold it.
 */
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			/* Too old to be of interest, look at newer elements. */
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact match on min_seq, cannot do better. */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
551
/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored. Returns NULL if the log holds
 * no matching element for @start.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}
562
/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored. Returns NULL if the
 * log holds no matching element for @start.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}
573
/*
 * Log the copy of @nr_items key pointers from @src (starting at @src_offset)
 * into @dst (starting at @dst_offset), as done when balancing tree nodes.
 *
 * The copy is logged as: a move making room in @dst, a remove/add pair per
 * copied slot, and a move closing the gap in @src. Only nodes are logged;
 * if both buffers are leaves nothing is recorded.
 *
 * Returns 0 on success or when logging isn't needed, negative errno on error.
 */
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       const struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	/* How many existing slots in @dst shift up to make room. */
	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
	/* How many slots in @src shift down to close the gap. */
	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One array: first half for adds, second half for removes. */
	tm_list = kzalloc_objs(struct tree_mod_elem *, nr_items * 2, GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
						      dst_offset, dst_move_nr_items);
		if (IS_ERR(dst_move_tm)) {
			ret = PTR_ERR(dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
						      src_offset + nr_items,
						      src_move_nr_items);
		if (IS_ERR(src_move_tm)) {
			ret = PTR_ERR(src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unlink anything already inserted into the rb tree before freeing. */
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
704
/*
 * Log the removal of every key pointer of node @eb, done when the extent
 * buffer is freed, so that rewinds can still reconstruct its content.
 *
 * Returns 0 on success or when logging isn't needed, negative errno on error.
 */
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kzalloc_objs(struct tree_mod_elem *, nritems, GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	/* tree_mod_log_free_eb() rolls back its own insertions on failure. */
	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}

	return ret;
}
765
/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored. Returns NULL when @time_seq is
 * zero or no relevant log entry exists.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operation for the oldest root, we simply
		 * return it. This should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		/* Follow the replacement chain one root further back. */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
819
820
/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 *
 * @eb must be a private copy (dummy or cloned buffer): its content and
 * nritems header are modified in place while replaying the inverse of each
 * logged operation.
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the rewind eb at every
	 * step of the rewind. This is in contrast with 'n' which eventually
	 * matches the number of items, but can be wrong during moves or if
	 * removes overlap on already valid slots (which is probably separately
	 * a bug). We do this to validate the offsets of memmoves for rewinding
	 * moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	/* Hold the log read lock so elements aren't freed while we walk them. */
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			/* Undo a removal: put the saved key pointer back. */
			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			/* Undo a replace: restore the previous key pointer. */
			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			/* Undo a move: shift the pointers back to tm->slot. */
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* rb_next() walked past this block's elements, we're done. */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
939
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented). Returns NULL on allocation failure
 * (with the input buffer unlocked and freed).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	/* Leaves are not logged, nothing to rewind. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was fully logged while being freed: start from an
		 * empty dummy buffer and let the rewind re-add every slot.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Rewind on a private copy so the live buffer stays intact. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
998
/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		/* The root was replaced; rewind starts from the old root. */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		/* The old root block may still exist on disk, read it back. */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = btrfs_root_id(root);

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		/* Old root fully logged while freed: rebuild from scratch. */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* Same root block, rewind on a private clone of it. */
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
1108
/*
 * Return the level @root's root node had at sequence number @time_seq. If no
 * root replacement is logged, the current root node's level is returned.
 */
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct extent_buffer *eb_root = btrfs_root_node(root);
	struct tree_mod_elem *tm;
	int level;

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	level = (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) ?
		tm->old_root.level : btrfs_header_level(eb_root);
	free_extent_buffer(eb_root);

	return level;
}
1125
1126 /*
1127 * Return the lowest sequence number in the tree modification log.
1128 *
1129 * Return the sequence number of the oldest tree modification log user, which
1130 * corresponds to the lowest sequence number of all existing users. If there are
1131 * no users it returns 0.
1132 */
btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info * fs_info)1133 u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
1134 {
1135 u64 ret = 0;
1136
1137 read_lock(&fs_info->tree_mod_log_lock);
1138 if (!list_empty(&fs_info->tree_mod_seq_list)) {
1139 struct btrfs_seq_list *elem;
1140
1141 elem = list_first_entry(&fs_info->tree_mod_seq_list,
1142 struct btrfs_seq_list, list);
1143 ret = elem->seq;
1144 }
1145 read_unlock(&fs_info->tree_mod_log_lock);
1146
1147 return ret;
1148 }
1149