xref: /linux/fs/btrfs/tree-mod-log.c (revision 69050f8d6d075dc01af7a5f2f550a8067510366f)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "messages.h"
4 #include "tree-mod-log.h"
5 #include "disk-io.h"
6 #include "fs.h"
7 #include "accessors.h"
8 #include "tree-checker.h"
9 
10 struct tree_mod_root {
11 	u64 logical;
12 	u8 level;
13 };
14 
15 struct tree_mod_elem {
16 	struct rb_node node;
17 	u64 logical;
18 	u64 seq;
19 	enum btrfs_mod_log_op op;
20 
21 	/*
22 	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
23 	 * operations.
24 	 */
25 	int slot;
26 
27 	/* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
28 	u64 generation;
29 
30 	union {
31 		/*
32 		 * This is used for the following op types:
33 		 *
34 		 *    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING
35 		 *    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING
36 		 *    BTRFS_MOD_LOG_KEY_REMOVE
37 		 *    BTRFS_MOD_LOG_KEY_REPLACE
38 		 */
39 		struct {
40 			struct btrfs_disk_key key;
41 			u64 blockptr;
42 		} slot_change;
43 
44 		/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
45 		struct {
46 			int dst_slot;
47 			int nr_items;
48 		} move;
49 
50 		/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
51 		struct tree_mod_root old_root;
52 	};
53 };
54 
55 /*
56  * Pull a new tree mod seq number for our operation.
57  */
58 static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
59 {
60 	return atomic64_inc_return(&fs_info->tree_mod_seq);
61 }
62 
63 /*
64  * This adds a new blocker to the tree mod log's blocker list if the @elem
65  * passed does not already have a sequence number set. So a caller that wants
66  * to record tree modifications must make sure elem->seq is zero before
67  * calling btrfs_get_tree_mod_seq().
68  * Returns a fresh, unused tree mod log sequence number, even if no new
69  * blocker was added.
70  */
71 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
72 			   struct btrfs_seq_list *elem)
73 {
74 	write_lock(&fs_info->tree_mod_log_lock);
75 	if (!elem->seq) {
76 		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
77 		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
78 		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
79 	}
80 	write_unlock(&fs_info->tree_mod_log_lock);
81 
82 	return elem->seq;
83 }
84 
85 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
86 			    struct btrfs_seq_list *elem)
87 {
88 	struct rb_root *tm_root;
89 	struct rb_node *node;
90 	struct rb_node *next;
91 	struct tree_mod_elem *tm;
92 	u64 min_seq = BTRFS_SEQ_LAST;
93 	u64 seq_putting = elem->seq;
94 
95 	if (!seq_putting)
96 		return;
97 
98 	write_lock(&fs_info->tree_mod_log_lock);
99 	list_del(&elem->list);
100 	elem->seq = 0;
101 
102 	if (list_empty(&fs_info->tree_mod_seq_list)) {
103 		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
104 	} else {
105 		struct btrfs_seq_list *first;
106 
107 		first = list_first_entry(&fs_info->tree_mod_seq_list,
108 					 struct btrfs_seq_list, list);
109 		if (seq_putting > first->seq) {
110 			/*
111 			 * Blocker with lower sequence number exists, we cannot
112 			 * remove anything from the log.
113 			 */
114 			write_unlock(&fs_info->tree_mod_log_lock);
115 			return;
116 		}
117 		min_seq = first->seq;
118 	}
119 
120 	/*
121 	 * Anything that's lower than the lowest existing (read: blocked)
122 	 * sequence number can be removed from the tree.
123 	 */
124 	tm_root = &fs_info->tree_mod_log;
125 	for (node = rb_first(tm_root); node; node = next) {
126 		next = rb_next(node);
127 		tm = rb_entry(node, struct tree_mod_elem, node);
128 		if (tm->seq >= min_seq)
129 			continue;
130 		rb_erase(node, tm_root);
131 		kfree(tm);
132 	}
133 	write_unlock(&fs_info->tree_mod_log_lock);
134 }
135 
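/*
 * Illustrative usage sketch (editorial, not code from this file): a tree mod
 * log user brackets its read-only work with the get/put pair above. This
 * assumes the BTRFS_SEQ_LIST_INIT() initializer from misc.h:
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... walk backrefs/extents using 'seq' as the time_seq snapshot ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */
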
136 /*
137  * Key order of the log:
138  *       node/leaf start address -> sequence
139  *
140  * The 'start address' is the logical address of the *new* root node for root
141  * replace operations, or the logical address of the affected block for all
142  * other operations.
143  */
144 static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
145 					struct tree_mod_elem *tm)
146 {
147 	struct rb_root *tm_root;
148 	struct rb_node **new;
149 	struct rb_node *parent = NULL;
150 	struct tree_mod_elem *cur;
151 
152 	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
153 
154 	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
155 
156 	tm_root = &fs_info->tree_mod_log;
157 	new = &tm_root->rb_node;
158 	while (*new) {
159 		cur = rb_entry(*new, struct tree_mod_elem, node);
160 		parent = *new;
161 		if (cur->logical < tm->logical)
162 			new = &((*new)->rb_left);
163 		else if (cur->logical > tm->logical)
164 			new = &((*new)->rb_right);
165 		else if (cur->seq < tm->seq)
166 			new = &((*new)->rb_left);
167 		else if (cur->seq > tm->seq)
168 			new = &((*new)->rb_right);
169 		else
170 			return -EEXIST;
171 	}
172 
173 	rb_link_node(&tm->node, parent, new);
174 	rb_insert_color(&tm->node, tm_root);
175 	return 0;
176 }
177 
178 static inline bool skip_eb_logging(const struct extent_buffer *eb)
179 {
180 	const u64 owner = btrfs_header_owner(eb);
181 
182 	if (btrfs_header_level(eb) == 0)
183 		return true;
184 
185 	/*
186 	 * Tree mod logging exists so that a task iterating over the extents and
187 	 * backrefs of inodes sees a consistent view even while other tasks are
188 	 * modifying subvolume trees and the extent tree (including running
189 	 * delayed refs). So we only need to log extent buffers from the extent
190 	 * tree and subvolume trees.
191 	 */
192 
193 	if (owner == BTRFS_EXTENT_TREE_OBJECTID)
194 		return false;
195 
196 	if (btrfs_is_fstree(owner))
197 		return false;
198 
199 	return true;
200 }
201 
202 /*
203  * Determines if logging can be omitted. Returns true if it can. Otherwise, it
204  * returns false with fs_info::tree_mod_log_lock held for writing. The caller
205  * must keep it held until all tree mod log insertions are recorded in the rb
206  * tree, and then release it with a write unlock.
207  */
208 static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb)
209 {
210 	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
211 		return true;
212 	if (eb && skip_eb_logging(eb))
213 		return true;
214 
215 	write_lock(&fs_info->tree_mod_log_lock);
216 	if (list_empty(&fs_info->tree_mod_seq_list)) {
217 		write_unlock(&fs_info->tree_mod_log_lock);
218 		return true;
219 	}
220 
221 	return false;
222 }
223 
224 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
225 static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
226 			      const struct extent_buffer *eb)
227 {
228 	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
229 		return false;
230 	if (eb && skip_eb_logging(eb))
231 		return false;
232 
233 	return true;
234 }
235 
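/*
 * Condensed sketch (editorial) of the pattern that the insertion helpers
 * below follow: an unlocked fast path check, allocations done without the
 * lock held, then a locked re-check before recording the elements. Error
 * handling for a failed allocation is omitted here; see the real helpers:
 *
 *	if (!tree_mod_need_log(fs_info, eb))
 *		return 0;
 *	tm = alloc_tree_mod_elem(eb, slot, op);
 *	if (tree_mod_dont_log(fs_info, eb)) {
 *		kfree(tm);
 *		return 0;
 *	}
 *	ret = tree_mod_log_insert(fs_info, tm);
 *	write_unlock(&fs_info->tree_mod_log_lock);
 */
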
236 static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
237 						 int slot,
238 						 enum btrfs_mod_log_op op)
239 {
240 	struct tree_mod_elem *tm;
241 
242 	/* Can't be one of these types, due to union in struct tree_mod_elem. */
243 	ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS);
244 	ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE);
245 
246 	tm = kzalloc_obj(*tm, GFP_NOFS);
247 	if (!tm)
248 		return NULL;
249 
250 	tm->logical = eb->start;
251 	btrfs_node_key(eb, &tm->slot_change.key, slot);
252 	tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot);
253 	tm->op = op;
254 	tm->slot = slot;
255 	tm->generation = btrfs_node_ptr_generation(eb, slot);
256 	RB_CLEAR_NODE(&tm->node);
257 
258 	return tm;
259 }
260 
261 int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
262 				  enum btrfs_mod_log_op op)
263 {
264 	struct tree_mod_elem *tm;
265 	int ret = 0;
266 
267 	if (!tree_mod_need_log(eb->fs_info, eb))
268 		return 0;
269 
270 	tm = alloc_tree_mod_elem(eb, slot, op);
271 	if (!tm)
272 		ret = -ENOMEM;
273 
274 	if (tree_mod_dont_log(eb->fs_info, eb)) {
275 		kfree(tm);
276 		/*
277 		 * Don't error if we failed to allocate memory because we don't
278 		 * need to log.
279 		 */
280 		return 0;
281 	} else if (ret != 0) {
282 		/*
283 		 * We previously failed to allocate memory and we need to log,
284 		 * so we have to fail.
285 		 */
286 		goto out_unlock;
287 	}
288 
289 	ret = tree_mod_log_insert(eb->fs_info, tm);
290 out_unlock:
291 	write_unlock(&eb->fs_info->tree_mod_log_lock);
292 	if (ret)
293 		kfree(tm);
294 
295 	return ret;
296 }
297 
298 static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb,
299 						     int dst_slot, int src_slot,
300 						     int nr_items)
301 {
302 	struct tree_mod_elem *tm;
303 
304 	tm = kzalloc_obj(*tm, GFP_NOFS);
305 	if (!tm)
306 		return ERR_PTR(-ENOMEM);
307 
308 	tm->logical = eb->start;
309 	tm->slot = src_slot;
310 	tm->move.dst_slot = dst_slot;
311 	tm->move.nr_items = nr_items;
312 	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
313 	RB_CLEAR_NODE(&tm->node);
314 
315 	return tm;
316 }
317 
318 int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb,
319 				   int dst_slot, int src_slot,
320 				   int nr_items)
321 {
322 	struct tree_mod_elem *tm = NULL;
323 	struct tree_mod_elem **tm_list = NULL;
324 	int ret = 0;
325 	int i;
326 	bool locked = false;
327 
328 	if (!tree_mod_need_log(eb->fs_info, eb))
329 		return 0;
330 
331 	tm_list = kzalloc_objs(struct tree_mod_elem *, nr_items, GFP_NOFS);
332 	if (!tm_list) {
333 		ret = -ENOMEM;
334 		goto lock;
335 	}
336 
337 	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
338 	if (IS_ERR(tm)) {
339 		ret = PTR_ERR(tm);
340 		tm = NULL;
341 		goto lock;
342 	}
343 
344 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
345 		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
346 				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
347 		if (!tm_list[i]) {
348 			ret = -ENOMEM;
349 			goto lock;
350 		}
351 	}
352 
353 lock:
354 	if (tree_mod_dont_log(eb->fs_info, eb)) {
355 		/*
356 		 * Don't error if we failed to allocate memory because we don't
357 		 * need to log.
358 		 */
359 		ret = 0;
360 		goto free_tms;
361 	}
362 	locked = true;
363 
364 	/*
365 	 * We previously failed to allocate memory and we need to log, so we
366 	 * have to fail.
367 	 */
368 	if (ret != 0)
369 		goto free_tms;
370 
371 	/*
372 	 * When we override something during the move, we log these removals.
373 	 * This can only happen when we move towards the beginning of the
374 	 * buffer, i.e. dst_slot < src_slot.
375 	 */
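	/*
	 * Example (illustrative): moving 3 items from src_slot 5 to dst_slot 2
	 * overwrites what was in slots 2-4, so the loop below logs a
	 * KEY_REMOVE_WHILE_MOVING element for each of those slots before the
	 * move element itself is inserted.
	 */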
376 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
377 		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
378 		if (ret)
379 			goto free_tms;
380 	}
381 
382 	ret = tree_mod_log_insert(eb->fs_info, tm);
383 	if (ret)
384 		goto free_tms;
385 	write_unlock(&eb->fs_info->tree_mod_log_lock);
386 	kfree(tm_list);
387 
388 	return 0;
389 
390 free_tms:
391 	if (tm_list) {
392 		for (i = 0; i < nr_items; i++) {
393 			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
394 				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
395 			kfree(tm_list[i]);
396 		}
397 	}
398 	if (locked)
399 		write_unlock(&eb->fs_info->tree_mod_log_lock);
400 	kfree(tm_list);
401 	kfree(tm);
402 
403 	return ret;
404 }
405 
406 static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
407 				struct tree_mod_elem **tm_list,
408 				int nritems)
409 {
410 	int i, j;
411 	int ret;
412 
413 	for (i = nritems - 1; i >= 0; i--) {
414 		ret = tree_mod_log_insert(fs_info, tm_list[i]);
415 		if (ret) {
416 			for (j = nritems - 1; j > i; j--)
417 				rb_erase(&tm_list[j]->node,
418 					 &fs_info->tree_mod_log);
419 			return ret;
420 		}
421 	}
422 
423 	return 0;
424 }
425 
426 int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
427 				   struct extent_buffer *new_root,
428 				   bool log_removal)
429 {
430 	struct btrfs_fs_info *fs_info = old_root->fs_info;
431 	struct tree_mod_elem *tm = NULL;
432 	struct tree_mod_elem **tm_list = NULL;
433 	int nritems = 0;
434 	int ret = 0;
435 	int i;
436 
437 	if (!tree_mod_need_log(fs_info, NULL))
438 		return 0;
439 
440 	if (log_removal && btrfs_header_level(old_root) > 0) {
441 		nritems = btrfs_header_nritems(old_root);
442 		tm_list = kzalloc_objs(struct tree_mod_elem *, nritems,
443 				       GFP_NOFS);
444 		if (!tm_list) {
445 			ret = -ENOMEM;
446 			goto lock;
447 		}
448 		for (i = 0; i < nritems; i++) {
449 			tm_list[i] = alloc_tree_mod_elem(old_root, i,
450 			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
451 			if (!tm_list[i]) {
452 				ret = -ENOMEM;
453 				goto lock;
454 			}
455 		}
456 	}
457 
458 	tm = kzalloc_obj(*tm, GFP_NOFS);
459 	if (!tm) {
460 		ret = -ENOMEM;
461 		goto lock;
462 	}
463 
464 	tm->logical = new_root->start;
465 	tm->old_root.logical = old_root->start;
466 	tm->old_root.level = btrfs_header_level(old_root);
467 	tm->generation = btrfs_header_generation(old_root);
468 	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
469 
470 lock:
471 	if (tree_mod_dont_log(fs_info, NULL)) {
472 		/*
473 		 * Don't error if we failed to allocate memory because we don't
474 		 * need to log.
475 		 */
476 		ret = 0;
477 		goto free_tms;
478 	} else if (ret != 0) {
479 		/*
480 		 * We previously failed to allocate memory and we need to log,
481 		 * so we have to fail.
482 		 */
483 		goto out_unlock;
484 	}
485 
486 	if (tm_list)
487 		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
488 	if (!ret)
489 		ret = tree_mod_log_insert(fs_info, tm);
490 
491 out_unlock:
492 	write_unlock(&fs_info->tree_mod_log_lock);
493 	if (ret)
494 		goto free_tms;
495 	kfree(tm_list);
496 
497 	return ret;
498 
499 free_tms:
500 	if (tm_list) {
501 		for (i = 0; i < nritems; i++)
502 			kfree(tm_list[i]);
503 		kfree(tm_list);
504 	}
505 	kfree(tm);
506 
507 	return ret;
508 }
509 
510 static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
511 						   u64 start, u64 min_seq,
512 						   bool smallest)
513 {
514 	struct rb_root *tm_root;
515 	struct rb_node *node;
516 	struct tree_mod_elem *cur = NULL;
517 	struct tree_mod_elem *found = NULL;
518 
519 	read_lock(&fs_info->tree_mod_log_lock);
520 	tm_root = &fs_info->tree_mod_log;
521 	node = tm_root->rb_node;
522 	while (node) {
523 		cur = rb_entry(node, struct tree_mod_elem, node);
524 		if (cur->logical < start) {
525 			node = node->rb_left;
526 		} else if (cur->logical > start) {
527 			node = node->rb_right;
528 		} else if (cur->seq < min_seq) {
529 			node = node->rb_left;
530 		} else if (!smallest) {
531 			/* We want the node with the highest seq */
532 			if (found)
533 				BUG_ON(found->seq > cur->seq);
534 			found = cur;
535 			node = node->rb_left;
536 		} else if (cur->seq > min_seq) {
537 			/* We want the node with the smallest seq */
538 			if (found)
539 				BUG_ON(found->seq < cur->seq);
540 			found = cur;
541 			node = node->rb_right;
542 		} else {
543 			found = cur;
544 			break;
545 		}
546 	}
547 	read_unlock(&fs_info->tree_mod_log_lock);
548 
549 	return found;
550 }
551 
552 /*
553  * This returns the element from the log with the smallest time sequence
554  * value (the oldest log item). Any element with a time sequence lower than
555  * min_seq will be ignored.
556  */
557 static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
558 							u64 start, u64 min_seq)
559 {
560 	return __tree_mod_log_search(fs_info, start, min_seq, true);
561 }
562 
563 /*
564  * This returns the element from the log with the largest time sequence
565  * value (the most recent log item). Any element with a time sequence lower
566  * than min_seq will be ignored.
567  */
568 static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
569 						 u64 start, u64 min_seq)
570 {
571 	return __tree_mod_log_search(fs_info, start, min_seq, false);
572 }
573 
574 int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
575 			       const struct extent_buffer *src,
576 			       unsigned long dst_offset,
577 			       unsigned long src_offset,
578 			       int nr_items)
579 {
580 	struct btrfs_fs_info *fs_info = dst->fs_info;
581 	int ret = 0;
582 	struct tree_mod_elem **tm_list = NULL;
583 	struct tree_mod_elem **tm_list_add = NULL;
584 	struct tree_mod_elem **tm_list_rem = NULL;
585 	int i;
586 	bool locked = false;
587 	struct tree_mod_elem *dst_move_tm = NULL;
588 	struct tree_mod_elem *src_move_tm = NULL;
589 	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
590 	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);
591 
592 	if (!tree_mod_need_log(fs_info, NULL))
593 		return 0;
594 
595 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
596 		return 0;
597 
598 	tm_list = kzalloc_objs(struct tree_mod_elem *, nr_items * 2, GFP_NOFS);
599 	if (!tm_list) {
600 		ret = -ENOMEM;
601 		goto lock;
602 	}
603 
604 	if (dst_move_nr_items) {
605 		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
606 						      dst_offset, dst_move_nr_items);
607 		if (IS_ERR(dst_move_tm)) {
608 			ret = PTR_ERR(dst_move_tm);
609 			dst_move_tm = NULL;
610 			goto lock;
611 		}
612 	}
613 	if (src_move_nr_items) {
614 		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
615 						      src_offset + nr_items,
616 						      src_move_nr_items);
617 		if (IS_ERR(src_move_tm)) {
618 			ret = PTR_ERR(src_move_tm);
619 			src_move_tm = NULL;
620 			goto lock;
621 		}
622 	}
623 
624 	tm_list_add = tm_list;
625 	tm_list_rem = tm_list + nr_items;
626 	for (i = 0; i < nr_items; i++) {
627 		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
628 						     BTRFS_MOD_LOG_KEY_REMOVE);
629 		if (!tm_list_rem[i]) {
630 			ret = -ENOMEM;
631 			goto lock;
632 		}
633 
634 		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
635 						     BTRFS_MOD_LOG_KEY_ADD);
636 		if (!tm_list_add[i]) {
637 			ret = -ENOMEM;
638 			goto lock;
639 		}
640 	}
641 
642 lock:
643 	if (tree_mod_dont_log(fs_info, NULL)) {
644 		/*
645 		 * Don't error if we failed to allocate memory because we don't
646 		 * need to log.
647 		 */
648 		ret = 0;
649 		goto free_tms;
650 	}
651 	locked = true;
652 
653 	/*
654 	 * We previously failed to allocate memory and we need to log, so we
655 	 * have to fail.
656 	 */
657 	if (ret != 0)
658 		goto free_tms;
659 
660 	if (dst_move_tm) {
661 		ret = tree_mod_log_insert(fs_info, dst_move_tm);
662 		if (ret)
663 			goto free_tms;
664 	}
665 	for (i = 0; i < nr_items; i++) {
666 		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
667 		if (ret)
668 			goto free_tms;
669 		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
670 		if (ret)
671 			goto free_tms;
672 	}
673 	if (src_move_tm) {
674 		ret = tree_mod_log_insert(fs_info, src_move_tm);
675 		if (ret)
676 			goto free_tms;
677 	}
678 
679 	write_unlock(&fs_info->tree_mod_log_lock);
680 	kfree(tm_list);
681 
682 	return 0;
683 
684 free_tms:
685 	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
686 		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
687 	kfree(dst_move_tm);
688 	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
689 		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
690 	kfree(src_move_tm);
691 	if (tm_list) {
692 		for (i = 0; i < nr_items * 2; i++) {
693 			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
694 				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
695 			kfree(tm_list[i]);
696 		}
697 	}
698 	if (locked)
699 		write_unlock(&fs_info->tree_mod_log_lock);
700 	kfree(tm_list);
701 
702 	return ret;
703 }
704 
705 int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
706 {
707 	struct tree_mod_elem **tm_list = NULL;
708 	int nritems = 0;
709 	int i;
710 	int ret = 0;
711 
712 	if (!tree_mod_need_log(eb->fs_info, eb))
713 		return 0;
714 
715 	nritems = btrfs_header_nritems(eb);
716 	tm_list = kzalloc_objs(struct tree_mod_elem *, nritems, GFP_NOFS);
717 	if (!tm_list) {
718 		ret = -ENOMEM;
719 		goto lock;
720 	}
721 
722 	for (i = 0; i < nritems; i++) {
723 		tm_list[i] = alloc_tree_mod_elem(eb, i,
724 				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
725 		if (!tm_list[i]) {
726 			ret = -ENOMEM;
727 			goto lock;
728 		}
729 	}
730 
731 lock:
732 	if (tree_mod_dont_log(eb->fs_info, eb)) {
733 		/*
734 		 * Don't error if we failed to allocate memory because we don't
735 		 * need to log.
736 		 */
737 		ret = 0;
738 		goto free_tms;
739 	} else if (ret != 0) {
740 		/*
741 		 * We previously failed to allocate memory and we need to log,
742 		 * so we have to fail.
743 		 */
744 		goto out_unlock;
745 	}
746 
747 	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
748 out_unlock:
749 	write_unlock(&eb->fs_info->tree_mod_log_lock);
750 	if (ret)
751 		goto free_tms;
752 	kfree(tm_list);
753 
754 	return 0;
755 
756 free_tms:
757 	if (tm_list) {
758 		for (i = 0; i < nritems; i++)
759 			kfree(tm_list[i]);
760 		kfree(tm_list);
761 	}
762 
763 	return ret;
764 }
765 
766 /*
767  * Returns the logical address of the oldest predecessor of the given root.
768  * Entries older than time_seq are ignored.
769  */
770 static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
771 						      u64 time_seq)
772 {
773 	struct tree_mod_elem *tm;
774 	struct tree_mod_elem *found = NULL;
775 	u64 root_logical = eb_root->start;
776 	bool looped = false;
777 
778 	if (!time_seq)
779 		return NULL;
780 
781 	/*
782 	 * The very last operation that's logged for a root is the replacement
783 	 * operation (if it is replaced at all). That operation is keyed by the
784 	 * logical address of the *new* root, which makes it the very first
785 	 * operation that's logged for the new root.
786 	 */
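	/*
	 * Illustrative chain (editorial example): if root R0 was replaced by R1
	 * and later R1 by R2, then starting from R2's logical the loop below
	 * finds the ROOT_REPLACE element whose old_root points to R1, moves on
	 * to R1's logical to find the one pointing to R0, and stops once the
	 * oldest element found for the current logical is not a ROOT_REPLACE
	 * (or no element newer than time_seq exists at all).
	 */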
787 	while (1) {
788 		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
789 						time_seq);
790 		if (!looped && !tm)
791 			return NULL;
792 		/*
793 		 * If there are no tree operations for the oldest root, we simply
794 		 * return it. This should only happen if that (old) root is at
795 		 * level 0.
796 		 */
797 		if (!tm)
798 			break;
799 
800 		/*
801 		 * If there's an operation that's not a root replacement, we
802 		 * found the oldest version of our root. Normally, we'll find a
803 		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
804 		 */
805 		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
806 			break;
807 
808 		found = tm;
809 		root_logical = tm->old_root.logical;
810 		looped = true;
811 	}
812 
813 	/* If there's no old root to return, return what we found instead */
814 	if (!found)
815 		found = tm;
816 
817 	return found;
818 }
819 
820 
821 /*
822  * tm is a pointer to the first operation to rewind within eb. Then, all
823  * previous operations will be rewound (until we reach something older than
824  * time_seq).
825  */
826 static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
827 				struct extent_buffer *eb,
828 				u64 time_seq,
829 				struct tree_mod_elem *first_tm)
830 {
831 	u32 n;
832 	struct rb_node *next;
833 	struct tree_mod_elem *tm = first_tm;
834 	unsigned long o_dst;
835 	unsigned long o_src;
836 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
837 	/*
838 	 * max_slot tracks the maximum valid slot of the rewind eb at every
839 	 * step of the rewind. This is in contrast with 'n' which eventually
840 	 * matches the number of items, but can be wrong during moves or if
841 	 * removes overlap on already valid slots (which is probably separately
842 	 * a bug). We do this to validate the offsets of memmoves for rewinding
843 	 * moves and detect invalid memmoves.
844 	 *
845 	 * Since a rewind eb can start empty, max_slot is a signed integer with
846 	 * a special meaning for -1, which is that no slot is valid to move out
847 	 * of. Any other negative value is invalid.
848 	 */
849 	int max_slot;
850 	int move_src_end_slot;
851 	int move_dst_end_slot;
852 
853 	n = btrfs_header_nritems(eb);
854 	max_slot = n - 1;
855 	read_lock(&fs_info->tree_mod_log_lock);
856 	while (tm && tm->seq >= time_seq) {
857 		ASSERT(max_slot >= -1);
858 		/*
859 		 * All the operations are recorded with the operator used for
860 		 * the modification. As we're going backwards, we do the
861 		 * opposite of each operation here.
862 		 */
863 		switch (tm->op) {
864 		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
865 			BUG_ON(tm->slot < n);
866 			fallthrough;
867 		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
868 		case BTRFS_MOD_LOG_KEY_REMOVE:
869 			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
870 			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
871 			btrfs_set_node_ptr_generation(eb, tm->slot,
872 						      tm->generation);
873 			n++;
874 			if (tm->slot > max_slot)
875 				max_slot = tm->slot;
876 			break;
877 		case BTRFS_MOD_LOG_KEY_REPLACE:
878 			BUG_ON(tm->slot >= n);
879 			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
880 			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
881 			btrfs_set_node_ptr_generation(eb, tm->slot,
882 						      tm->generation);
883 			break;
884 		case BTRFS_MOD_LOG_KEY_ADD:
885 			/*
886 			 * It is possible we could have already removed keys
887 			 * behind the known max slot, so this will be an
888 			 * overestimate. In practice, the copy operation
889 			 * inserts them in increasing order, and overestimating
890 			 * just means we miss some warnings, so it's OK. It
891 			 * isn't worth carefully tracking the full array of
892 			 * valid slots to check against when moving.
893 			 */
894 			if (tm->slot == max_slot)
895 				max_slot--;
896 			/* If a move operation is needed it's in the log. */
897 			n--;
898 			break;
899 		case BTRFS_MOD_LOG_MOVE_KEYS:
900 			ASSERT(tm->move.nr_items > 0);
901 			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
902 			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
903 			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
904 			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
905 			if (WARN_ON(move_src_end_slot > max_slot ||
906 				    tm->move.nr_items <= 0)) {
907 				btrfs_warn(fs_info,
908 "move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
909 					   eb->start, tm->slot,
910 					   tm->move.dst_slot, tm->move.nr_items,
911 					   tm->seq, n, max_slot);
912 			}
913 			memmove_extent_buffer(eb, o_dst, o_src,
914 					      tm->move.nr_items * p_size);
915 			max_slot = move_dst_end_slot;
916 			break;
917 		case BTRFS_MOD_LOG_ROOT_REPLACE:
918 			/*
919 			 * This operation is special. For roots, this must be
920 			 * handled explicitly before rewinding.
921 			 * For non-roots, this operation may exist if the node
922 			 * was a root: root A -> child B; then A becomes empty and
923 			 * B is promoted to the new root. In the mod log, we'll
924 			 * have a root-replace operation for B, a tree block
925 			 * that is not a root. We simply ignore that operation.
926 			 */
927 			break;
928 		}
929 		next = rb_next(&tm->node);
930 		if (!next)
931 			break;
932 		tm = rb_entry(next, struct tree_mod_elem, node);
933 		if (tm->logical != first_tm->logical)
934 			break;
935 	}
936 	read_unlock(&fs_info->tree_mod_log_lock);
937 	btrfs_set_header_nritems(eb, n);
938 }
939 
940 /*
941  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
942  * is returned. If rewind operations happen, a fresh buffer is returned. The
943  * returned buffer is always read-locked. If the returned buffer is not the
944  * input buffer, the lock on the input buffer is released and the input buffer
945  * is freed (its refcount is decremented).
946  */
947 struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
948 						struct extent_buffer *eb,
949 						u64 time_seq)
950 {
951 	struct extent_buffer *eb_rewin;
952 	struct tree_mod_elem *tm;
953 
954 	if (!time_seq)
955 		return eb;
956 
957 	if (btrfs_header_level(eb) == 0)
958 		return eb;
959 
960 	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
961 	if (!tm)
962 		return eb;
963 
964 	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
965 		BUG_ON(tm->slot != 0);
966 		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
967 		if (!eb_rewin) {
968 			btrfs_tree_read_unlock(eb);
969 			free_extent_buffer(eb);
970 			return NULL;
971 		}
972 		btrfs_set_header_bytenr(eb_rewin, eb->start);
973 		btrfs_set_header_backref_rev(eb_rewin,
974 					     btrfs_header_backref_rev(eb));
975 		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
976 		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
977 	} else {
978 		eb_rewin = btrfs_clone_extent_buffer(eb);
979 		if (!eb_rewin) {
980 			btrfs_tree_read_unlock(eb);
981 			free_extent_buffer(eb);
982 			return NULL;
983 		}
984 	}
985 
986 	btrfs_tree_read_unlock(eb);
987 	free_extent_buffer(eb);
988 
989 	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
990 				       eb_rewin, btrfs_header_level(eb_rewin));
991 	btrfs_tree_read_lock(eb_rewin);
992 	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
993 	WARN_ON(btrfs_header_nritems(eb_rewin) >
994 		BTRFS_NODEPTRS_PER_BLOCK(fs_info));
995 
996 	return eb_rewin;
997 }
998 
999 /*
1000  * Rewind the state of @root's root node to the given @time_seq value.
1001  * If there are no changes, the current root->node is returned. If anything
1002  * changed in between, there's a fresh buffer allocated on which the rewind
1003  * operations are done. In any case, the returned buffer is read locked.
1004  * Returns NULL on error (with no locks held).
1005  */
1006 struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
1007 {
1008 	struct btrfs_fs_info *fs_info = root->fs_info;
1009 	struct tree_mod_elem *tm;
1010 	struct extent_buffer *eb = NULL;
1011 	struct extent_buffer *eb_root;
1012 	u64 eb_root_owner = 0;
1013 	struct extent_buffer *old;
1014 	struct tree_mod_root *old_root = NULL;
1015 	u64 old_generation = 0;
1016 	u64 logical;
1017 	int level;
1018 
1019 	eb_root = btrfs_read_lock_root_node(root);
1020 	tm = tree_mod_log_oldest_root(eb_root, time_seq);
1021 	if (!tm)
1022 		return eb_root;
1023 
1024 	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
1025 		old_root = &tm->old_root;
1026 		old_generation = tm->generation;
1027 		logical = old_root->logical;
1028 		level = old_root->level;
1029 	} else {
1030 		logical = eb_root->start;
1031 		level = btrfs_header_level(eb_root);
1032 	}
1033 
1034 	tm = tree_mod_log_search(fs_info, logical, time_seq);
1035 	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1036 		struct btrfs_tree_parent_check check = { 0 };
1037 
1038 		btrfs_tree_read_unlock(eb_root);
1039 		free_extent_buffer(eb_root);
1040 
1041 		check.level = level;
1042 		check.owner_root = btrfs_root_id(root);
1043 
1044 		old = read_tree_block(fs_info, logical, &check);
1045 		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1046 			if (!IS_ERR(old))
1047 				free_extent_buffer(old);
1048 			btrfs_warn(fs_info,
1049 				   "failed to read tree block %llu from get_old_root",
1050 				   logical);
1051 		} else {
1052 			struct tree_mod_elem *tm2;
1053 
1054 			btrfs_tree_read_lock(old);
1055 			eb = btrfs_clone_extent_buffer(old);
1056 			/*
1057 			 * After the lookup for the most recent tree mod operation
1058 			 * above and before we locked and cloned the extent buffer
1059 			 * 'old', a new tree mod log operation may have been added.
1060 			 * So lookup for a more recent one to make sure the number
1061 			 * of mod log operations we replay is consistent with the
1062 			 * number of items we have in the cloned extent buffer,
1063 			 * otherwise we can hit a BUG_ON when rewinding the extent
1064 			 * buffer.
1065 			 */
1066 			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
1067 			btrfs_tree_read_unlock(old);
1068 			free_extent_buffer(old);
1069 			ASSERT(tm2);
1070 			ASSERT(tm2 == tm || tm2->seq > tm->seq);
1071 			if (!tm2 || tm2->seq < tm->seq) {
1072 				free_extent_buffer(eb);
1073 				return NULL;
1074 			}
1075 			tm = tm2;
1076 		}
1077 	} else if (old_root) {
1078 		eb_root_owner = btrfs_header_owner(eb_root);
1079 		btrfs_tree_read_unlock(eb_root);
1080 		free_extent_buffer(eb_root);
1081 		eb = alloc_dummy_extent_buffer(fs_info, logical);
1082 	} else {
1083 		eb = btrfs_clone_extent_buffer(eb_root);
1084 		btrfs_tree_read_unlock(eb_root);
1085 		free_extent_buffer(eb_root);
1086 	}
1087 
1088 	if (!eb)
1089 		return NULL;
1090 	if (old_root) {
1091 		btrfs_set_header_bytenr(eb, eb->start);
1092 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1093 		btrfs_set_header_owner(eb, eb_root_owner);
1094 		btrfs_set_header_level(eb, old_root->level);
1095 		btrfs_set_header_generation(eb, old_generation);
1096 	}
1097 	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
1098 				       btrfs_header_level(eb));
1099 	btrfs_tree_read_lock(eb);
1100 	if (tm)
1101 		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1102 	else
1103 		WARN_ON(btrfs_header_level(eb) != 0);
1104 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1105 
1106 	return eb;
1107 }
1108 
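/*
 * Illustrative caller sketch (editorial, not code from this file): the buffer
 * returned by btrfs_get_old_root() is read locked and must be unlocked and
 * released by the caller once it is done with it:
 *
 *	eb = btrfs_get_old_root(root, time_seq);
 *	if (eb) {
 *		... read-only walk of the rewound root node ...
 *		btrfs_tree_read_unlock(eb);
 *		free_extent_buffer(eb);
 *	}
 */
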
1109 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1110 {
1111 	struct tree_mod_elem *tm;
1112 	int level;
1113 	struct extent_buffer *eb_root = btrfs_root_node(root);
1114 
1115 	tm = tree_mod_log_oldest_root(eb_root, time_seq);
1116 	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
1117 		level = tm->old_root.level;
1118 	else
1119 		level = btrfs_header_level(eb_root);
1120 
1121 	free_extent_buffer(eb_root);
1122 
1123 	return level;
1124 }
1125 
1126 /*
1127  * Return the lowest sequence number among the tree modification log users.
1128  *
1129  * This is the sequence number of the oldest tree modification log user, which
1130  * corresponds to the lowest sequence number of all existing users. If there are
1131  * no users it returns 0.
1132  */
1133 u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
1134 {
1135 	u64 ret = 0;
1136 
1137 	read_lock(&fs_info->tree_mod_log_lock);
1138 	if (!list_empty(&fs_info->tree_mod_seq_list)) {
1139 		struct btrfs_seq_list *elem;
1140 
1141 		elem = list_first_entry(&fs_info->tree_mod_seq_list,
1142 					struct btrfs_seq_list, list);
1143 		ret = elem->seq;
1144 	}
1145 	read_unlock(&fs_info->tree_mod_log_lock);
1146 
1147 	return ret;
1148 }
1149