xref: /linux/fs/btrfs/tree-mod-log.c (revision f92b71ffca8c7e45e194aecc85e31bd11582f4d2)
// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	union {
		/*
		 * This is used for the following op types:
		 *
		 *    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING
		 *    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING
		 *    BTRFS_MOD_LOG_KEY_REMOVE
		 *    BTRFS_MOD_LOG_KEY_REPLACE
		 */
		struct {
			struct btrfs_disk_key key;
			u64 blockptr;
		} slot_change;

		/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
		struct {
			int dst_slot;
			int nr_items;
		} move;

		/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
		struct tree_mod_root old_root;
	};
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should make sure elem->seq is zero before
 * calling btrfs_get_tree_mod_seq().
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

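/*
 * Drop a blocker added with btrfs_get_tree_mod_seq(). If this was the blocker
 * with the lowest sequence number, every tree mod log element with a sequence
 * number below the new minimum is no longer needed and is removed from the
 * log.
 */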
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * A blocker with a lower sequence number exists, we
			 * cannot remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
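
/*
 * Typical usage of the pair above (a sketch, assuming the
 * BTRFS_SEQ_LIST_INIT() initializer from fs.h): a reader that needs a
 * consistent view of a tree takes a sequence number before iterating and
 * drops it when done:
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... resolve backrefs, rewind buffers with btrfs_get_old_root(), ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */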

/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

static inline bool skip_eb_logging(const struct extent_buffer *eb)
{
	const u64 owner = btrfs_header_owner(eb);

	if (btrfs_header_level(eb) == 0)
		return true;

	/*
	 * Tree mod logging exists so that there's a consistent view of the
	 * extents and backrefs of inodes even if, while a task is iterating
	 * over them, other tasks are modifying subvolume trees and the extent
	 * tree (including running delayed refs). So we only need to log extent
	 * buffers from the extent tree and subvolume trees.
	 */

	if (owner == BTRFS_EXTENT_TREE_OBJECTID)
		return false;

	if (btrfs_is_fstree(owner))
		return false;

	return true;
}

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && skip_eb_logging(eb))
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

/* Similar to tree_mod_dont_log(), but doesn't acquire any locks. */
static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
			      const struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && skip_eb_logging(eb))
		return false;

	return true;
}

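/*
 * Allocate and fill a log element for a single-slot operation (key add,
 * remove or replace), recording the key, block pointer and generation
 * currently stored at @slot of @eb. The element is not yet inserted into the
 * log tree.
 */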
static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	/* Can't be one of these types, due to union in struct tree_mod_elem. */
	ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS);
	ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE);

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	btrfs_node_key(eb, &tm->slot_change.key, slot);
	tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot);
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

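/*
 * Log a single-slot operation for @eb. Memory allocation failures are ignored
 * if logging turns out to be unnecessary; otherwise they are returned as
 * -ENOMEM.
 */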
int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

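/*
 * Allocate and fill a log element for a move of @nr_items items from
 * @src_slot to @dst_slot within @eb. The element is not yet inserted into the
 * log tree.
 */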
static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb,
						     int dst_slot, int src_slot,
						     int nr_items)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return ERR_PTR(-ENOMEM);

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

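/*
 * Log a move of @nr_items items within @eb. Moves towards the beginning of
 * the buffer overwrite the destination slots, so removals are logged for the
 * overwritten slots in addition to the move itself.
 */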
int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
	if (IS_ERR(tm)) {
		ret = PTR_ERR(tm);
		tm = NULL;
		goto lock;
	}

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nr_items; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

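/*
 * Insert the per-slot removal elements for a buffer that is being freed, in
 * reverse slot order. On failure, any elements already inserted are removed
 * from the log tree again.
 */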
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				struct tree_mod_elem **tm_list,
				int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

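/*
 * Log the replacement of the root of a tree: a removal for each slot of the
 * old root (if @log_removal is true and the old root is not a leaf) followed
 * by a BTRFS_MOD_LOG_ROOT_REPLACE element keyed on the new root's logical
 * address.
 */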
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

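/*
 * Search the log for an element with the given @start address and a sequence
 * number of at least @min_seq. If @smallest is true, the element with the
 * smallest such sequence number is returned, otherwise the one with the
 * largest.
 */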
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}

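/*
 * Log the copy of @nr_items key pointers from @src to @dst, as done when
 * balancing two sibling node buffers: a removal for each copied slot of @src,
 * an addition for each target slot of @dst, plus move elements recording how
 * the remaining items in each buffer were shifted.
 */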
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       const struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
						      dst_offset, dst_move_nr_items);
		if (IS_ERR(dst_move_tm)) {
			ret = PTR_ERR(dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
						      src_offset + nr_items,
						      src_move_nr_items);
		if (IS_ERR(src_move_tm)) {
			ret = PTR_ERR(src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

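/*
 * Log that the node @eb is getting freed: one removal element per slot, so
 * that the buffer's previous content can be reconstructed when rewinding.
 */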
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}

	return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we
		 * simply return it. This should only happen if that (old) root
		 * is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead. */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the rewind eb at every
	 * step of the rewind. This is in contrast with 'n' which eventually
	 * matches the number of items, but can be wrong during moves or if
	 * removes overlap on already valid slots (which is probably separately
	 * a bug). We do this to validate the offsets of memmoves for rewinding
	 * moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* If a move operation is needed it's in the log. */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = btrfs_root_id(root);

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

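/*
 * Return the level that @root's root node had at the time of @time_seq. If
 * the root was not replaced since then, this is the level of the current
 * root node.
 */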
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there
 * are no users, it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}