// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"

struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
struct kmem_cache *btrfs_delayed_data_ref_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;

/*
 * Delayed back reference update tracking.  For subvolume trees we queue up
 * extent allocations and backref maintenance for delayed processing.  This
 * avoids deep call chains where we add extents in the middle of
 * btrfs_search_slot, and it allows us to buffer up frequently modified
 * backrefs in an rb tree instead of hammering updates on the extent
 * allocation tree.
 */

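/*
 * A rough sketch of the lifecycle, for orientation only (the exact call
 * sites live in extent-tree.c and may differ):
 *
 *   1) During a tree or file modification, the caller queues the ref change
 *      with btrfs_add_delayed_tree_ref() or btrfs_add_delayed_data_ref()
 *      instead of updating the extent tree on the spot.
 *   2) The change is stored as a btrfs_delayed_ref_node hanging off a
 *      btrfs_delayed_ref_head that is keyed by bytenr in an rbtree.
 *   3) Later (at commit time, or when the backlog grows), heads are picked
 *      with btrfs_select_ref_head() and the accumulated, already-merged
 *      mods are applied to the extent allocation tree in one pass.
 */
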
/*
 * compare two delayed tree backrefs with same bytenr and type
 */
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
			  struct btrfs_delayed_tree_ref *ref2)
{
	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

/*
 * compare two delayed data backrefs with same bytenr and type
 */
static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
			  struct btrfs_delayed_data_ref *ref2)
{
	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
		if (ref1->objectid < ref2->objectid)
			return -1;
		if (ref1->objectid > ref2->objectid)
			return 1;
		if (ref1->offset < ref2->offset)
			return -1;
		if (ref1->offset > ref2->offset)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

static int comp_refs(struct btrfs_delayed_ref_node *ref1,
		     struct btrfs_delayed_ref_node *ref2,
		     bool check_seq)
{
	int ret = 0;

	if (ref1->type < ref2->type)
		return -1;
	if (ref1->type > ref2->type)
		return 1;
	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
				     btrfs_delayed_node_to_tree_ref(ref2));
	else
		ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
				     btrfs_delayed_node_to_data_ref(ref2));
	if (ret)
		return ret;
	if (check_seq) {
		if (ref1->seq < ref2->seq)
			return -1;
		if (ref1->seq > ref2->seq)
			return 1;
	}
	return 0;
}

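/*
 * Worked ordering example (illustrative values): for two refs on the same
 * bytenr, comp_refs() sorts first by type (BTRFS_TREE_BLOCK_REF_KEY sorts
 * before BTRFS_SHARED_BLOCK_REF_KEY, given the key values defined in
 * ctree.h), then by root or parent, and finally by seq when check_seq is
 * set, so an older mod (lower seq) on the same block sorts before a newer
 * one in the per-head rbtree.
 */
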
/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
						   struct rb_node *node)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_head *entry;
	struct btrfs_delayed_ref_head *ins;
	u64 bytenr;
	bool leftmost = true;

	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
	bytenr = ins->bytenr;
	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
				 href_node);

		if (bytenr < entry->bytenr) {
			p = &(*p)->rb_left;
		} else if (bytenr > entry->bytenr) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

static struct btrfs_delayed_ref_node *tree_insert(struct rb_root_cached *root,
		struct btrfs_delayed_ref_node *ins)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *node = &ins->ref_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	bool leftmost = true;

	while (*p) {
		int comp;

		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 ref_node);
		comp = comp_refs(ins, entry, true);
		if (comp < 0) {
			p = &(*p)->rb_left;
		} else if (comp > 0) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

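/*
 * Note on the two insert helpers above: they follow the rb_root_cached
 * pattern, where "leftmost" starts out true and is cleared as soon as the
 * walk descends to the right.  rb_insert_color_cached() then only updates
 * the cached leftmost node when the new entry really is the smallest,
 * which keeps rb_first_cached() O(1) for the in-order processing walks
 * below.
 */
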
static struct btrfs_delayed_ref_head *find_first_ref_head(
		struct btrfs_delayed_ref_root *dr)
{
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = rb_first_cached(&dr->href_root);
	if (!n)
		return NULL;

	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

	return entry;
}

/*
 * Find a head entry based on bytenr. This returns the delayed ref head if it
 * was able to find one, or NULL if nothing was in that spot.  If return_bigger
 * is given, the next bigger entry is returned if no exact match is found.
 */
static struct btrfs_delayed_ref_head *find_ref_head(
		struct btrfs_delayed_ref_root *dr, u64 bytenr,
		bool return_bigger)
{
	struct rb_root *root = &dr->href_root.rb_root;
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = root->rb_node;
	entry = NULL;
	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

		if (bytenr < entry->bytenr)
			n = n->rb_left;
		else if (bytenr > entry->bytenr)
			n = n->rb_right;
		else
			return entry;
	}
	if (entry && return_bigger) {
		if (bytenr > entry->bytenr) {
			n = rb_next(&entry->href_node);
			if (!n)
				return NULL;
			entry = rb_entry(n, struct btrfs_delayed_ref_head,
					 href_node);
		}
		return entry;
	}
	return NULL;
}

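/*
 * Worked example for find_ref_head() (hypothetical bytenrs): with heads at
 * 4096 and 16384, a lookup of 8192 finds no exact match.  With
 * return_bigger == false that is a NULL result; with return_bigger == true,
 * if the search bottomed out at the 4096 node we step to rb_next() and hand
 * back the 16384 head, and if it bottomed out at 16384 we return that head
 * directly.
 */
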
int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	refcount_inc(&head->refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (RB_EMPTY_NODE(&head->href_node)) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref_head(head);
	return 0;
}

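/*
 * A minimal caller sketch (hypothetical, not copied from extent-tree.c):
 * -EAGAIN means the head was removed from the rbtree while we slept on its
 * mutex, so the caller must forget this head and select another:
 *
 *	spin_lock(&delayed_refs->lock);
 *	head = btrfs_select_ref_head(delayed_refs);
 *	if (head && btrfs_delayed_ref_lock(delayed_refs, head) == -EAGAIN)
 *		head = NULL;	// it was freed under us, pick another one
 *	spin_unlock(&delayed_refs->lock);
 */
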
static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_root *delayed_refs,
				    struct btrfs_delayed_ref_head *head,
				    struct btrfs_delayed_ref_node *ref)
{
	lockdep_assert_held(&head->lock);
	rb_erase_cached(&ref->ref_node, &head->ref_tree);
	RB_CLEAR_NODE(&ref->ref_node);
	if (!list_empty(&ref->add_list))
		list_del(&ref->add_list);
	ref->in_tree = 0;
	btrfs_put_delayed_ref(ref);
	atomic_dec(&delayed_refs->num_entries);
}

static bool merge_ref(struct btrfs_trans_handle *trans,
		      struct btrfs_delayed_ref_root *delayed_refs,
		      struct btrfs_delayed_ref_head *head,
		      struct btrfs_delayed_ref_node *ref,
		      u64 seq)
{
	struct btrfs_delayed_ref_node *next;
	struct rb_node *node = rb_next(&ref->ref_node);
	bool done = false;

	while (!done && node) {
		int mod;

		next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		node = rb_next(node);
		if (seq && next->seq >= seq)
			break;
		if (comp_refs(ref, next, false))
			break;

		if (ref->action == next->action) {
			mod = next->ref_mod;
		} else {
			if (ref->ref_mod < next->ref_mod) {
				swap(ref, next);
				done = true;
			}
			mod = -next->ref_mod;
		}

		drop_delayed_ref(trans, delayed_refs, head, next);
		ref->ref_mod += mod;
		if (ref->ref_mod == 0) {
			drop_delayed_ref(trans, delayed_refs, head, ref);
			done = true;
		} else {
			/*
			 * Can't have multiples of the same ref on a tree block.
			 */
			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
		}
	}

	return done;
}

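/*
 * Worked example of the merge arithmetic above (illustrative ref_mods):
 * merging a DROP with ref_mod 1 into an ADD with ref_mod 2 gives mod == -1,
 * so the surviving ADD is left with ref_mod 1.  If both sides have
 * ref_mod 1 the sum reaches zero and the surviving ref is dropped too: the
 * two mods cancel without the extent tree ever being touched.
 */
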
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
			      struct btrfs_delayed_ref_root *delayed_refs,
			      struct btrfs_delayed_ref_head *head)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_node *ref;
	struct rb_node *node;
	u64 seq = 0;

	lockdep_assert_held(&head->lock);

	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		return;

	/* We don't have too many refs to merge for data. */
	if (head->is_data)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct seq_list, list);
		seq = elem->seq;
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

again:
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		if (seq && ref->seq >= seq)
			continue;
		if (merge_ref(trans, delayed_refs, head, ref, seq))
			goto again;
	}
}

int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
	struct seq_list *elem;
	int ret = 0;

	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct seq_list, list);
		if (seq >= elem->seq) {
			btrfs_debug(fs_info,
				"holding back delayed_ref %#x.%x, lowest is %#x.%x",
				(u32)(seq >> 32), (u32)seq,
				(u32)(elem->seq >> 32), (u32)elem->seq);
			ret = 1;
		}
	}

	spin_unlock(&fs_info->tree_mod_seq_lock);
	return ret;
}

struct btrfs_delayed_ref_head *btrfs_select_ref_head(
		struct btrfs_delayed_ref_root *delayed_refs)
{
	struct btrfs_delayed_ref_head *head;

again:
	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
			     true);
	if (!head && delayed_refs->run_delayed_start != 0) {
		delayed_refs->run_delayed_start = 0;
		head = find_first_ref_head(delayed_refs);
	}
	if (!head)
		return NULL;

	while (head->processing) {
		struct rb_node *node;

		node = rb_next(&head->href_node);
		if (!node) {
			if (delayed_refs->run_delayed_start == 0)
				return NULL;
			delayed_refs->run_delayed_start = 0;
			goto again;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
	}

	head->processing = 1;
	WARN_ON(delayed_refs->num_heads_ready == 0);
	delayed_refs->num_heads_ready--;
	delayed_refs->run_delayed_start = head->bytenr +
		head->num_bytes;
	return head;
}

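/*
 * Example walk (hypothetical bytenrs): with heads at 4096, 8192 and 12288
 * and run_delayed_start == 8192, the 8192 head is returned and
 * run_delayed_start advances past it, so the next call starts from 12288;
 * after running off the end the scan wraps back to the 4096 head.  This
 * round-robins head processing across the keyspace between callers.
 */
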
void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	lockdep_assert_held(&head->lock);

	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
	RB_CLEAR_NODE(&head->href_node);
	atomic_dec(&delayed_refs->num_entries);
	delayed_refs->num_heads--;
	if (head->processing == 0)
		delayed_refs->num_heads_ready--;
}

/*
 * Helper to insert the ref_node to the tail or merge with tail.
 *
 * Return 0 for insert.
 * Return >0 for merge.
 */
static int insert_delayed_ref(struct btrfs_trans_handle *trans,
			      struct btrfs_delayed_ref_root *root,
			      struct btrfs_delayed_ref_head *href,
			      struct btrfs_delayed_ref_node *ref)
{
	struct btrfs_delayed_ref_node *exist;
	int mod;
	int ret = 0;

	spin_lock(&href->lock);
	exist = tree_insert(&href->ref_tree, ref);
	if (!exist)
		goto inserted;

	/* Now we are sure we can merge */
	ret = 1;
	if (exist->action == ref->action) {
		mod = ref->ref_mod;
	} else {
		/* Need to change action */
		if (exist->ref_mod < ref->ref_mod) {
			exist->action = ref->action;
			mod = -exist->ref_mod;
			exist->ref_mod = ref->ref_mod;
			if (ref->action == BTRFS_ADD_DELAYED_REF)
				list_add_tail(&exist->add_list,
					      &href->ref_add_list);
			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
				ASSERT(!list_empty(&exist->add_list));
				list_del(&exist->add_list);
			} else {
				ASSERT(0);
			}
		} else
			mod = -ref->ref_mod;
	}
	exist->ref_mod += mod;

	/* remove existing tail if its ref_mod is zero */
	if (exist->ref_mod == 0)
		drop_delayed_ref(trans, root, href, exist);
	spin_unlock(&href->lock);
	return ret;
inserted:
	if (ref->action == BTRFS_ADD_DELAYED_REF)
		list_add_tail(&ref->add_list, &href->ref_add_list);
	atomic_inc(&root->num_entries);
	spin_unlock(&href->lock);
	return ret;
}

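/*
 * Worked example of the action flip above (illustrative ref_mods): if the
 * existing tail is a DROP with ref_mod 1 and an ADD with ref_mod 2 comes
 * in, exist->ref_mod < ref->ref_mod, so the tail turns into an ADD and
 * ends up with ref_mod 2 - 1 = 1, the expected net effect.  With equal
 * ref_mods the sum hits zero and the tail is removed entirely.
 */
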
/*
 * helper function to update the accounting in the head ref
 * existing and update must have the same bytenr
 */
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_delayed_ref_head *existing,
			 struct btrfs_delayed_ref_head *update,
			 int *old_ref_mod_ret)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int old_ref_mod;

	BUG_ON(existing->is_data != update->is_data);

	spin_lock(&existing->lock);
	if (update->must_insert_reserved) {
		/*
		 * If the extent was freed and then reallocated before the
		 * delayed ref entries were processed, we can end up with an
		 * existing head ref without the must_insert_reserved flag
		 * set.  Set it again here.
		 */
		existing->must_insert_reserved = update->must_insert_reserved;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;

	}

	if (update->extent_op) {
		if (!existing->extent_op) {
			existing->extent_op = update->extent_op;
		} else {
			if (update->extent_op->update_key) {
				memcpy(&existing->extent_op->key,
				       &update->extent_op->key,
				       sizeof(update->extent_op->key));
				existing->extent_op->update_key = true;
			}
			if (update->extent_op->update_flags) {
				existing->extent_op->flags_to_set |=
					update->extent_op->flags_to_set;
				existing->extent_op->update_flags = true;
			}
			btrfs_free_delayed_extent_op(update->extent_op);
		}
	}
	/*
	 * Update the reference mod on the head to reflect this new operation.
	 * We only need the lock here because this head could currently be
	 * under processing; for refs we just added we know we're fine.
	 */
	old_ref_mod = existing->total_ref_mod;
	if (old_ref_mod_ret)
		*old_ref_mod_ret = old_ref_mod;
	existing->ref_mod += update->ref_mod;
	existing->total_ref_mod += update->ref_mod;

	/*
	 * If we are going from a positive ref mod to a negative or vice
	 * versa we need to make sure to adjust pending_csums accordingly.
	 */
	if (existing->is_data) {
		u64 csum_leaves =
			btrfs_csum_bytes_to_leaves(fs_info,
						   existing->num_bytes);

		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
			delayed_refs->pending_csums -= existing->num_bytes;
			btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
		}
		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
			delayed_refs->pending_csums += existing->num_bytes;
			trans->delayed_ref_updates += csum_leaves;
		}
	}
	spin_unlock(&existing->lock);
}

static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
				  struct btrfs_qgroup_extent_record *qrecord,
				  u64 bytenr, u64 num_bytes, u64 ref_root,
				  u64 reserved, int action, bool is_data,
				  bool is_system)
{
	int count_mod = 1;
	int must_insert_reserved = 0;

	/* If reserved is provided, it must be a data extent. */
	BUG_ON(!is_data && reserved);

	/*
	 * The head node stores the sum of all the mods, so dropping a ref
	 * should drop the sum in the head node by one.
	 */
	if (action == BTRFS_UPDATE_DELAYED_HEAD)
		count_mod = 0;
	else if (action == BTRFS_DROP_DELAYED_REF)
		count_mod = -1;

	/*
	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
	 * accounting when the extent is finally added, or if a later
	 * modification deletes the delayed ref without ever inserting the
	 * extent into the extent allocation tree.  ref->must_insert_reserved
	 * is the flag used to record that accounting mods are required.
	 *
	 * Once we record must_insert_reserved, switch the action to
	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
	 */
	if (action == BTRFS_ADD_DELAYED_EXTENT)
		must_insert_reserved = 1;
	else
		must_insert_reserved = 0;

	refcount_set(&head_ref->refs, 1);
	head_ref->bytenr = bytenr;
	head_ref->num_bytes = num_bytes;
	head_ref->ref_mod = count_mod;
	head_ref->must_insert_reserved = must_insert_reserved;
	head_ref->is_data = is_data;
	head_ref->is_system = is_system;
	head_ref->ref_tree = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&head_ref->ref_add_list);
	RB_CLEAR_NODE(&head_ref->href_node);
	head_ref->processing = 0;
	head_ref->total_ref_mod = count_mod;
	spin_lock_init(&head_ref->lock);
	mutex_init(&head_ref->mutex);

	if (qrecord) {
		if (ref_root && reserved) {
			qrecord->data_rsv = reserved;
			qrecord->data_rsv_refroot = ref_root;
		}
		qrecord->bytenr = bytenr;
		qrecord->num_bytes = num_bytes;
		qrecord->old_roots = NULL;
	}
}

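/*
 * Quick reference for count_mod above: BTRFS_ADD_DELAYED_REF and
 * BTRFS_ADD_DELAYED_EXTENT start the head at +1, BTRFS_DROP_DELAYED_REF at
 * -1, and BTRFS_UPDATE_DELAYED_HEAD at 0, since a pure flags/key update
 * must not change the extent's reference count.
 */
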
/*
 * helper function to actually insert a head node into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count.
 */
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
		     struct btrfs_delayed_ref_head *head_ref,
		     struct btrfs_qgroup_extent_record *qrecord,
		     int action, int *qrecord_inserted_ret,
		     int *old_ref_mod, int *new_ref_mod)
{
	struct btrfs_delayed_ref_head *existing;
	struct btrfs_delayed_ref_root *delayed_refs;
	int qrecord_inserted = 0;

	delayed_refs = &trans->transaction->delayed_refs;

	/* Record qgroup extent info if provided */
	if (qrecord) {
		if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
					delayed_refs, qrecord))
			kfree(qrecord);
		else
			qrecord_inserted = 1;
	}

	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);

	existing = htree_insert(&delayed_refs->href_root,
				&head_ref->href_node);
	if (existing) {
		update_existing_head_ref(trans, existing, head_ref,
					 old_ref_mod);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
		head_ref = existing;
	} else {
		if (old_ref_mod)
			*old_ref_mod = 0;
		if (head_ref->is_data && head_ref->ref_mod < 0) {
			delayed_refs->pending_csums += head_ref->num_bytes;
			trans->delayed_ref_updates +=
				btrfs_csum_bytes_to_leaves(trans->fs_info,
							   head_ref->num_bytes);
		}
		delayed_refs->num_heads++;
		delayed_refs->num_heads_ready++;
		atomic_inc(&delayed_refs->num_entries);
		trans->delayed_ref_updates++;
	}
	if (qrecord_inserted_ret)
		*qrecord_inserted_ret = qrecord_inserted;
	if (new_ref_mod)
		*new_ref_mod = head_ref->total_ref_mod;

	return head_ref;
}

/*
 * init_delayed_ref_common - Initialize the structure which represents a
 *			     modification to an extent.
 *
 * @fs_info:    The mounted filesystem's fs_info structure.
 *
 * @ref:	The structure which is going to be initialized.
 *
 * @bytenr:	The logical address of the extent for which a modification is
 *		going to be recorded.
 *
 * @num_bytes:  Size of the extent whose modification is being recorded.
 *
 * @ref_root:	The id of the root where this modification originated; it can
 *		be either one of the well-known metadata trees or the
 *		subvolume id which references this extent.
 *
 * @action:	Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
 *		BTRFS_ADD_DELAYED_EXTENT.
 *
 * @ref_type:	Holds the type of the extent which is being recorded; it can
 *		be one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
 *		when recording a metadata extent, or BTRFS_SHARED_DATA_REF_KEY/
 *		BTRFS_EXTENT_DATA_REF_KEY when recording a data extent.
 */
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
				    struct btrfs_delayed_ref_node *ref,
				    u64 bytenr, u64 num_bytes, u64 ref_root,
				    int action, u8 ref_type)
{
	u64 seq = 0;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	if (is_fstree(ref_root))
		seq = atomic64_read(&fs_info->tree_mod_seq);

	refcount_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;
	ref->seq = seq;
	ref->type = ref_type;
	RB_CLEAR_NODE(&ref->ref_node);
	INIT_LIST_HEAD(&ref->add_list);
}

/*
 * add a delayed tree ref.  This does all of the accounting required
 * to make sure the delayed ref is eventually processed before this
 * transaction commits.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       struct btrfs_delayed_extent_op *extent_op,
			       int *old_ref_mod, int *new_ref_mod)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_tree_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_qgroup_extent_record *record = NULL;
	int qrecord_inserted;
	bool is_system;
	int action = generic_ref->action;
	int level = generic_ref->tree_ref.level;
	int ret;
	u64 bytenr = generic_ref->bytenr;
	u64 num_bytes = generic_ref->len;
	u64 parent = generic_ref->parent;
	u8 ref_type;

	is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);

	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
	BUG_ON(extent_op && extent_op->is_data);
	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref) {
		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
		return -ENOMEM;
	}

	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
	    is_fstree(generic_ref->real_root) &&
	    is_fstree(generic_ref->tree_ref.root) &&
	    !generic_ref->skip_qgroup) {
		record = kzalloc(sizeof(*record), GFP_NOFS);
		if (!record) {
			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
			return -ENOMEM;
		}
	}

	if (parent)
		ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
	else
		ref_type = BTRFS_TREE_BLOCK_REF_KEY;

	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
				generic_ref->tree_ref.root, action, ref_type);
	ref->root = generic_ref->tree_ref.root;
	ref->parent = parent;
	ref->level = level;

	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
			      generic_ref->tree_ref.root, 0, action, false,
			      is_system);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	head_ref = add_delayed_ref_head(trans, head_ref, record,
					action, &qrecord_inserted,
					old_ref_mod, new_ref_mod);

	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);

	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
				   action == BTRFS_ADD_DELAYED_EXTENT ?
				   BTRFS_ADD_DELAYED_REF : action);
	if (ret > 0)
		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);

	if (qrecord_inserted)
		btrfs_qgroup_trace_extent_post(fs_info, record);

	return 0;
}

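/*
 * A minimal usage sketch, assuming the btrfs_ref init helpers declared in
 * delayed-ref.h (values are illustrative, error handling omitted):
 *
 *	struct btrfs_ref ref = { 0 };
 *
 *	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
 *			       num_bytes, 0);	// parent == 0: keyed backref
 *	btrfs_init_tree_ref(&ref, level, root_objectid);
 *	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL, NULL, NULL);
 */
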
/*
 * Add a delayed data ref.  It's similar to btrfs_add_delayed_tree_ref.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       u64 reserved, int *old_ref_mod,
			       int *new_ref_mod)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_qgroup_extent_record *record = NULL;
	int qrecord_inserted;
	int action = generic_ref->action;
	int ret;
	u64 bytenr = generic_ref->bytenr;
	u64 num_bytes = generic_ref->len;
	u64 parent = generic_ref->parent;
	u64 ref_root = generic_ref->data_ref.ref_root;
	u64 owner = generic_ref->data_ref.ino;
	u64 offset = generic_ref->data_ref.offset;
	u8 ref_type;

	ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	if (parent)
		ref_type = BTRFS_SHARED_DATA_REF_KEY;
	else
		ref_type = BTRFS_EXTENT_DATA_REF_KEY;
	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
				ref_root, action, ref_type);
	ref->root = ref_root;
	ref->parent = parent;
	ref->objectid = owner;
	ref->offset = offset;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref) {
		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
		return -ENOMEM;
	}

	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
	    is_fstree(ref_root) &&
	    is_fstree(generic_ref->real_root) &&
	    !generic_ref->skip_qgroup) {
		record = kzalloc(sizeof(*record), GFP_NOFS);
		if (!record) {
			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
			kmem_cache_free(btrfs_delayed_ref_head_cachep,
					head_ref);
			return -ENOMEM;
		}
	}

	init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
			      reserved, action, true, false);
	head_ref->extent_op = NULL;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	head_ref = add_delayed_ref_head(trans, head_ref, record,
					action, &qrecord_inserted,
					old_ref_mod, new_ref_mod);

	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);

	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
				   action == BTRFS_ADD_DELAYED_EXTENT ?
				   BTRFS_ADD_DELAYED_REF : action);
	if (ret > 0)
		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);

	if (qrecord_inserted)
		return btrfs_qgroup_trace_extent_post(fs_info, record);
	return 0;
}

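/*
 * Analogous sketch for data extents, again assuming the delayed-ref.h
 * helpers (illustrative values; reserved == 0 means there is no reserved
 * space to account for this change):
 *
 *	struct btrfs_ref ref = { 0 };
 *
 *	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
 *			       num_bytes, 0);
 *	btrfs_init_data_ref(&ref, root_objectid, inode_objectid, file_offset);
 *	ret = btrfs_add_delayed_data_ref(trans, &ref, 0, NULL, NULL);
 */
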
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref)
		return -ENOMEM;

	init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
			      BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
			      false);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
			     NULL, NULL, NULL);

	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);
	return 0;
}

/*
 * This does a simple search for the head node for a given extent.  It must
 * be called with the delayed ref spinlock held, and it returns the head
 * node if one was found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
	return find_ref_head(delayed_refs, bytenr, false);
}

void __cold btrfs_delayed_ref_exit(void)
{
	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
	kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
	kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}

int __init btrfs_delayed_ref_init(void)
{
	btrfs_delayed_ref_head_cachep = kmem_cache_create(
				"btrfs_delayed_ref_head",
				sizeof(struct btrfs_delayed_ref_head), 0,
				SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_ref_head_cachep)
		goto fail;

	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
				"btrfs_delayed_tree_ref",
				sizeof(struct btrfs_delayed_tree_ref), 0,
				SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_tree_ref_cachep)
		goto fail;

	btrfs_delayed_data_ref_cachep = kmem_cache_create(
				"btrfs_delayed_data_ref",
				sizeof(struct btrfs_delayed_data_ref), 0,
				SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_data_ref_cachep)
		goto fail;

	btrfs_delayed_extent_op_cachep = kmem_cache_create(
				"btrfs_delayed_extent_op",
				sizeof(struct btrfs_delayed_extent_op), 0,
				SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_extent_op_cachep)
		goto fail;

	return 0;
fail:
	btrfs_delayed_ref_exit();
	return -ENOMEM;
}