xref: /linux/fs/btrfs/ordered-data.c (revision 163b099146b85d1b05bd2eaa045acbeee25c29e4)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2007 Oracle.  All rights reserved.
4   */
5  
6  #include <linux/slab.h>
7  #include <linux/blkdev.h>
8  #include <linux/writeback.h>
9  #include <linux/sched/mm.h>
10  #include "misc.h"
11  #include "ctree.h"
12  #include "transaction.h"
13  #include "btrfs_inode.h"
14  #include "extent_io.h"
15  #include "disk-io.h"
16  #include "compression.h"
17  #include "delalloc-space.h"
18  #include "qgroup.h"
19  
20  static struct kmem_cache *btrfs_ordered_extent_cache;
21  
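/*
 * Return the end file offset (exclusive) of an ordered extent, clamped to
 * (u64)-1 if the addition would overflow.
 */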
22  static u64 entry_end(struct btrfs_ordered_extent *entry)
23  {
24  	if (entry->file_offset + entry->num_bytes < entry->file_offset)
25  		return (u64)-1;
26  	return entry->file_offset + entry->num_bytes;
27  }
28  
29  /* Returns NULL if the insertion worked, or it returns the node already in
30   * the tree whose range covers the given offset.
31   */
32  static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
33  				   struct rb_node *node)
34  {
35  	struct rb_node **p = &root->rb_node;
36  	struct rb_node *parent = NULL;
37  	struct btrfs_ordered_extent *entry;
38  
39  	while (*p) {
40  		parent = *p;
41  		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
42  
43  		if (file_offset < entry->file_offset)
44  			p = &(*p)->rb_left;
45  		else if (file_offset >= entry_end(entry))
46  			p = &(*p)->rb_right;
47  		else
48  			return parent;
49  	}
50  
51  	rb_link_node(node, parent, p);
52  	rb_insert_color(node, root);
53  	return NULL;
54  }
55  
56  /*
57   * Look for a given offset in the tree.  If it can't be found, return NULL
58   * and set *prev_ret to the first entry at a lesser offset.
59   */
60  static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
61  				     struct rb_node **prev_ret)
62  {
63  	struct rb_node *n = root->rb_node;
64  	struct rb_node *prev = NULL;
65  	struct rb_node *test;
66  	struct btrfs_ordered_extent *entry;
67  	struct btrfs_ordered_extent *prev_entry = NULL;
68  
69  	while (n) {
70  		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
71  		prev = n;
72  		prev_entry = entry;
73  
74  		if (file_offset < entry->file_offset)
75  			n = n->rb_left;
76  		else if (file_offset >= entry_end(entry))
77  			n = n->rb_right;
78  		else
79  			return n;
80  	}
81  	if (!prev_ret)
82  		return NULL;
83  
84  	while (prev && file_offset >= entry_end(prev_entry)) {
85  		test = rb_next(prev);
86  		if (!test)
87  			break;
88  		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
89  				      rb_node);
90  		if (file_offset < entry_end(prev_entry))
91  			break;
92  
93  		prev = test;
94  	}
95  	if (prev)
96  		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
97  				      rb_node);
98  	while (prev && file_offset < entry_end(prev_entry)) {
99  		test = rb_prev(prev);
100  		if (!test)
101  			break;
102  		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
103  				      rb_node);
104  		prev = test;
105  	}
106  	*prev_ret = prev;
107  	return NULL;
108  }
109  
110  /*
111   * helper to check if a given offset is inside a given entry
112   */
113  static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
114  {
115  	if (file_offset < entry->file_offset ||
116  	    entry->file_offset + entry->num_bytes <= file_offset)
117  		return 0;
118  	return 1;
119  }
120  
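/*
 * Helper to check if a given range [file_offset, file_offset + len) overlaps
 * any part of the given entry.
 */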
121  static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
122  			  u64 len)
123  {
124  	if (file_offset + len <= entry->file_offset ||
125  	    entry->file_offset + entry->num_bytes <= file_offset)
126  		return 0;
127  	return 1;
128  }
129  
130  /*
131   * Find the first ordered struct that covers this offset, otherwise the
132   * first one less than this offset.
133   */
134  static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
135  					  u64 file_offset)
136  {
137  	struct rb_root *root = &tree->tree;
138  	struct rb_node *prev = NULL;
139  	struct rb_node *ret;
140  	struct btrfs_ordered_extent *entry;
141  
142  	if (tree->last) {
143  		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
144  				 rb_node);
145  		if (offset_in_entry(entry, file_offset))
146  			return tree->last;
147  	}
148  	ret = __tree_search(root, file_offset, &prev);
149  	if (!ret)
150  		ret = prev;
151  	if (ret)
152  		tree->last = ret;
153  	return ret;
154  }
155  
156  /*
157   * Allocate and add a new ordered_extent into the per-inode tree.
158   *
159   * The tree is given a single reference on the ordered extent that was
160   * inserted.
161   */
162  static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
163  				      u64 disk_bytenr, u64 num_bytes,
164  				      u64 disk_num_bytes, int type, int dio,
165  				      int compress_type)
166  {
167  	struct btrfs_root *root = inode->root;
168  	struct btrfs_fs_info *fs_info = root->fs_info;
169  	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
170  	struct rb_node *node;
171  	struct btrfs_ordered_extent *entry;
172  	int ret;
173  
174  	if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
175  		/* For nocow write, we can release the qgroup rsv right now */
176  		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
177  		if (ret < 0)
178  			return ret;
179  		ret = 0;
180  	} else {
181  		/*
182  		 * The ordered extent has reserved qgroup space, release now
183  		 * and pass the reserved number for qgroup_record to free.
184  		 */
185  		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
186  		if (ret < 0)
187  			return ret;
188  	}
189  	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
190  	if (!entry)
191  		return -ENOMEM;
192  
193  	entry->file_offset = file_offset;
194  	entry->disk_bytenr = disk_bytenr;
195  	entry->num_bytes = num_bytes;
196  	entry->disk_num_bytes = disk_num_bytes;
197  	entry->bytes_left = num_bytes;
198  	entry->inode = igrab(&inode->vfs_inode);
199  	entry->compress_type = compress_type;
200  	entry->truncated_len = (u64)-1;
201  	entry->qgroup_rsv = ret;
202  	entry->physical = (u64)-1;
203  	entry->disk = NULL;
204  	entry->partno = (u8)-1;
205  
206  	ASSERT(type == BTRFS_ORDERED_REGULAR ||
207  	       type == BTRFS_ORDERED_NOCOW ||
208  	       type == BTRFS_ORDERED_PREALLOC ||
209  	       type == BTRFS_ORDERED_COMPRESSED);
210  	set_bit(type, &entry->flags);
211  
212  	percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
213  				 fs_info->delalloc_batch);
214  
215  	if (dio)
216  		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
217  
218  	/* one ref for the tree */
219  	refcount_set(&entry->refs, 1);
220  	init_waitqueue_head(&entry->wait);
221  	INIT_LIST_HEAD(&entry->list);
222  	INIT_LIST_HEAD(&entry->log_list);
223  	INIT_LIST_HEAD(&entry->root_extent_list);
224  	INIT_LIST_HEAD(&entry->work_list);
225  	init_completion(&entry->completion);
226  
227  	trace_btrfs_ordered_extent_add(inode, entry);
228  
229  	spin_lock_irq(&tree->lock);
230  	node = tree_insert(&tree->tree, file_offset,
231  			   &entry->rb_node);
232  	if (node)
233  		btrfs_panic(fs_info, -EEXIST,
234  				"inconsistency in ordered tree at offset %llu",
235  				file_offset);
236  	spin_unlock_irq(&tree->lock);
237  
238  	spin_lock(&root->ordered_extent_lock);
239  	list_add_tail(&entry->root_extent_list,
240  		      &root->ordered_extents);
241  	root->nr_ordered_extents++;
242  	if (root->nr_ordered_extents == 1) {
243  		spin_lock(&fs_info->ordered_root_lock);
244  		BUG_ON(!list_empty(&root->ordered_root));
245  		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
246  		spin_unlock(&fs_info->ordered_root_lock);
247  	}
248  	spin_unlock(&root->ordered_extent_lock);
249  
250  	/*
251  	 * We don't need the count_max_extents here, we can assume that all of
252  	 * that work has been done at higher layers, so this is truly the
253  	 * smallest the extent is going to get.
254  	 */
255  	spin_lock(&inode->lock);
256  	btrfs_mod_outstanding_extents(inode, 1);
257  	spin_unlock(&inode->lock);
258  
259  	return 0;
260  }
261  
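/*
 * Add an ordered extent for a write that is neither direct IO nor compressed.
 * Thin wrapper around __btrfs_add_ordered_extent().
 */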
262  int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
263  			     u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
264  			     int type)
265  {
266  	ASSERT(type == BTRFS_ORDERED_REGULAR ||
267  	       type == BTRFS_ORDERED_NOCOW ||
268  	       type == BTRFS_ORDERED_PREALLOC);
269  	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
270  					  num_bytes, disk_num_bytes, type, 0,
271  					  BTRFS_COMPRESS_NONE);
272  }
273  
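/*
 * Same as btrfs_add_ordered_extent(), but for direct IO writes: the ordered
 * extent additionally gets the BTRFS_ORDERED_DIRECT bit set.
 */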
274  int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
275  				 u64 disk_bytenr, u64 num_bytes,
276  				 u64 disk_num_bytes, int type)
277  {
278  	ASSERT(type == BTRFS_ORDERED_REGULAR ||
279  	       type == BTRFS_ORDERED_NOCOW ||
280  	       type == BTRFS_ORDERED_PREALLOC);
281  	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
282  					  num_bytes, disk_num_bytes, type, 1,
283  					  BTRFS_COMPRESS_NONE);
284  }
285  
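/*
 * Add an ordered extent for a compressed write.  The ordered extent is
 * created with type BTRFS_ORDERED_COMPRESSED and the given @compress_type.
 */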
286  int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
287  				      u64 disk_bytenr, u64 num_bytes,
288  				      u64 disk_num_bytes, int compress_type)
289  {
290  	ASSERT(compress_type != BTRFS_COMPRESS_NONE);
291  	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
292  					  num_bytes, disk_num_bytes,
293  					  BTRFS_ORDERED_COMPRESSED, 0,
294  					  compress_type);
295  }
296  
297  /*
298   * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
299   * when an ordered extent is finished.  If the list covers more than one
300   * ordered extent, it is split across them.
301   */
302  void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
303  			   struct btrfs_ordered_sum *sum)
304  {
305  	struct btrfs_ordered_inode_tree *tree;
306  
307  	tree = &BTRFS_I(entry->inode)->ordered_tree;
308  	spin_lock_irq(&tree->lock);
309  	list_add_tail(&sum->list, &entry->list);
310  	spin_unlock_irq(&tree->lock);
311  }
312  
313  /*
314   * Finish IO for one ordered extent across a given range.  The range can
315   * contain several ordered extents.
316   *
317   * @finished_ret: Return the finished ordered extent
318   * @file_offset: File offset for the finished IO
319   * 		 Will also be updated to one byte past the range that is
320   * 		 recorded as finished. This allows the caller to walk forward.
321   * @io_size:	 Length of the finished IO range
322   * @uptodate:	 If the IO finished without problem
323   *
324   * Return true if any ordered extent is finished in the range, and update
325   * @finished_ret and @file_offset.
326   * Return false otherwise.
327   *
328   * NOTE: Although the range can cross multiple ordered extents, only one
329   * ordered extent will be updated during one call. The caller is responsible
330   * for iterating over all ordered extents in the range.
331   */
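/*
 * An illustrative sketch of the iteration this implies for a caller (loosely
 * modelled on the write endio paths; handle_finished_extent() is a made-up
 * placeholder for whatever completion work the caller does):
 *
 *	u64 cur = start;
 *
 *	while (cur < start + len) {
 *		struct btrfs_ordered_extent *oe = NULL;
 *		u64 last = cur;
 *
 *		if (btrfs_dec_test_first_ordered_pending(inode, &oe, &cur,
 *							 start + len - cur,
 *							 uptodate)) {
 *			handle_finished_extent(oe);
 *			btrfs_put_ordered_extent(oe);
 *		}
 *		if (cur == last)
 *			break;
 *	}
 *
 * Note the btrfs_put_ordered_extent() call: when the function returns true it
 * takes an extra reference on the returned ordered extent for the caller.
 */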
332  bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
333  				   struct btrfs_ordered_extent **finished_ret,
334  				   u64 *file_offset, u64 io_size, int uptodate)
335  {
336  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
337  	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
338  	struct rb_node *node;
339  	struct btrfs_ordered_extent *entry = NULL;
340  	bool finished = false;
341  	unsigned long flags;
342  	u64 dec_end;
343  	u64 dec_start;
344  	u64 to_dec;
345  
346  	spin_lock_irqsave(&tree->lock, flags);
347  	node = tree_search(tree, *file_offset);
348  	if (!node)
349  		goto out;
350  
351  	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
352  	if (!offset_in_entry(entry, *file_offset))
353  		goto out;
354  
355  	dec_start = max(*file_offset, entry->file_offset);
356  	dec_end = min(*file_offset + io_size,
357  		      entry->file_offset + entry->num_bytes);
358  	*file_offset = dec_end;
359  	if (dec_start > dec_end) {
360  		btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
361  			   dec_start, dec_end);
362  	}
363  	to_dec = dec_end - dec_start;
364  	if (to_dec > entry->bytes_left) {
365  		btrfs_crit(fs_info,
366  			   "bad ordered accounting left %llu size %llu",
367  			   entry->bytes_left, to_dec);
368  	}
369  	entry->bytes_left -= to_dec;
370  	if (!uptodate)
371  		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
372  
373  	if (entry->bytes_left == 0) {
374  		/*
375  		 * Ensure only one caller can set the flag and finished_ret
376  		 * accordingly
377  		 */
378  		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
379  		/* test_and_set_bit implies a barrier */
380  		cond_wake_up_nomb(&entry->wait);
381  	}
382  out:
383  	if (finished && finished_ret && entry) {
384  		*finished_ret = entry;
385  		refcount_inc(&entry->refs);
386  	}
387  	spin_unlock_irqrestore(&tree->lock, flags);
388  	return finished;
389  }
390  
391  /*
392   * Finish IO for one ordered extent across a given range.  The range can only
393   * contain one ordered extent.
394   *
395   * @cached:	 The cached ordered extent. If not NULL, we can skip the tree
396   *               search and use the ordered extent directly.
397  * 		 Will also be used to store the finished ordered extent.
398  * @file_offset: File offset for the finished IO
399  * @io_size:	 Length of the finished IO range
400   * @uptodate:	 If the IO finishes without problem
401   *
402   * Return true if the ordered extent is finished in the range, and update
403   * @cached.
404   * Return false otherwise.
405   *
406  * NOTE: The range can NOT cross multiple ordered extents; the caller must
407  * ensure this.
408   */
409  bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
410  				    struct btrfs_ordered_extent **cached,
411  				    u64 file_offset, u64 io_size, int uptodate)
412  {
413  	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
414  	struct rb_node *node;
415  	struct btrfs_ordered_extent *entry = NULL;
416  	unsigned long flags;
417  	bool finished = false;
418  
419  	spin_lock_irqsave(&tree->lock, flags);
420  	if (cached && *cached) {
421  		entry = *cached;
422  		goto have_entry;
423  	}
424  
425  	node = tree_search(tree, file_offset);
426  	if (!node)
427  		goto out;
428  
429  	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
430  have_entry:
431  	if (!offset_in_entry(entry, file_offset))
432  		goto out;
433  
434  	if (io_size > entry->bytes_left)
435  		btrfs_crit(inode->root->fs_info,
436  			   "bad ordered accounting left %llu size %llu",
437  		       entry->bytes_left, io_size);
438  
439  	entry->bytes_left -= io_size;
440  	if (!uptodate)
441  		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
442  
443  	if (entry->bytes_left == 0) {
444  		/*
445  		 * Ensure only one caller can set the flag and finished_ret
446  		 * accordingly
447  		 */
448  		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
449  		/* test_and_set_bit implies a barrier */
450  		cond_wake_up_nomb(&entry->wait);
451  	}
452  out:
453  	if (finished && cached && entry) {
454  		*cached = entry;
455  		refcount_inc(&entry->refs);
456  	}
457  	spin_unlock_irqrestore(&tree->lock, flags);
458  	return finished;
459  }
460  
461  /*
462   * used to drop a reference on an ordered extent.  This will free
463   * the extent if the last reference is dropped
464   */
465  void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
466  {
467  	struct list_head *cur;
468  	struct btrfs_ordered_sum *sum;
469  
470  	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
471  
472  	if (refcount_dec_and_test(&entry->refs)) {
473  		ASSERT(list_empty(&entry->root_extent_list));
474  		ASSERT(list_empty(&entry->log_list));
475  		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
476  		if (entry->inode)
477  			btrfs_add_delayed_iput(entry->inode);
478  		while (!list_empty(&entry->list)) {
479  			cur = entry->list.next;
480  			sum = list_entry(cur, struct btrfs_ordered_sum, list);
481  			list_del(&sum->list);
482  			kvfree(sum);
483  		}
484  		kmem_cache_free(btrfs_ordered_extent_cache, entry);
485  	}
486  }
487  
488  /*
489   * Remove an ordered extent from the tree.  No references are dropped,
490   * but any waiters are woken up.
491   */
492  void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
493  				 struct btrfs_ordered_extent *entry)
494  {
495  	struct btrfs_ordered_inode_tree *tree;
496  	struct btrfs_root *root = btrfs_inode->root;
497  	struct btrfs_fs_info *fs_info = root->fs_info;
498  	struct rb_node *node;
499  	bool pending;
500  
501  	/* This is paired with btrfs_add_ordered_extent. */
502  	spin_lock(&btrfs_inode->lock);
503  	btrfs_mod_outstanding_extents(btrfs_inode, -1);
504  	spin_unlock(&btrfs_inode->lock);
505  	if (root != fs_info->tree_root)
506  		btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
507  						false);
508  
509  	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
510  				 fs_info->delalloc_batch);
511  
512  	tree = &btrfs_inode->ordered_tree;
513  	spin_lock_irq(&tree->lock);
514  	node = &entry->rb_node;
515  	rb_erase(node, &tree->tree);
516  	RB_CLEAR_NODE(node);
517  	if (tree->last == node)
518  		tree->last = NULL;
519  	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
520  	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
521  	spin_unlock_irq(&tree->lock);
522  
523  	/*
524  	 * The current running transaction is waiting on us, we need to let it
525  	 * know that we're complete and wake it up.
526  	 */
527  	if (pending) {
528  		struct btrfs_transaction *trans;
529  
530  		/*
531  		 * The checks for trans are just a formality, it should be set,
532  		 * but if it isn't we don't want to deref/assert under the spin
533  		 * lock, so be nice and check if trans is set, but ASSERT() so
534  		 * if it isn't set a developer will notice.
535  		 */
536  		spin_lock(&fs_info->trans_lock);
537  		trans = fs_info->running_transaction;
538  		if (trans)
539  			refcount_inc(&trans->use_count);
540  		spin_unlock(&fs_info->trans_lock);
541  
542  		ASSERT(trans);
543  		if (trans) {
544  			if (atomic_dec_and_test(&trans->pending_ordered))
545  				wake_up(&trans->pending_wait);
546  			btrfs_put_transaction(trans);
547  		}
548  	}
549  
550  	spin_lock(&root->ordered_extent_lock);
551  	list_del_init(&entry->root_extent_list);
552  	root->nr_ordered_extents--;
553  
554  	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
555  
556  	if (!root->nr_ordered_extents) {
557  		spin_lock(&fs_info->ordered_root_lock);
558  		BUG_ON(list_empty(&root->ordered_root));
559  		list_del_init(&root->ordered_root);
560  		spin_unlock(&fs_info->ordered_root_lock);
561  	}
562  	spin_unlock(&root->ordered_extent_lock);
563  	wake_up(&entry->wait);
564  }
565  
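/*
 * Worker callback for btrfs_wait_ordered_extents(): wait for one ordered
 * extent to complete and then signal the completion the waiter sleeps on.
 */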
566  static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
567  {
568  	struct btrfs_ordered_extent *ordered;
569  
570  	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
571  	btrfs_start_ordered_extent(ordered, 1);
572  	complete(&ordered->completion);
573  }
574  
575  /*
576   * wait for all the ordered extents in a root.  This is done when balancing
577   * space between drives.
578   */
579  u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
580  			       const u64 range_start, const u64 range_len)
581  {
582  	struct btrfs_fs_info *fs_info = root->fs_info;
583  	LIST_HEAD(splice);
584  	LIST_HEAD(skipped);
585  	LIST_HEAD(works);
586  	struct btrfs_ordered_extent *ordered, *next;
587  	u64 count = 0;
588  	const u64 range_end = range_start + range_len;
589  
590  	mutex_lock(&root->ordered_extent_mutex);
591  	spin_lock(&root->ordered_extent_lock);
592  	list_splice_init(&root->ordered_extents, &splice);
593  	while (!list_empty(&splice) && nr) {
594  		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
595  					   root_extent_list);
596  
597  		if (range_end <= ordered->disk_bytenr ||
598  		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
599  			list_move_tail(&ordered->root_extent_list, &skipped);
600  			cond_resched_lock(&root->ordered_extent_lock);
601  			continue;
602  		}
603  
604  		list_move_tail(&ordered->root_extent_list,
605  			       &root->ordered_extents);
606  		refcount_inc(&ordered->refs);
607  		spin_unlock(&root->ordered_extent_lock);
608  
609  		btrfs_init_work(&ordered->flush_work,
610  				btrfs_run_ordered_extent_work, NULL, NULL);
611  		list_add_tail(&ordered->work_list, &works);
612  		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
613  
614  		cond_resched();
615  		spin_lock(&root->ordered_extent_lock);
616  		if (nr != U64_MAX)
617  			nr--;
618  		count++;
619  	}
620  	list_splice_tail(&skipped, &root->ordered_extents);
621  	list_splice_tail(&splice, &root->ordered_extents);
622  	spin_unlock(&root->ordered_extent_lock);
623  
624  	list_for_each_entry_safe(ordered, next, &works, work_list) {
625  		list_del_init(&ordered->work_list);
626  		wait_for_completion(&ordered->completion);
627  		btrfs_put_ordered_extent(ordered);
628  		cond_resched();
629  	}
630  	mutex_unlock(&root->ordered_extent_mutex);
631  
632  	return count;
633  }
634  
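/*
 * Wait for ordered extents in all roots, up to @nr ordered extents in total,
 * restricted to those whose disk byte range overlaps
 * [range_start, range_start + range_len).
 */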
635  void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
636  			     const u64 range_start, const u64 range_len)
637  {
638  	struct btrfs_root *root;
639  	struct list_head splice;
640  	u64 done;
641  
642  	INIT_LIST_HEAD(&splice);
643  
644  	mutex_lock(&fs_info->ordered_operations_mutex);
645  	spin_lock(&fs_info->ordered_root_lock);
646  	list_splice_init(&fs_info->ordered_roots, &splice);
647  	while (!list_empty(&splice) && nr) {
648  		root = list_first_entry(&splice, struct btrfs_root,
649  					ordered_root);
650  		root = btrfs_grab_root(root);
651  		BUG_ON(!root);
652  		list_move_tail(&root->ordered_root,
653  			       &fs_info->ordered_roots);
654  		spin_unlock(&fs_info->ordered_root_lock);
655  
656  		done = btrfs_wait_ordered_extents(root, nr,
657  						  range_start, range_len);
658  		btrfs_put_root(root);
659  
660  		spin_lock(&fs_info->ordered_root_lock);
661  		if (nr != U64_MAX) {
662  			nr -= done;
663  		}
664  	}
665  	list_splice_tail(&splice, &fs_info->ordered_roots);
666  	spin_unlock(&fs_info->ordered_root_lock);
667  	mutex_unlock(&fs_info->ordered_operations_mutex);
668  }
669  
670  /*
671   * Used to start IO or wait for a given ordered extent to finish.
672   *
673   * If @wait is set, this effectively waits on page writeback for all the
674   * pages in the extent, and it waits on the IO completion code to insert
675   * metadata into the btree corresponding to the extent.
676   */
677  void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
678  {
679  	u64 start = entry->file_offset;
680  	u64 end = start + entry->num_bytes - 1;
681  	struct btrfs_inode *inode = BTRFS_I(entry->inode);
682  
683  	trace_btrfs_ordered_extent_start(inode, entry);
684  
685  	/*
686  	 * pages in the range can be dirty, clean or writeback.  We
687  	 * start IO on any dirty ones so the wait doesn't stall waiting
688  	 * for the flusher thread to find them
689  	 */
690  	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
691  		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
692  	if (wait) {
693  		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
694  						 &entry->flags));
695  	}
696  }
697  
698  /*
699   * Used to wait on ordered extents across a large range of bytes.
700   */
701  int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
702  {
703  	int ret = 0;
704  	int ret_wb = 0;
705  	u64 end;
706  	u64 orig_end;
707  	struct btrfs_ordered_extent *ordered;
708  
709  	if (start + len < start) {
710  		orig_end = INT_LIMIT(loff_t);
711  	} else {
712  		orig_end = start + len - 1;
713  		if (orig_end > INT_LIMIT(loff_t))
714  			orig_end = INT_LIMIT(loff_t);
715  	}
716  
717  	/* start IO across the range first to instantiate any delalloc
718  	 * extents
719  	 */
720  	ret = btrfs_fdatawrite_range(inode, start, orig_end);
721  	if (ret)
722  		return ret;
723  
724  	/*
725  	 * If we have a writeback error don't return immediately. Wait first
726  	 * for any ordered extents that haven't completed yet. This is to make
727  	 * sure no one can dirty the same page ranges and call writepages()
728  	 * before the ordered extents complete - to avoid failures (-EEXIST)
729  	 * when adding the new ordered extents to the ordered tree.
730  	 */
731  	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
732  
733  	end = orig_end;
734  	while (1) {
735  		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
736  		if (!ordered)
737  			break;
738  		if (ordered->file_offset > orig_end) {
739  			btrfs_put_ordered_extent(ordered);
740  			break;
741  		}
742  		if (ordered->file_offset + ordered->num_bytes <= start) {
743  			btrfs_put_ordered_extent(ordered);
744  			break;
745  		}
746  		btrfs_start_ordered_extent(ordered, 1);
747  		end = ordered->file_offset;
748  		/*
749  		 * If the ordered extent had an error save the error but don't
750  		 * exit without waiting first for all other ordered extents in
751  		 * the range to complete.
752  		 */
753  		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
754  			ret = -EIO;
755  		btrfs_put_ordered_extent(ordered);
756  		if (end == 0 || end == start)
757  			break;
758  		end--;
759  	}
760  	return ret_wb ? ret_wb : ret;
761  }
762  
763  /*
764   * find an ordered extent corresponding to file_offset.  return NULL if
765   * nothing is found, otherwise take a reference on the extent and return it
766   */
767  struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
768  							 u64 file_offset)
769  {
770  	struct btrfs_ordered_inode_tree *tree;
771  	struct rb_node *node;
772  	struct btrfs_ordered_extent *entry = NULL;
773  	unsigned long flags;
774  
775  	tree = &inode->ordered_tree;
776  	spin_lock_irqsave(&tree->lock, flags);
777  	node = tree_search(tree, file_offset);
778  	if (!node)
779  		goto out;
780  
781  	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
782  	if (!offset_in_entry(entry, file_offset))
783  		entry = NULL;
784  	if (entry)
785  		refcount_inc(&entry->refs);
786  out:
787  	spin_unlock_irqrestore(&tree->lock, flags);
788  	return entry;
789  }
790  
791  /* Since the DIO code tries to lock a wide area we need to look for any ordered
792   * extents that exist in the range, rather than just the start of the range.
793   */
794  struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
795  		struct btrfs_inode *inode, u64 file_offset, u64 len)
796  {
797  	struct btrfs_ordered_inode_tree *tree;
798  	struct rb_node *node;
799  	struct btrfs_ordered_extent *entry = NULL;
800  
801  	tree = &inode->ordered_tree;
802  	spin_lock_irq(&tree->lock);
803  	node = tree_search(tree, file_offset);
804  	if (!node) {
805  		node = tree_search(tree, file_offset + len);
806  		if (!node)
807  			goto out;
808  	}
809  
810  	while (1) {
811  		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
812  		if (range_overlaps(entry, file_offset, len))
813  			break;
814  
815  		if (entry->file_offset >= file_offset + len) {
816  			entry = NULL;
817  			break;
818  		}
819  		entry = NULL;
820  		node = rb_next(node);
821  		if (!node)
822  			break;
823  	}
824  out:
825  	if (entry)
826  		refcount_inc(&entry->refs);
827  	spin_unlock_irq(&tree->lock);
828  	return entry;
829  }
830  
831  /*
832   * Adds all ordered extents to the given list. The list ends up sorted by the
833   * file_offset of the ordered extents.
834   */
835  void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
836  					   struct list_head *list)
837  {
838  	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
839  	struct rb_node *n;
840  
841  	ASSERT(inode_is_locked(&inode->vfs_inode));
842  
843  	spin_lock_irq(&tree->lock);
844  	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
845  		struct btrfs_ordered_extent *ordered;
846  
847  		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
848  
849  		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
850  			continue;
851  
852  		ASSERT(list_empty(&ordered->log_list));
853  		list_add_tail(&ordered->log_list, list);
854  		refcount_inc(&ordered->refs);
855  	}
856  	spin_unlock_irq(&tree->lock);
857  }
858  
859  /*
860   * Lookup and return the ordered extent that covers 'file_offset', or if
861   * there is none, the closest one before it.  NULL is returned if none is found.
862   */
863  struct btrfs_ordered_extent *
864  btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
865  {
866  	struct btrfs_ordered_inode_tree *tree;
867  	struct rb_node *node;
868  	struct btrfs_ordered_extent *entry = NULL;
869  
870  	tree = &inode->ordered_tree;
871  	spin_lock_irq(&tree->lock);
872  	node = tree_search(tree, file_offset);
873  	if (!node)
874  		goto out;
875  
876  	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
877  	refcount_inc(&entry->refs);
878  out:
879  	spin_unlock_irq(&tree->lock);
880  	return entry;
881  }
882  
883  /*
884   * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all
885   * pending ordered extents in it are run to completion.
886   *
887   * @inode:        Inode whose ordered tree is to be searched
888   * @start:        Beginning of range to flush
889   * @end:          Last byte of range to lock
890   * @cached_state: If passed, will return the extent state responsible for the
891   * locked range. It's the caller's responsibility to free the cached state.
892   *
893   * This function always returns with the given range locked, ensuring that
894   * after it's called no ordered extent can be pending.
895   */
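/*
 * A minimal usage sketch (illustrative only, not taken from a real caller):
 *
 *	struct extent_state *cached = NULL;
 *
 *	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached);
 *	... operate on the range, now free of pending ordered extents ...
 *	unlock_extent_cached(&inode->io_tree, start, end, &cached);
 */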
896  void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
897  					u64 end,
898  					struct extent_state **cached_state)
899  {
900  	struct btrfs_ordered_extent *ordered;
901  	struct extent_state *cache = NULL;
902  	struct extent_state **cachedp = &cache;
903  
904  	if (cached_state)
905  		cachedp = cached_state;
906  
907  	while (1) {
908  		lock_extent_bits(&inode->io_tree, start, end, cachedp);
909  		ordered = btrfs_lookup_ordered_range(inode, start,
910  						     end - start + 1);
911  		if (!ordered) {
912  			/*
913  			 * If no external cached_state has been passed then
914  			 * decrement the extra ref taken for cachedp since we
915  			 * aren't exposing it outside of this function
916  			 */
917  			if (!cached_state)
918  				refcount_dec(&cache->refs);
919  			break;
920  		}
921  		unlock_extent_cached(&inode->io_tree, start, end, cachedp);
922  		btrfs_start_ordered_extent(ordered, 1);
923  		btrfs_put_ordered_extent(ordered);
924  	}
925  }
926  
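/*
 * Create a new ordered extent covering a sub-range of @ordered: @len bytes
 * starting @pos bytes into it.  Used by btrfs_split_ordered_extent() when an
 * ordered extent has to be split (e.g. for zoned filesystems).
 */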
927  static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
928  				u64 len)
929  {
930  	struct inode *inode = ordered->inode;
931  	u64 file_offset = ordered->file_offset + pos;
932  	u64 disk_bytenr = ordered->disk_bytenr + pos;
933  	u64 num_bytes = len;
934  	u64 disk_num_bytes = len;
935  	int type;
936  	unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT);
937  	int compress_type = ordered->compress_type;
938  	unsigned long weight;
939  	int ret;
940  
941  	weight = hweight_long(flags_masked);
942  	WARN_ON_ONCE(weight > 1);
943  	if (!weight)
944  		type = 0;
945  	else
946  		type = __ffs(flags_masked);
947  
948  	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
949  		WARN_ON_ONCE(1);
950  		ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode),
951  				file_offset, disk_bytenr, num_bytes,
952  				disk_num_bytes, compress_type);
953  	} else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
954  		ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset,
955  				disk_bytenr, num_bytes, disk_num_bytes, type);
956  	} else {
957  		ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset,
958  				disk_bytenr, num_bytes, disk_num_bytes, type);
959  	}
960  
961  	return ret;
962  }
963  
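/*
 * Split an ordered extent: trim @pre bytes from the front and @post bytes from
 * the end.  The original ordered extent is shrunk to the remaining middle part
 * and re-inserted, and new ordered extents are cloned for the trimmed ranges.
 */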
964  int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
965  				u64 post)
966  {
967  	struct inode *inode = ordered->inode;
968  	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
969  	struct rb_node *node;
970  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
971  	int ret = 0;
972  
973  	spin_lock_irq(&tree->lock);
974  	/* Remove from tree once */
975  	node = &ordered->rb_node;
976  	rb_erase(node, &tree->tree);
977  	RB_CLEAR_NODE(node);
978  	if (tree->last == node)
979  		tree->last = NULL;
980  
981  	ordered->file_offset += pre;
982  	ordered->disk_bytenr += pre;
983  	ordered->num_bytes -= (pre + post);
984  	ordered->disk_num_bytes -= (pre + post);
985  	ordered->bytes_left -= (pre + post);
986  
987  	/* Re-insert the node */
988  	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
989  	if (node)
990  		btrfs_panic(fs_info, -EEXIST,
991  			"zoned: inconsistency in ordered tree at offset %llu",
992  			    ordered->file_offset);
993  
994  	spin_unlock_irq(&tree->lock);
995  
996  	if (pre)
997  		ret = clone_ordered_extent(ordered, 0, pre);
998  	if (ret == 0 && post)
999  		ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
1000  					   post);
1001  
1002  	return ret;
1003  }
1004  
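/* Create the slab cache used for ordered extent allocations. */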
1005  int __init ordered_data_init(void)
1006  {
1007  	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
1008  				     sizeof(struct btrfs_ordered_extent), 0,
1009  				     SLAB_MEM_SPREAD,
1010  				     NULL);
1011  	if (!btrfs_ordered_extent_cache)
1012  		return -ENOMEM;
1013  
1014  	return 0;
1015  }
1016  
1017  void __cold ordered_data_exit(void)
1018  {
1019  	kmem_cache_destroy(btrfs_ordered_extent_cache);
1020  }
1021