xref: /linux/fs/btrfs/ordered-data.c (revision 7696286034ac72cf9b46499be1715ac62fd302c3)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/slab.h>
7 #include <linux/blkdev.h>
8 #include <linux/writeback.h>
9 #include <linux/sched/mm.h>
10 #include "messages.h"
11 #include "misc.h"
12 #include "ctree.h"
13 #include "transaction.h"
14 #include "btrfs_inode.h"
15 #include "extent_io.h"
16 #include "disk-io.h"
17 #include "compression.h"
18 #include "delalloc-space.h"
19 #include "qgroup.h"
20 #include "subpage.h"
21 #include "file.h"
22 #include "block-group.h"
23 
24 static struct kmem_cache *btrfs_ordered_extent_cache;
25 
26 static u64 entry_end(struct btrfs_ordered_extent *entry)
27 {
28 	if (entry->file_offset + entry->num_bytes < entry->file_offset)
29 		return (u64)-1;
30 	return entry->file_offset + entry->num_bytes;
31 }
32 
33 /* returns NULL if the insertion worked, or it returns the node it did find
34  * in the tree
35  */
36 static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
37 				   struct rb_node *node)
38 {
39 	struct rb_node **p = &root->rb_node;
40 	struct rb_node *parent = NULL;
41 	struct btrfs_ordered_extent *entry;
42 
43 	while (*p) {
44 		parent = *p;
45 		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
46 
47 		if (file_offset < entry->file_offset)
48 			p = &(*p)->rb_left;
49 		else if (file_offset >= entry_end(entry))
50 			p = &(*p)->rb_right;
51 		else
52 			return parent;
53 	}
54 
55 	rb_link_node(node, parent, p);
56 	rb_insert_color(node, root);
57 	return NULL;
58 }
59 
60 /*
61  * look for a given offset in the tree, and if it can't be found return the
62  * first lesser offset
63  */
64 static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
65 				     struct rb_node **prev_ret)
66 {
67 	struct rb_node *n = root->rb_node;
68 	struct rb_node *prev = NULL;
69 	struct rb_node *test;
70 	struct btrfs_ordered_extent *entry;
71 	struct btrfs_ordered_extent *prev_entry = NULL;
72 
73 	while (n) {
74 		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
75 		prev = n;
76 		prev_entry = entry;
77 
78 		if (file_offset < entry->file_offset)
79 			n = n->rb_left;
80 		else if (file_offset >= entry_end(entry))
81 			n = n->rb_right;
82 		else
83 			return n;
84 	}
85 	if (!prev_ret)
86 		return NULL;
87 
88 	while (prev && file_offset >= entry_end(prev_entry)) {
89 		test = rb_next(prev);
90 		if (!test)
91 			break;
92 		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
93 				      rb_node);
94 		if (file_offset < entry_end(prev_entry))
95 			break;
96 
97 		prev = test;
98 	}
99 	if (prev)
100 		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
101 				      rb_node);
102 	while (prev && file_offset < entry_end(prev_entry)) {
103 		test = rb_prev(prev);
104 		if (!test)
105 			break;
106 		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
107 				      rb_node);
108 		prev = test;
109 	}
110 	*prev_ret = prev;
111 	return NULL;
112 }
113 
114 static int btrfs_range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
115 				u64 len)
116 {
117 	if (file_offset + len <= entry->file_offset ||
118 	    entry->file_offset + entry->num_bytes <= file_offset)
119 		return 0;
120 	return 1;
121 }
122 
123 /*
124  * look find the first ordered struct that has this offset, otherwise
125  * the first one less than this offset
126  */
127 static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode,
128 						  u64 file_offset)
129 {
130 	struct rb_node *prev = NULL;
131 	struct rb_node *ret;
132 	struct btrfs_ordered_extent *entry;
133 
134 	if (inode->ordered_tree_last) {
135 		entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent,
136 				 rb_node);
137 		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
138 			return inode->ordered_tree_last;
139 	}
140 	ret = __tree_search(&inode->ordered_tree, file_offset, &prev);
141 	if (!ret)
142 		ret = prev;
143 	if (ret)
144 		inode->ordered_tree_last = ret;
145 	return ret;
146 }
147 
148 static struct btrfs_ordered_extent *alloc_ordered_extent(
149 			struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
150 			u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
151 			u64 offset, unsigned long flags, int compress_type)
152 {
153 	struct btrfs_ordered_extent *entry;
154 	int ret;
155 	u64 qgroup_rsv = 0;
156 	const bool is_nocow = (flags &
157 	       ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
158 
159 	/*
160 	 * For a NOCOW write we can free the qgroup reserve right now. For a COW
161 	 * one we transfer the reserved space from the inode's iotree into the
162 	 * ordered extent by calling btrfs_qgroup_release_data() and tracking
163 	 * the qgroup reserved amount in the ordered extent, so that later after
164 	 * completing the ordered extent, when running the data delayed ref it
165 	 * creates, we free the reserved data with btrfs_qgroup_free_refroot().
166 	 */
167 	if (is_nocow)
168 		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
169 	else
170 		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
171 
172 	if (ret < 0)
173 		return ERR_PTR(ret);
174 
175 	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
176 	if (!entry) {
177 		entry = ERR_PTR(-ENOMEM);
178 		goto out;
179 	}
180 
181 	entry->file_offset = file_offset;
182 	entry->num_bytes = num_bytes;
183 	entry->ram_bytes = ram_bytes;
184 	entry->disk_bytenr = disk_bytenr;
185 	entry->disk_num_bytes = disk_num_bytes;
186 	entry->offset = offset;
187 	entry->bytes_left = num_bytes;
188 	if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) {
189 		kmem_cache_free(btrfs_ordered_extent_cache, entry);
190 		entry = ERR_PTR(-ESTALE);
191 		goto out;
192 	}
193 	entry->inode = inode;
194 	entry->compress_type = compress_type;
195 	entry->truncated_len = (u64)-1;
196 	entry->qgroup_rsv = qgroup_rsv;
197 	entry->flags = flags;
198 	refcount_set(&entry->refs, 1);
199 	init_waitqueue_head(&entry->wait);
200 	INIT_LIST_HEAD(&entry->list);
201 	INIT_LIST_HEAD(&entry->log_list);
202 	INIT_LIST_HEAD(&entry->root_extent_list);
203 	INIT_LIST_HEAD(&entry->work_list);
204 	INIT_LIST_HEAD(&entry->bioc_list);
205 	init_completion(&entry->completion);
206 
207 	/*
208 	 * We don't need the count_max_extents here, we can assume that all of
209 	 * that work has been done at higher layers, so this is truly the
210 	 * smallest the extent is going to get.
211 	 */
212 	spin_lock(&inode->lock);
213 	btrfs_mod_outstanding_extents(inode, 1);
214 	spin_unlock(&inode->lock);
215 
216 out:
217 	if (IS_ERR(entry) && !is_nocow)
218 		btrfs_qgroup_free_refroot(inode->root->fs_info,
219 					  btrfs_root_id(inode->root),
220 					  qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
221 
222 	return entry;
223 }
224 
225 static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
226 {
227 	struct btrfs_inode *inode = entry->inode;
228 	struct btrfs_root *root = inode->root;
229 	struct btrfs_fs_info *fs_info = root->fs_info;
230 	struct rb_node *node;
231 
232 	trace_btrfs_ordered_extent_add(inode, entry);
233 
234 	percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
235 				 fs_info->delalloc_batch);
236 
237 	/* One ref for the tree. */
238 	refcount_inc(&entry->refs);
239 
240 	spin_lock(&inode->ordered_tree_lock);
241 	node = tree_insert(&inode->ordered_tree, entry->file_offset,
242 			   &entry->rb_node);
243 	if (unlikely(node))
244 		btrfs_panic(fs_info, -EEXIST,
245 				"inconsistency in ordered tree at offset %llu",
246 				entry->file_offset);
247 	spin_unlock(&inode->ordered_tree_lock);
248 
249 	spin_lock(&root->ordered_extent_lock);
250 	list_add_tail(&entry->root_extent_list,
251 		      &root->ordered_extents);
252 	root->nr_ordered_extents++;
253 	if (root->nr_ordered_extents == 1) {
254 		spin_lock(&fs_info->ordered_root_lock);
255 		BUG_ON(!list_empty(&root->ordered_root));
256 		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
257 		spin_unlock(&fs_info->ordered_root_lock);
258 	}
259 	spin_unlock(&root->ordered_extent_lock);
260 }
261 
262 /*
263  * Add an ordered extent to the per-inode tree.
264  *
265  * @inode:           Inode that this extent is for.
266  * @file_offset:     Logical offset in file where the extent starts.
267  * @num_bytes:       Logical length of extent in file.
268  * @ram_bytes:       Full length of unencoded data.
269  * @disk_bytenr:     Offset of extent on disk.
270  * @disk_num_bytes:  Size of extent on disk.
271  * @offset:          Offset into unencoded data where file data starts.
272  * @flags:           Flags specifying type of extent (1U << BTRFS_ORDERED_*).
273  * @compress_type:   Compression algorithm used for data.
274  *
275  * Most of these parameters correspond to &struct btrfs_file_extent_item. The
276  * tree is given a single reference on the ordered extent that was inserted, and
277  * the returned pointer is given a second reference.
278  *
279  * Return: the new ordered extent or error pointer.
280  */
281 struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
282 			struct btrfs_inode *inode, u64 file_offset,
283 			const struct btrfs_file_extent *file_extent, unsigned long flags)
284 {
285 	struct btrfs_ordered_extent *entry;
286 
287 	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
288 
289 	/*
290 	 * For regular writes, we just use the members in @file_extent.
291 	 *
292 	 * For NOCOW, we don't really care about the numbers except @start and
293 	 * file_extent->num_bytes, as we won't insert a file extent item at all.
294 	 *
295 	 * For PREALLOC, we do not use ordered extent members, but
296 	 * btrfs_mark_extent_written() handles everything.
297 	 *
298 	 * So here we always pass 0 as offset for NOCOW/PREALLOC ordered extents,
299 	 * or btrfs_split_ordered_extent() cannot handle it correctly.
300 	 */
301 	if (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)))
302 		entry = alloc_ordered_extent(inode, file_offset,
303 					     file_extent->num_bytes,
304 					     file_extent->num_bytes,
305 					     file_extent->disk_bytenr + file_extent->offset,
306 					     file_extent->num_bytes, 0, flags,
307 					     file_extent->compression);
308 	else
309 		entry = alloc_ordered_extent(inode, file_offset,
310 					     file_extent->num_bytes,
311 					     file_extent->ram_bytes,
312 					     file_extent->disk_bytenr,
313 					     file_extent->disk_num_bytes,
314 					     file_extent->offset, flags,
315 					     file_extent->compression);
316 	if (!IS_ERR(entry))
317 		insert_ordered_extent(entry);
318 	return entry;
319 }
320 
321 /*
322  * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
323  * when an ordered extent is finished.  If the list covers more than one
324  * ordered extent, it is split across multiples.
325  */
326 void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
327 			   struct btrfs_ordered_sum *sum)
328 {
329 	struct btrfs_inode *inode = entry->inode;
330 
331 	spin_lock(&inode->ordered_tree_lock);
332 	list_add_tail(&sum->list, &entry->list);
333 	spin_unlock(&inode->ordered_tree_lock);
334 }
335 
336 void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered)
337 {
338 	if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
339 		mapping_set_error(ordered->inode->vfs_inode.i_mapping, -EIO);
340 }
341 
342 static void finish_ordered_fn(struct btrfs_work *work)
343 {
344 	struct btrfs_ordered_extent *ordered_extent;
345 
346 	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
347 	btrfs_finish_ordered_io(ordered_extent);
348 }
349 
350 static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
351 				      struct folio *folio, u64 file_offset,
352 				      u64 len, bool uptodate)
353 {
354 	struct btrfs_inode *inode = ordered->inode;
355 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
356 
357 	lockdep_assert_held(&inode->ordered_tree_lock);
358 
359 	if (folio) {
360 		ASSERT(folio->mapping);
361 		ASSERT(folio_pos(folio) <= file_offset);
362 		ASSERT(file_offset + len <= folio_next_pos(folio));
363 
364 		/*
365 		 * Ordered flag indicates whether we still have
366 		 * pending io unfinished for the ordered extent.
367 		 *
368 		 * If it's not set, we need to skip to next range.
369 		 */
370 		if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len))
371 			return false;
372 		btrfs_folio_clear_ordered(fs_info, folio, file_offset, len);
373 	}
374 
375 	/* Now we're fine to update the accounting. */
376 	if (WARN_ON_ONCE(len > ordered->bytes_left)) {
377 		btrfs_crit(fs_info,
378 "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
379 			   btrfs_root_id(inode->root), btrfs_ino(inode),
380 			   ordered->file_offset, ordered->num_bytes,
381 			   len, ordered->bytes_left);
382 		ordered->bytes_left = 0;
383 	} else {
384 		ordered->bytes_left -= len;
385 	}
386 
387 	if (!uptodate)
388 		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
389 
390 	if (ordered->bytes_left)
391 		return false;
392 
393 	/*
394 	 * All the IO of the ordered extent is finished, we need to queue
395 	 * the finish_func to be executed.
396 	 */
397 	set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
398 	cond_wake_up(&ordered->wait);
399 	refcount_inc(&ordered->refs);
400 	trace_btrfs_ordered_extent_mark_finished(inode, ordered);
401 	return true;
402 }
403 
404 static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
405 {
406 	struct btrfs_inode *inode = ordered->inode;
407 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
408 	struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
409 		fs_info->endio_freespace_worker : fs_info->endio_write_workers;
410 
411 	btrfs_init_work(&ordered->work, finish_ordered_fn, NULL);
412 	btrfs_queue_work(wq, &ordered->work);
413 }
414 
415 void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
416 				 struct folio *folio, u64 file_offset, u64 len,
417 				 bool uptodate)
418 {
419 	struct btrfs_inode *inode = ordered->inode;
420 	bool ret;
421 
422 	trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
423 
424 	spin_lock(&inode->ordered_tree_lock);
425 	ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
426 					uptodate);
427 	spin_unlock(&inode->ordered_tree_lock);
428 
429 	/*
430 	 * If this is a COW write it means we created new extent maps for the
431 	 * range and they point to unwritten locations if we got an error either
432 	 * before submitting a bio or during IO.
433 	 *
434 	 * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
435 	 * are queuing its completion below. During completion, at
436 	 * btrfs_finish_one_ordered(), we will drop the extent maps for the
437 	 * unwritten extents.
438 	 *
439 	 * However because completion runs in a work queue we can end up having
440 	 * a fast fsync running before that. In the case of direct IO, once we
441 	 * unlock the inode the fsync might start, and we queue the completion
442 	 * before unlocking the inode. In the case of buffered IO when writeback
443 	 * finishes (end_bbio_data_write()) we queue the completion, so if the
444 	 * writeback was triggered by a fast fsync, the fsync might start
445 	 * logging before ordered extent completion runs in the work queue.
446 	 *
447 	 * The fast fsync will log file extent items based on the extent maps it
448 	 * finds, so if by the time it collects extent maps the ordered extent
449 	 * completion didn't happen yet, it will log file extent items that
450 	 * point to unwritten extents, resulting in a corruption if a crash
451 	 * happens and the log tree is replayed. Note that a fast fsync does not
452 	 * wait for completion of ordered extents in order to reduce latency.
453 	 *
454 	 * Set a flag in the inode so that the next fast fsync will wait for
455 	 * ordered extents to complete before starting to log.
456 	 */
457 	if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
458 		set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
459 
460 	if (ret)
461 		btrfs_queue_ordered_fn(ordered);
462 }
463 
464 /*
465  * Mark all ordered extents io inside the specified range finished.
466  *
467  * @folio:	 The involved folio for the operation.
468  *		 For uncompressed buffered IO, the folio status also needs to be
469  *		 updated to indicate whether the pending ordered io is finished.
470  *		 Can be NULL for direct IO and compressed write.
471  *		 For these cases, callers are ensured they won't execute the
472  *		 endio function twice.
473  *
474  * This function is called for endio, thus the range must have ordered
475  * extent(s) covering it.
476  */
477 void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
478 				    struct folio *folio, u64 file_offset,
479 				    u64 num_bytes, bool uptodate)
480 {
481 	struct rb_node *node;
482 	struct btrfs_ordered_extent *entry = NULL;
483 	u64 cur = file_offset;
484 	const u64 end = file_offset + num_bytes;
485 
486 	trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate);
487 
488 	spin_lock(&inode->ordered_tree_lock);
489 	while (cur < end) {
490 		u64 entry_end;
491 		u64 this_end;
492 		u64 len;
493 
494 		node = ordered_tree_search(inode, cur);
495 		/* No ordered extents at all */
496 		if (!node)
497 			break;
498 
499 		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
500 		entry_end = entry->file_offset + entry->num_bytes;
501 		/*
502 		 * |<-- OE --->|  |
503 		 *		  cur
504 		 * Go to next OE.
505 		 */
506 		if (cur >= entry_end) {
507 			node = rb_next(node);
508 			/* No more ordered extents, exit */
509 			if (!node)
510 				break;
511 			entry = rb_entry(node, struct btrfs_ordered_extent,
512 					 rb_node);
513 
514 			/* Go to next ordered extent and continue */
515 			cur = entry->file_offset;
516 			continue;
517 		}
518 		/*
519 		 * |	|<--- OE --->|
520 		 * cur
521 		 * Go to the start of OE.
522 		 */
523 		if (cur < entry->file_offset) {
524 			cur = entry->file_offset;
525 			continue;
526 		}
527 
528 		/*
529 		 * Now we are definitely inside one ordered extent.
530 		 *
531 		 * |<--- OE --->|
532 		 *	|
533 		 *	cur
534 		 */
535 		this_end = min(entry_end, end);
536 		len = this_end - cur;
537 		ASSERT(len < U32_MAX);
538 
539 		if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) {
540 			spin_unlock(&inode->ordered_tree_lock);
541 			btrfs_queue_ordered_fn(entry);
542 			spin_lock(&inode->ordered_tree_lock);
543 		}
544 		cur += len;
545 	}
546 	spin_unlock(&inode->ordered_tree_lock);
547 }
548 
549 /*
550  * Finish IO for one ordered extent across a given range.  The range can only
551  * contain one ordered extent.
552  *
553  * @cached:	 The cached ordered extent. If not NULL, we can skip the tree
554  *               search and use the ordered extent directly.
555  * 		 Will be also used to store the finished ordered extent.
556  * @file_offset: File offset for the finished IO
557  * @io_size:	 Length of the finish IO range
558  *
559  * Return true if the ordered extent is finished in the range, and update
560  * @cached.
561  * Return false otherwise.
562  *
563  * NOTE: The range can NOT cross multiple ordered extents.
564  * Thus caller should ensure the range doesn't cross ordered extents.
565  */
566 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
567 				    struct btrfs_ordered_extent **cached,
568 				    u64 file_offset, u64 io_size)
569 {
570 	struct rb_node *node;
571 	struct btrfs_ordered_extent *entry = NULL;
572 	bool finished = false;
573 
574 	spin_lock(&inode->ordered_tree_lock);
575 	if (cached && *cached) {
576 		entry = *cached;
577 		goto have_entry;
578 	}
579 
580 	node = ordered_tree_search(inode, file_offset);
581 	if (!node)
582 		goto out;
583 
584 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
585 have_entry:
586 	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
587 		goto out;
588 
589 	if (io_size > entry->bytes_left)
590 		btrfs_crit(inode->root->fs_info,
591 			   "bad ordered accounting left %llu size %llu",
592 		       entry->bytes_left, io_size);
593 
594 	entry->bytes_left -= io_size;
595 
596 	if (entry->bytes_left == 0) {
597 		/*
598 		 * Ensure only one caller can set the flag and finished_ret
599 		 * accordingly
600 		 */
601 		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
602 		/* test_and_set_bit implies a barrier */
603 		cond_wake_up_nomb(&entry->wait);
604 	}
605 out:
606 	if (finished && cached && entry) {
607 		*cached = entry;
608 		refcount_inc(&entry->refs);
609 		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
610 	}
611 	spin_unlock(&inode->ordered_tree_lock);
612 	return finished;
613 }
614 
615 /*
616  * used to drop a reference on an ordered extent.  This will free
617  * the extent if the last reference is dropped
618  */
619 void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
620 {
621 	trace_btrfs_ordered_extent_put(entry->inode, entry);
622 
623 	if (refcount_dec_and_test(&entry->refs)) {
624 		struct btrfs_ordered_sum *sum;
625 		struct btrfs_ordered_sum *tmp;
626 
627 		ASSERT(list_empty(&entry->root_extent_list));
628 		ASSERT(list_empty(&entry->log_list));
629 		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
630 		btrfs_add_delayed_iput(entry->inode);
631 		list_for_each_entry_safe(sum, tmp, &entry->list, list)
632 			kvfree(sum);
633 		kmem_cache_free(btrfs_ordered_extent_cache, entry);
634 	}
635 }
636 
637 /*
638  * remove an ordered extent from the tree.  No references are dropped
639  * and waiters are woken up.
640  */
641 void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
642 				 struct btrfs_ordered_extent *entry)
643 {
644 	struct btrfs_root *root = btrfs_inode->root;
645 	struct btrfs_fs_info *fs_info = root->fs_info;
646 	struct rb_node *node;
647 	bool pending;
648 	bool freespace_inode;
649 
650 	/*
651 	 * If this is a free space inode the thread has not acquired the ordered
652 	 * extents lockdep map.
653 	 */
654 	freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
655 
656 	btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
657 	/* This is paired with alloc_ordered_extent(). */
658 	spin_lock(&btrfs_inode->lock);
659 	btrfs_mod_outstanding_extents(btrfs_inode, -1);
660 	spin_unlock(&btrfs_inode->lock);
661 	if (root != fs_info->tree_root) {
662 		u64 release;
663 
664 		if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
665 			release = entry->disk_num_bytes;
666 		else
667 			release = entry->num_bytes;
668 		btrfs_delalloc_release_metadata(btrfs_inode, release,
669 						test_bit(BTRFS_ORDERED_IOERR,
670 							 &entry->flags));
671 	}
672 
673 	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
674 				 fs_info->delalloc_batch);
675 
676 	spin_lock(&btrfs_inode->ordered_tree_lock);
677 	node = &entry->rb_node;
678 	rb_erase(node, &btrfs_inode->ordered_tree);
679 	RB_CLEAR_NODE(node);
680 	if (btrfs_inode->ordered_tree_last == node)
681 		btrfs_inode->ordered_tree_last = NULL;
682 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
683 	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
684 	spin_unlock(&btrfs_inode->ordered_tree_lock);
685 
686 	/*
687 	 * The current running transaction is waiting on us, we need to let it
688 	 * know that we're complete and wake it up.
689 	 */
690 	if (pending) {
691 		struct btrfs_transaction *trans;
692 
693 		/*
694 		 * The checks for trans are just a formality, it should be set,
695 		 * but if it isn't we don't want to deref/assert under the spin
696 		 * lock, so be nice and check if trans is set, but ASSERT() so
697 		 * if it isn't set a developer will notice.
698 		 */
699 		spin_lock(&fs_info->trans_lock);
700 		trans = fs_info->running_transaction;
701 		if (trans)
702 			refcount_inc(&trans->use_count);
703 		spin_unlock(&fs_info->trans_lock);
704 
705 		ASSERT(trans || BTRFS_FS_ERROR(fs_info));
706 		if (trans) {
707 			if (atomic_dec_and_test(&trans->pending_ordered))
708 				wake_up(&trans->pending_wait);
709 			btrfs_put_transaction(trans);
710 		}
711 	}
712 
713 	btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
714 
715 	spin_lock(&root->ordered_extent_lock);
716 	list_del_init(&entry->root_extent_list);
717 	root->nr_ordered_extents--;
718 
719 	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
720 
721 	if (!root->nr_ordered_extents) {
722 		spin_lock(&fs_info->ordered_root_lock);
723 		BUG_ON(list_empty(&root->ordered_root));
724 		list_del_init(&root->ordered_root);
725 		spin_unlock(&fs_info->ordered_root_lock);
726 	}
727 	spin_unlock(&root->ordered_extent_lock);
728 	wake_up(&entry->wait);
729 	if (!freespace_inode)
730 		btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
731 }
732 
733 static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
734 {
735 	struct btrfs_ordered_extent *ordered;
736 
737 	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
738 	btrfs_start_ordered_extent(ordered);
739 	complete(&ordered->completion);
740 }
741 
742 /*
743  * Wait for all the ordered extents in a root. Use @bg as range or do whole
744  * range if it's NULL.
745  */
746 u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
747 			       const struct btrfs_block_group *bg)
748 {
749 	struct btrfs_fs_info *fs_info = root->fs_info;
750 	LIST_HEAD(splice);
751 	LIST_HEAD(skipped);
752 	LIST_HEAD(works);
753 	struct btrfs_ordered_extent *ordered, *next;
754 	u64 count = 0;
755 	u64 range_start, range_len;
756 	u64 range_end;
757 
758 	if (bg) {
759 		range_start = bg->start;
760 		range_len = bg->length;
761 	} else {
762 		range_start = 0;
763 		range_len = U64_MAX;
764 	}
765 	range_end = range_start + range_len;
766 
767 	mutex_lock(&root->ordered_extent_mutex);
768 	spin_lock(&root->ordered_extent_lock);
769 	list_splice_init(&root->ordered_extents, &splice);
770 	while (!list_empty(&splice) && nr) {
771 		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
772 					   root_extent_list);
773 
774 		if (range_end <= ordered->disk_bytenr ||
775 		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
776 			list_move_tail(&ordered->root_extent_list, &skipped);
777 			cond_resched_lock(&root->ordered_extent_lock);
778 			continue;
779 		}
780 
781 		list_move_tail(&ordered->root_extent_list,
782 			       &root->ordered_extents);
783 		refcount_inc(&ordered->refs);
784 		spin_unlock(&root->ordered_extent_lock);
785 
786 		btrfs_init_work(&ordered->flush_work,
787 				btrfs_run_ordered_extent_work, NULL);
788 		list_add_tail(&ordered->work_list, &works);
789 		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
790 
791 		cond_resched();
792 		if (nr != U64_MAX)
793 			nr--;
794 		count++;
795 		spin_lock(&root->ordered_extent_lock);
796 	}
797 	list_splice_tail(&skipped, &root->ordered_extents);
798 	list_splice_tail(&splice, &root->ordered_extents);
799 	spin_unlock(&root->ordered_extent_lock);
800 
801 	list_for_each_entry_safe(ordered, next, &works, work_list) {
802 		list_del_init(&ordered->work_list);
803 		wait_for_completion(&ordered->completion);
804 		btrfs_put_ordered_extent(ordered);
805 		cond_resched();
806 	}
807 	mutex_unlock(&root->ordered_extent_mutex);
808 
809 	return count;
810 }
811 
812 /*
813  * Wait for @nr ordered extents that intersect the @bg, or the whole range of
814  * the filesystem if @bg is NULL.
815  */
816 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
817 			      const struct btrfs_block_group *bg)
818 {
819 	struct btrfs_root *root;
820 	LIST_HEAD(splice);
821 	u64 done;
822 
823 	mutex_lock(&fs_info->ordered_operations_mutex);
824 	spin_lock(&fs_info->ordered_root_lock);
825 	list_splice_init(&fs_info->ordered_roots, &splice);
826 	while (!list_empty(&splice) && nr) {
827 		root = list_first_entry(&splice, struct btrfs_root,
828 					ordered_root);
829 		root = btrfs_grab_root(root);
830 		BUG_ON(!root);
831 		list_move_tail(&root->ordered_root,
832 			       &fs_info->ordered_roots);
833 		spin_unlock(&fs_info->ordered_root_lock);
834 
835 		done = btrfs_wait_ordered_extents(root, nr, bg);
836 		btrfs_put_root(root);
837 
838 		if (nr != U64_MAX)
839 			nr -= done;
840 
841 		spin_lock(&fs_info->ordered_root_lock);
842 	}
843 	list_splice_tail(&splice, &fs_info->ordered_roots);
844 	spin_unlock(&fs_info->ordered_root_lock);
845 	mutex_unlock(&fs_info->ordered_operations_mutex);
846 }
847 
848 /*
849  * Start IO and wait for a given ordered extent to finish.
850  *
851  * Wait on page writeback for all the pages in the extent but not in
852  * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the
853  * IO completion code to insert metadata into the btree corresponding to the extent.
854  */
855 void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
856 					    u64 nowriteback_start, u32 nowriteback_len)
857 {
858 	u64 start = entry->file_offset;
859 	u64 end = start + entry->num_bytes - 1;
860 	struct btrfs_inode *inode = entry->inode;
861 	bool freespace_inode;
862 
863 	trace_btrfs_ordered_extent_start(inode, entry);
864 
865 	/*
866 	 * If this is a free space inode do not take the ordered extents lockdep
867 	 * map.
868 	 */
869 	freespace_inode = btrfs_is_free_space_inode(inode);
870 
871 	/*
872 	 * pages in the range can be dirty, clean or writeback.  We
873 	 * start IO on any dirty ones so the wait doesn't stall waiting
874 	 * for the flusher thread to find them
875 	 */
876 	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
877 		if (!nowriteback_len) {
878 			filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
879 		} else {
880 			if (start < nowriteback_start)
881 				filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
882 							 nowriteback_start - 1);
883 			if (nowriteback_start + nowriteback_len < end)
884 				filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
885 							 nowriteback_start + nowriteback_len,
886 							 end);
887 		}
888 	}
889 
890 	if (!freespace_inode)
891 		btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
892 	wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
893 }
894 
895 /*
896  * Used to wait on ordered extents across a large range of bytes.
897  */
898 int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len)
899 {
900 	int ret = 0;
901 	int ret_wb = 0;
902 	u64 end;
903 	u64 orig_end;
904 	struct btrfs_ordered_extent *ordered;
905 
906 	if (start + len < start) {
907 		orig_end = OFFSET_MAX;
908 	} else {
909 		orig_end = start + len - 1;
910 		if (orig_end > OFFSET_MAX)
911 			orig_end = OFFSET_MAX;
912 	}
913 
914 	/* start IO across the range first to instantiate any delalloc
915 	 * extents
916 	 */
917 	ret = btrfs_fdatawrite_range(inode, start, orig_end);
918 	if (ret)
919 		return ret;
920 
921 	/*
922 	 * If we have a writeback error don't return immediately. Wait first
923 	 * for any ordered extents that haven't completed yet. This is to make
924 	 * sure no one can dirty the same page ranges and call writepages()
925 	 * before the ordered extents complete - to avoid failures (-EEXIST)
926 	 * when adding the new ordered extents to the ordered tree.
927 	 */
928 	ret_wb = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, orig_end);
929 
930 	end = orig_end;
931 	while (1) {
932 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
933 		if (!ordered)
934 			break;
935 		if (ordered->file_offset > orig_end) {
936 			btrfs_put_ordered_extent(ordered);
937 			break;
938 		}
939 		if (ordered->file_offset + ordered->num_bytes <= start) {
940 			btrfs_put_ordered_extent(ordered);
941 			break;
942 		}
943 		btrfs_start_ordered_extent(ordered);
944 		end = ordered->file_offset;
945 		/*
946 		 * If the ordered extent had an error save the error but don't
947 		 * exit without waiting first for all other ordered extents in
948 		 * the range to complete.
949 		 */
950 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
951 			ret = -EIO;
952 		btrfs_put_ordered_extent(ordered);
953 		if (end == 0 || end == start)
954 			break;
955 		end--;
956 	}
957 	return ret_wb ? ret_wb : ret;
958 }
959 
960 /*
961  * find an ordered extent corresponding to file_offset.  return NULL if
962  * nothing is found, otherwise take a reference on the extent and return it
963  */
964 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
965 							 u64 file_offset)
966 {
967 	struct rb_node *node;
968 	struct btrfs_ordered_extent *entry = NULL;
969 
970 	spin_lock(&inode->ordered_tree_lock);
971 	node = ordered_tree_search(inode, file_offset);
972 	if (!node)
973 		goto out;
974 
975 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
976 	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
977 		entry = NULL;
978 	if (entry) {
979 		refcount_inc(&entry->refs);
980 		trace_btrfs_ordered_extent_lookup(inode, entry);
981 	}
982 out:
983 	spin_unlock(&inode->ordered_tree_lock);
984 	return entry;
985 }
986 
987 /* Since the DIO code tries to lock a wide area we need to look for any ordered
988  * extents that exist in the range, rather than just the start of the range.
989  */
990 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
991 		struct btrfs_inode *inode, u64 file_offset, u64 len)
992 {
993 	struct rb_node *node;
994 	struct btrfs_ordered_extent *entry = NULL;
995 
996 	spin_lock(&inode->ordered_tree_lock);
997 	node = ordered_tree_search(inode, file_offset);
998 	if (!node) {
999 		node = ordered_tree_search(inode, file_offset + len);
1000 		if (!node)
1001 			goto out;
1002 	}
1003 
1004 	while (1) {
1005 		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
1006 		if (btrfs_range_overlaps(entry, file_offset, len))
1007 			break;
1008 
1009 		if (entry->file_offset >= file_offset + len) {
1010 			entry = NULL;
1011 			break;
1012 		}
1013 		entry = NULL;
1014 		node = rb_next(node);
1015 		if (!node)
1016 			break;
1017 	}
1018 out:
1019 	if (entry) {
1020 		refcount_inc(&entry->refs);
1021 		trace_btrfs_ordered_extent_lookup_range(inode, entry);
1022 	}
1023 	spin_unlock(&inode->ordered_tree_lock);
1024 	return entry;
1025 }
1026 
1027 /*
1028  * Adds all ordered extents to the given list. The list ends up sorted by the
1029  * file_offset of the ordered extents.
1030  */
1031 void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
1032 					   struct list_head *list)
1033 {
1034 	struct rb_node *n;
1035 
1036 	btrfs_assert_inode_locked(inode);
1037 
1038 	spin_lock(&inode->ordered_tree_lock);
1039 	for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) {
1040 		struct btrfs_ordered_extent *ordered;
1041 
1042 		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
1043 
1044 		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
1045 			continue;
1046 
1047 		ASSERT(list_empty(&ordered->log_list));
1048 		list_add_tail(&ordered->log_list, list);
1049 		refcount_inc(&ordered->refs);
1050 		trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
1051 	}
1052 	spin_unlock(&inode->ordered_tree_lock);
1053 }
1054 
1055 /*
1056  * lookup and return any extent before 'file_offset'.  NULL is returned
1057  * if none is found
1058  */
1059 struct btrfs_ordered_extent *
1060 btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
1061 {
1062 	struct rb_node *node;
1063 	struct btrfs_ordered_extent *entry = NULL;
1064 
1065 	spin_lock(&inode->ordered_tree_lock);
1066 	node = ordered_tree_search(inode, file_offset);
1067 	if (!node)
1068 		goto out;
1069 
1070 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
1071 	refcount_inc(&entry->refs);
1072 	trace_btrfs_ordered_extent_lookup_first(inode, entry);
1073 out:
1074 	spin_unlock(&inode->ordered_tree_lock);
1075 	return entry;
1076 }
1077 
1078 /*
1079  * Lookup the first ordered extent that overlaps the range
1080  * [@file_offset, @file_offset + @len).
1081  *
1082  * The difference between this and btrfs_lookup_first_ordered_extent() is
1083  * that this one won't return any ordered extent that does not overlap the range.
1084  * And the difference against btrfs_lookup_ordered_extent() is, this function
1085  * ensures the first ordered extent gets returned.
1086  */
1087 struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
1088 			struct btrfs_inode *inode, u64 file_offset, u64 len)
1089 {
1090 	struct rb_node *node;
1091 	struct rb_node *cur;
1092 	struct rb_node *prev;
1093 	struct rb_node *next;
1094 	struct btrfs_ordered_extent *entry = NULL;
1095 
1096 	spin_lock(&inode->ordered_tree_lock);
1097 	node = inode->ordered_tree.rb_node;
1098 	/*
1099 	 * Here we don't want to use tree_search() which will use tree->last
1100 	 * and screw up the search order.
1101 	 * And __tree_search() can't return the adjacent ordered extents
1102 	 * either, thus here we do our own search.
1103 	 */
1104 	while (node) {
1105 		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
1106 
1107 		if (file_offset < entry->file_offset) {
1108 			node = node->rb_left;
1109 		} else if (file_offset >= entry_end(entry)) {
1110 			node = node->rb_right;
1111 		} else {
1112 			/*
1113 			 * Direct hit, got an ordered extent that starts at
1114 			 * @file_offset
1115 			 */
1116 			goto out;
1117 		}
1118 	}
1119 	if (!entry) {
1120 		/* Empty tree */
1121 		goto out;
1122 	}
1123 
1124 	cur = &entry->rb_node;
1125 	/* We got an entry around @file_offset, check adjacent entries */
1126 	if (entry->file_offset < file_offset) {
1127 		prev = cur;
1128 		next = rb_next(cur);
1129 	} else {
1130 		prev = rb_prev(cur);
1131 		next = cur;
1132 	}
1133 	if (prev) {
1134 		entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
1135 		if (btrfs_range_overlaps(entry, file_offset, len))
1136 			goto out;
1137 	}
1138 	if (next) {
1139 		entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
1140 		if (btrfs_range_overlaps(entry, file_offset, len))
1141 			goto out;
1142 	}
1143 	/* No ordered extent in the range */
1144 	entry = NULL;
1145 out:
1146 	if (entry) {
1147 		refcount_inc(&entry->refs);
1148 		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
1149 	}
1150 
1151 	spin_unlock(&inode->ordered_tree_lock);
1152 	return entry;
1153 }
1154 
1155 /*
1156  * Lock the passed range and ensures all pending ordered extents in it are run
1157  * to completion.
1158  *
1159  * @inode:        Inode whose ordered tree is to be searched
1160  * @start:        Beginning of range to flush
1161  * @end:          Last byte of range to lock
1162  * @cached_state: If passed, will return the extent state responsible for the
1163  *                locked range. It's the caller's responsibility to free the
1164  *                cached state.
1165  *
1166  * Always return with the given range locked, ensuring after it's called no
1167  * order extent can be pending.
1168  */
1169 void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
1170 					u64 end,
1171 					struct extent_state **cached_state)
1172 {
1173 	struct btrfs_ordered_extent *ordered;
1174 	struct extent_state *cache = NULL;
1175 	struct extent_state **cachedp = &cache;
1176 
1177 	if (cached_state)
1178 		cachedp = cached_state;
1179 
1180 	while (1) {
1181 		btrfs_lock_extent(&inode->io_tree, start, end, cachedp);
1182 		ordered = btrfs_lookup_ordered_range(inode, start,
1183 						     end - start + 1);
1184 		if (!ordered) {
1185 			/*
1186 			 * If no external cached_state has been passed then
1187 			 * decrement the extra ref taken for cachedp since we
1188 			 * aren't exposing it outside of this function
1189 			 */
1190 			if (!cached_state)
1191 				refcount_dec(&cache->refs);
1192 			break;
1193 		}
1194 		btrfs_unlock_extent(&inode->io_tree, start, end, cachedp);
1195 		btrfs_start_ordered_extent(ordered);
1196 		btrfs_put_ordered_extent(ordered);
1197 	}
1198 }
1199 
1200 /*
1201  * Lock the passed range and ensure all pending ordered extents in it are run
1202  * to completion in nowait mode.
1203  *
1204  * Return true if btrfs_lock_ordered_range does not return any extents,
1205  * otherwise false.
1206  */
1207 bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
1208 				  struct extent_state **cached_state)
1209 {
1210 	struct btrfs_ordered_extent *ordered;
1211 
1212 	if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state))
1213 		return false;
1214 
1215 	ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
1216 	if (!ordered)
1217 		return true;
1218 
1219 	btrfs_put_ordered_extent(ordered);
1220 	btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
1221 
1222 	return false;
1223 }
1224 
1225 /* Split out a new ordered extent for this first @len bytes of @ordered. */
1226 struct btrfs_ordered_extent *btrfs_split_ordered_extent(
1227 			struct btrfs_ordered_extent *ordered, u64 len)
1228 {
1229 	struct btrfs_inode *inode = ordered->inode;
1230 	struct btrfs_root *root = inode->root;
1231 	struct btrfs_fs_info *fs_info = root->fs_info;
1232 	u64 file_offset = ordered->file_offset;
1233 	u64 disk_bytenr = ordered->disk_bytenr;
1234 	unsigned long flags = ordered->flags;
1235 	struct btrfs_ordered_sum *sum, *tmpsum;
1236 	struct btrfs_ordered_extent *new;
1237 	struct rb_node *node;
1238 	u64 offset = 0;
1239 
1240 	trace_btrfs_ordered_extent_split(inode, ordered);
1241 
1242 	ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
1243 
1244 	/*
1245 	 * The entire bio must be covered by the ordered extent, but we can't
1246 	 * reduce the original extent to a zero length either.
1247 	 */
1248 	if (WARN_ON_ONCE(len >= ordered->num_bytes))
1249 		return ERR_PTR(-EINVAL);
1250 	/*
1251 	 * If our ordered extent had an error there's no point in continuing.
1252 	 * The error may have come from a transaction abort done either by this
1253 	 * task or some other concurrent task, and the transaction abort path
1254 	 * iterates over all existing ordered extents and sets the flag
1255 	 * BTRFS_ORDERED_IOERR on them.
1256 	 */
1257 	if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
1258 		const int fs_error = BTRFS_FS_ERROR(fs_info);
1259 
1260 		return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
1261 	}
1262 	/* We cannot split partially completed ordered extents. */
1263 	if (ordered->bytes_left) {
1264 		ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
1265 		if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
1266 			return ERR_PTR(-EINVAL);
1267 	}
1268 	/* We cannot split a compressed ordered extent. */
1269 	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
1270 		return ERR_PTR(-EINVAL);
1271 
1272 	new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
1273 				   len, 0, flags, ordered->compress_type);
1274 	if (IS_ERR(new))
1275 		return new;
1276 
1277 	/* One ref for the tree. */
1278 	refcount_inc(&new->refs);
1279 
1280 	/*
1281 	 * Take the root's ordered_extent_lock to avoid a race with
1282 	 * btrfs_wait_ordered_extents() when updating the disk_bytenr and
1283 	 * disk_num_bytes fields of the ordered extent below.
1284 	 *
1285 	 * There's no concern about a previous caller of
1286 	 * btrfs_wait_ordered_extents() getting the trimmed ordered extent
1287 	 * before we insert the new one, because even if it gets the ordered
1288 	 * extent before it's trimmed and the new one inserted, right before it
1289 	 * uses it or during its use, the ordered extent might have been
1290 	 * trimmed in the meanwhile, and it missed the new ordered extent.
1291 	 * There's no way around this and it's harmless for current use cases,
1292 	 * so we take the root's ordered_extent_lock to fix that race during
1293 	 * trimming and silence tools like KCSAN.
1294 	 */
1295 	spin_lock_irq(&root->ordered_extent_lock);
1296 	spin_lock(&inode->ordered_tree_lock);
1297 
1298 	/*
1299 	 * We don't have overlapping ordered extents (that would imply double
1300 	 * allocation of extents) and we checked above that the split length
1301 	 * does not cross the ordered extent's num_bytes field, so there's
1302 	 * no need to remove it and re-insert it in the tree.
1303 	 */
1304 	ordered->file_offset += len;
1305 	ordered->disk_bytenr += len;
1306 	ordered->num_bytes -= len;
1307 	ordered->disk_num_bytes -= len;
1308 	ordered->ram_bytes -= len;
1309 
1310 	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
1311 		ASSERT(ordered->bytes_left == 0);
1312 		new->bytes_left = 0;
1313 	} else {
1314 		ordered->bytes_left -= len;
1315 	}
1316 
1317 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
1318 		if (ordered->truncated_len > len) {
1319 			ordered->truncated_len -= len;
1320 		} else {
1321 			new->truncated_len = ordered->truncated_len;
1322 			ordered->truncated_len = 0;
1323 		}
1324 	}
1325 
1326 	list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
1327 		if (offset == len)
1328 			break;
1329 		list_move_tail(&sum->list, &new->list);
1330 		offset += sum->len;
1331 	}
1332 
1333 	node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node);
1334 	if (unlikely(node))
1335 		btrfs_panic(fs_info, -EEXIST,
1336 			"inconsistency in ordered tree at offset %llu after split",
1337 			new->file_offset);
1338 	spin_unlock(&inode->ordered_tree_lock);
1339 
1340 	list_add_tail(&new->root_extent_list, &root->ordered_extents);
1341 	root->nr_ordered_extents++;
1342 	spin_unlock_irq(&root->ordered_extent_lock);
1343 	return new;
1344 }
1345 
1346 int __init ordered_data_init(void)
1347 {
1348 	btrfs_ordered_extent_cache = KMEM_CACHE(btrfs_ordered_extent, 0);
1349 	if (!btrfs_ordered_extent_cache)
1350 		return -ENOMEM;
1351 
1352 	return 0;
1353 }
1354 
1355 void __cold ordered_data_exit(void)
1356 {
1357 	kmem_cache_destroy(btrfs_ordered_extent_cache);
1358 }
1359