xref: /linux/fs/btrfs/tree-log.c (revision a4eb44a6435d6d8f9e642407a4a06f65eb90ca04)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/blkdev.h>
9 #include <linux/list_sort.h>
10 #include <linux/iversion.h>
11 #include "misc.h"
12 #include "ctree.h"
13 #include "tree-log.h"
14 #include "disk-io.h"
15 #include "locking.h"
16 #include "print-tree.h"
17 #include "backref.h"
18 #include "compression.h"
19 #include "qgroup.h"
20 #include "block-group.h"
21 #include "space-info.h"
22 #include "zoned.h"
23 #include "inode-item.h"
24 
25 /* magic values for the inode_only field in btrfs_log_inode:
26  *
27  * LOG_INODE_ALL means to log everything
28  * LOG_INODE_EXISTS means to log just enough to recreate the inode
29  * during log replay
30  */
31 enum {
32 	LOG_INODE_ALL,
33 	LOG_INODE_EXISTS,
34 	LOG_OTHER_INODE,	/* log a conflicting inode at LOG_INODE_EXISTS level */
35 	LOG_OTHER_INODE_ALL,	/* log a conflicting inode at LOG_INODE_ALL level */
36 };
37 
38 /*
39  * directory trouble cases
40  *
41  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
42  * log, we must force a full commit before doing an fsync of the directory
43  * where the unlink was done.
44  * ---> record transid of last unlink/rename per directory
45  *
46  * mkdir foo/some_dir
47  * normal commit
48  * rename foo/some_dir foo2/some_dir
49  * mkdir foo/some_dir
50  * fsync foo/some_dir/some_file
51  *
52  * The fsync above will unlink the original some_dir without recording
53  * it in its new location (foo2).  After a crash, some_dir will be gone
54  * unless the fsync of some_file forces a full commit
55  *
56  * 2) we must log any new names for any file or dir that is in the fsync
57  * log. ---> check inode while renaming/linking.
58  *
59  * 2a) we must log any new names for any file or dir during rename
60  * when the directory they are being removed from was logged.
61  * ---> check inode and old parent dir during rename
62  *
63  *  2a is actually the more important variant.  Without the extra logging
64  *  a crash might unlink the old name without recreating the new one
65  *
66  * 3) after a crash, we must go through any directories with a link count
67  * of zero and redo the rm -rf
68  *
69  * mkdir f1/foo
70  * normal commit
71  * rm -rf f1/foo
72  * fsync(f1)
73  *
74  * The directory f1/foo was fully removed from the FS, but fsync was never
75  * called on foo, only its parent dir f1.  After a crash the rm -rf must
76  * be replayed.  This must be able to recurse down the entire
77  * directory tree.  The inode link count fixup code takes care of the
78  * ugly details.
79  */
80 
81 /*
82  * stages for the tree walking.  The first
83  * stage (0) is to only pin down the blocks we find;
84  * the second stage (1) is to make sure that all the inodes
85  * we find in the log are created in the subvolume.
86  *
87  * The last stage is to deal with directories and links and extents
88  * and all the other fun semantics
89  */
90 enum {
91 	LOG_WALK_PIN_ONLY,
92 	LOG_WALK_REPLAY_INODES,
93 	LOG_WALK_REPLAY_DIR_INDEX,
94 	LOG_WALK_REPLAY_ALL,
95 };
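
/*
 * Roughly how these stages drive recovery (a simplified sketch of
 * btrfs_recover_log_trees(), later in this file; error handling and the
 * per-subvolume iteration are elided): one pin-only walk, then one
 * replay walk per remaining stage.
 *
 *	struct walk_control wc = {
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *	};
 *
 *	ret = walk_log_tree(trans, log, &wc);
 *	wc.process_func = replay_one_buffer;
 *	for (wc.stage = LOG_WALK_REPLAY_INODES;
 *	     wc.stage <= LOG_WALK_REPLAY_ALL && !ret; wc.stage++)
 *		ret = walk_log_tree(trans, log, &wc);
 */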
96 
97 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
98 			   struct btrfs_inode *inode,
99 			   int inode_only,
100 			   struct btrfs_log_ctx *ctx);
101 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
102 			     struct btrfs_root *root,
103 			     struct btrfs_path *path, u64 objectid);
104 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
105 				       struct btrfs_root *root,
106 				       struct btrfs_root *log,
107 				       struct btrfs_path *path,
108 				       u64 dirid, int del_all);
109 static void wait_log_commit(struct btrfs_root *root, int transid);
110 
111 /*
112  * tree logging is a special write ahead log used to make sure that
113  * fsyncs and O_SYNCs can happen without doing full tree commits.
114  *
115  * Full tree commits are expensive because they require commonly
116  * modified blocks to be recowed, creating many dirty pages in the
117  * extent tree and a 4x-6x higher write load than ext3.
118  *
119  * Instead of doing a tree commit on every fsync, we use the
120  * key ranges and transaction ids to find items for a given file or directory
121  * that have changed in this transaction.  Those items are copied into
122  * a special tree (one per subvolume root), that tree is written to disk
123  * and then the fsync is considered complete.
124  *
125  * After a crash, items are copied out of the log-tree back into the
126  * subvolume tree.  Any file data extents found are recorded in the extent
127  * allocation tree, and the log-tree freed.
128  *
129  * The log tree is read three times: once to pin down all the extents it is
130  * using in RAM, once to create all the inodes logged in the tree,
131  * and once to do all the other items.
132  */
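
/*
 * A rough sketch of that fast fsync flow. The real entry point is
 * btrfs_sync_file() in file.c; locking, no-log shortcuts and most error
 * handling are elided, so treat this as an illustration rather than the
 * exact sequence:
 *
 *	struct btrfs_log_ctx ctx;
 *
 *	btrfs_init_log_ctx(&ctx, inode);
 *	trans = btrfs_start_transaction(root, 0);
 *	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);
 *	if (ret != 0)
 *		ret = btrfs_commit_transaction(trans);
 */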
133 
134 /*
135  * start a sub transaction and set up the log tree.
136  * This increments the log tree writer count to make the people
137  * syncing the tree wait for us to finish
138  */
139 static int start_log_trans(struct btrfs_trans_handle *trans,
140 			   struct btrfs_root *root,
141 			   struct btrfs_log_ctx *ctx)
142 {
143 	struct btrfs_fs_info *fs_info = root->fs_info;
144 	struct btrfs_root *tree_root = fs_info->tree_root;
145 	const bool zoned = btrfs_is_zoned(fs_info);
146 	int ret = 0;
147 	bool created = false;
148 
149 	/*
150 	 * First check if the log root tree was already created. If not, create
151 	 * it before locking the root's log_mutex, just to keep lockdep happy.
152 	 */
153 	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
154 		mutex_lock(&tree_root->log_mutex);
155 		if (!fs_info->log_root_tree) {
156 			ret = btrfs_init_log_root_tree(trans, fs_info);
157 			if (!ret) {
158 				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
159 				created = true;
160 			}
161 		}
162 		mutex_unlock(&tree_root->log_mutex);
163 		if (ret)
164 			return ret;
165 	}
166 
167 	mutex_lock(&root->log_mutex);
168 
169 again:
170 	if (root->log_root) {
171 		int index = (root->log_transid + 1) % 2;
172 
173 		if (btrfs_need_log_full_commit(trans)) {
174 			ret = -EAGAIN;
175 			goto out;
176 		}
177 
178 		if (zoned && atomic_read(&root->log_commit[index])) {
179 			wait_log_commit(root, root->log_transid - 1);
180 			goto again;
181 		}
182 
183 		if (!root->log_start_pid) {
184 			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
185 			root->log_start_pid = current->pid;
186 		} else if (root->log_start_pid != current->pid) {
187 			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
188 		}
189 	} else {
190 		/*
191 		 * This means fs_info->log_root_tree was already created
192 		 * for some other FS trees. Do the full commit so as not to
193 		 * mix nodes from multiple log transactions, which would break
194 		 * sequential writing.
195 		 */
196 		if (zoned && !created) {
197 			ret = -EAGAIN;
198 			goto out;
199 		}
200 
201 		ret = btrfs_add_log_tree(trans, root);
202 		if (ret)
203 			goto out;
204 
205 		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
206 		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
207 		root->log_start_pid = current->pid;
208 	}
209 
210 	atomic_inc(&root->log_writers);
211 	if (!ctx->logging_new_name) {
212 		int index = root->log_transid % 2;
213 		list_add_tail(&ctx->list, &root->log_ctxs[index]);
214 		ctx->log_transid = root->log_transid;
215 	}
216 
217 out:
218 	mutex_unlock(&root->log_mutex);
219 	return ret;
220 }
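
/*
 * Sketch of how a caller pairs this with btrfs_end_log_trans(); the real
 * user is btrfs_log_inode_parent(), later in this file (error handling
 * elided):
 *
 *	ret = start_log_trans(trans, root, ctx);
 *	if (ret)
 *		return ret;
 *	ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
 *	btrfs_end_log_trans(root);
 */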
221 
222 /*
223  * returns 0 if there was a log transaction running and we were able
224  * to join, or returns -ENOENT if there were no transactions
225  * in progress
226  */
227 static int join_running_log_trans(struct btrfs_root *root)
228 {
229 	const bool zoned = btrfs_is_zoned(root->fs_info);
230 	int ret = -ENOENT;
231 
232 	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
233 		return ret;
234 
235 	mutex_lock(&root->log_mutex);
236 again:
237 	if (root->log_root) {
238 		int index = (root->log_transid + 1) % 2;
239 
240 		ret = 0;
241 		if (zoned && atomic_read(&root->log_commit[index])) {
242 			wait_log_commit(root, root->log_transid - 1);
243 			goto again;
244 		}
245 		atomic_inc(&root->log_writers);
246 	}
247 	mutex_unlock(&root->log_mutex);
248 	return ret;
249 }
250 
251 /*
252  * This either makes the current running log transaction wait
253  * until you call btrfs_end_log_trans() or it makes any future
254  * log transactions wait until you call btrfs_end_log_trans()
255  */
256 void btrfs_pin_log_trans(struct btrfs_root *root)
257 {
258 	atomic_inc(&root->log_writers);
259 }
260 
261 /*
262  * indicate we're done making changes to the log tree
263  * and wake up anyone waiting to do a sync
264  */
265 void btrfs_end_log_trans(struct btrfs_root *root)
266 {
267 	if (atomic_dec_and_test(&root->log_writers)) {
268 		/* atomic_dec_and_test implies a barrier */
269 		cond_wake_up_nomb(&root->log_writer_wait);
270 	}
271 }
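
/*
 * The rename code in inode.c, for example, brackets its log updates with
 * this pair (a simplified sketch; the elided "..." steps are where the
 * actual name updates happen):
 *
 *	btrfs_pin_log_trans(root);
 *	... unlink the old name, add the new name ...
 *	btrfs_log_new_name(trans, ...);
 *	btrfs_end_log_trans(root);
 */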
272 
273 static int btrfs_write_tree_block(struct extent_buffer *buf)
274 {
275 	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
276 					buf->start + buf->len - 1);
277 }
278 
279 static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
280 {
281 	filemap_fdatawait_range(buf->pages[0]->mapping,
282 			        buf->start, buf->start + buf->len - 1);
283 }
284 
285 /*
286  * the walk control struct is used to pass state down the chain when
287  * processing the log tree.  The stage field tells us which part
288  * of the log tree processing we are currently doing.  The others
289  * are state fields used for that specific part
290  */
291 struct walk_control {
292 	/* should we free the extent on disk when done?  This is used
293 	 * at transaction commit time while freeing a log tree
294 	 */
295 	int free;
296 
297 	/* should we write out the extent buffer?  This is used
298 	 * while flushing the log tree to disk during a sync
299 	 */
300 	int write;
301 
302 	/* should we wait for the extent buffer io to finish?  Also used
303 	 * while flushing the log tree to disk for a sync
304 	 */
305 	int wait;
306 
307 	/* pin only walk, we record which extents on disk belong to the
308 	 * log trees
309 	 */
310 	int pin;
311 
312 	/* what stage of the replay code we're currently in */
313 	int stage;
314 
315 	/*
316 	 * Ignore any items from the inode currently being processed. Needs
317 	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
318 	 * the LOG_WALK_REPLAY_INODES stage.
319 	 */
320 	bool ignore_cur_inode;
321 
322 	/* the root we are currently replaying */
323 	struct btrfs_root *replay_dest;
324 
325 	/* the trans handle for the current replay */
326 	struct btrfs_trans_handle *trans;
327 
328 	/* the function that gets used to process blocks we find in the
329 	 * tree.  Note the extent_buffer might not be up to date when it is
330 	 * passed in, and it must be checked or read if you need the data
331 	 * inside it
332 	 */
333 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
334 			    struct walk_control *wc, u64 gen, int level);
335 };
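
/*
 * Example configurations, as used elsewhere in this file (a sketch):
 * pinning extents while recovering a log, and freeing a log tree at
 * transaction commit time.
 *
 *	struct walk_control pin_wc = {
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.trans = trans,
 *		.pin = 1,
 *	};
 *
 *	struct walk_control free_wc = {
 *		.process_func = process_one_buffer,
 *		.free = 1,
 *	};
 */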
336 
337 /*
338  * process_func used to pin down extents, write them or wait on them
339  */
340 static int process_one_buffer(struct btrfs_root *log,
341 			      struct extent_buffer *eb,
342 			      struct walk_control *wc, u64 gen, int level)
343 {
344 	struct btrfs_fs_info *fs_info = log->fs_info;
345 	int ret = 0;
346 
347 	/*
348 	 * If this fs is mixed then we need to be able to process the leaves to
349 	 * pin down any logged extents, so we have to read the block.
350 	 */
351 	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
352 		ret = btrfs_read_buffer(eb, gen, level, NULL);
353 		if (ret)
354 			return ret;
355 	}
356 
357 	if (wc->pin)
358 		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
359 						      eb->len);
360 
361 	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
362 		if (wc->pin && btrfs_header_level(eb) == 0)
363 			ret = btrfs_exclude_logged_extents(eb);
364 		if (wc->write)
365 			btrfs_write_tree_block(eb);
366 		if (wc->wait)
367 			btrfs_wait_tree_block_writeback(eb);
368 	}
369 	return ret;
370 }
371 
372 static int do_overwrite_item(struct btrfs_trans_handle *trans,
373 			     struct btrfs_root *root,
374 			     struct btrfs_path *path,
375 			     struct extent_buffer *eb, int slot,
376 			     struct btrfs_key *key)
377 {
378 	int ret;
379 	u32 item_size;
380 	u64 saved_i_size = 0;
381 	int save_old_i_size = 0;
382 	unsigned long src_ptr;
383 	unsigned long dst_ptr;
384 	int overwrite_root = 0;
385 	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
386 
387 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
388 		overwrite_root = 1;
389 
390 	item_size = btrfs_item_size(eb, slot);
391 	src_ptr = btrfs_item_ptr_offset(eb, slot);
392 
393 	/* Our caller must have done a search for the key for us. */
394 	ASSERT(path->nodes[0] != NULL);
395 
396 	/*
397 	 * And the slot must point to the exact key or the slot where the key
398 	 * should be at (the first item with a key greater than 'key')
399 	 */
400 	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
401 		struct btrfs_key found_key;
402 
403 		btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
404 		ret = btrfs_comp_cpu_keys(&found_key, key);
405 		ASSERT(ret >= 0);
406 	} else {
407 		ret = 1;
408 	}
409 
410 	if (ret == 0) {
411 		char *src_copy;
412 		char *dst_copy;
413 		u32 dst_size = btrfs_item_size(path->nodes[0],
414 						  path->slots[0]);
415 		if (dst_size != item_size)
416 			goto insert;
417 
418 		if (item_size == 0) {
419 			btrfs_release_path(path);
420 			return 0;
421 		}
422 		dst_copy = kmalloc(item_size, GFP_NOFS);
423 		src_copy = kmalloc(item_size, GFP_NOFS);
424 		if (!dst_copy || !src_copy) {
425 			btrfs_release_path(path);
426 			kfree(dst_copy);
427 			kfree(src_copy);
428 			return -ENOMEM;
429 		}
430 
431 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
432 
433 		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
434 		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
435 				   item_size);
436 		ret = memcmp(dst_copy, src_copy, item_size);
437 
438 		kfree(dst_copy);
439 		kfree(src_copy);
440 		/*
441 		 * they have the same contents, just return. This saves
442 		 * us from cowing blocks in the destination tree and doing
443 		 * extra writes that may not have been done by a previous
444 		 * sync
445 		 */
446 		if (ret == 0) {
447 			btrfs_release_path(path);
448 			return 0;
449 		}
450 
451 		/*
452 		 * We need to load the old nbytes into the inode so when we
453 		 * replay the extents we've logged we get the right nbytes.
454 		 */
455 		if (inode_item) {
456 			struct btrfs_inode_item *item;
457 			u64 nbytes;
458 			u32 mode;
459 
460 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
461 					      struct btrfs_inode_item);
462 			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
463 			item = btrfs_item_ptr(eb, slot,
464 					      struct btrfs_inode_item);
465 			btrfs_set_inode_nbytes(eb, item, nbytes);
466 
467 			/*
468 			 * If this is a directory we need to reset the i_size to
469 			 * 0 so that we can set it up properly when replaying
470 			 * the rest of the items in this log.
471 			 */
472 			mode = btrfs_inode_mode(eb, item);
473 			if (S_ISDIR(mode))
474 				btrfs_set_inode_size(eb, item, 0);
475 		}
476 	} else if (inode_item) {
477 		struct btrfs_inode_item *item;
478 		u32 mode;
479 
480 		/*
481 		 * New inode, set nbytes to 0 so that nbytes comes out
482 		 * properly when we replay the extents.
483 		 */
484 		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
485 		btrfs_set_inode_nbytes(eb, item, 0);
486 
487 		/*
488 		 * If this is a directory we need to reset the i_size to 0 so
489 		 * that we can set it up properly when replaying the rest of
490 		 * the items in this log.
491 		 */
492 		mode = btrfs_inode_mode(eb, item);
493 		if (S_ISDIR(mode))
494 			btrfs_set_inode_size(eb, item, 0);
495 	}
496 insert:
497 	btrfs_release_path(path);
498 	/* try to insert the key into the destination tree */
499 	path->skip_release_on_error = 1;
500 	ret = btrfs_insert_empty_item(trans, root, path,
501 				      key, item_size);
502 	path->skip_release_on_error = 0;
503 
504 	/* make sure any existing item is the correct size */
505 	if (ret == -EEXIST || ret == -EOVERFLOW) {
506 		u32 found_size;
507 		found_size = btrfs_item_size(path->nodes[0],
508 						path->slots[0]);
509 		if (found_size > item_size)
510 			btrfs_truncate_item(path, item_size, 1);
511 		else if (found_size < item_size)
512 			btrfs_extend_item(path, item_size - found_size);
513 	} else if (ret) {
514 		return ret;
515 	}
516 	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
517 					path->slots[0]);
518 
519 	/* don't overwrite an existing inode if the generation number
520 	 * was logged as zero.  This is done when the tree logging code
521 	 * is just logging an inode to make sure it exists after recovery.
522 	 *
523 	 * Also, don't overwrite i_size on directories during replay.
524 	 * log replay inserts and removes directory items based on the
525 	 * state of the tree found in the subvolume, and i_size is modified
526 	 * as it goes
527 	 */
528 	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
529 		struct btrfs_inode_item *src_item;
530 		struct btrfs_inode_item *dst_item;
531 
532 		src_item = (struct btrfs_inode_item *)src_ptr;
533 		dst_item = (struct btrfs_inode_item *)dst_ptr;
534 
535 		if (btrfs_inode_generation(eb, src_item) == 0) {
536 			struct extent_buffer *dst_eb = path->nodes[0];
537 			const u64 ino_size = btrfs_inode_size(eb, src_item);
538 
539 			/*
540 			 * For regular files an ino_size == 0 is used only when
541 			 * logging that an inode exists, as part of a directory
542 			 * fsync, and the inode wasn't fsynced before. In this
543 			 * case don't set the size of the inode in the fs/subvol
544 			 * tree, otherwise we would be throwing valid data away.
545 			 */
546 			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
547 			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
548 			    ino_size != 0)
549 				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
550 			goto no_copy;
551 		}
552 
553 		if (overwrite_root &&
554 		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
555 		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
556 			save_old_i_size = 1;
557 			saved_i_size = btrfs_inode_size(path->nodes[0],
558 							dst_item);
559 		}
560 	}
561 
562 	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
563 			   src_ptr, item_size);
564 
565 	if (save_old_i_size) {
566 		struct btrfs_inode_item *dst_item;
567 		dst_item = (struct btrfs_inode_item *)dst_ptr;
568 		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
569 	}
570 
571 	/* make sure the generation is filled in */
572 	if (key->type == BTRFS_INODE_ITEM_KEY) {
573 		struct btrfs_inode_item *dst_item;
574 		dst_item = (struct btrfs_inode_item *)dst_ptr;
575 		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
576 			btrfs_set_inode_generation(path->nodes[0], dst_item,
577 						   trans->transid);
578 		}
579 	}
580 no_copy:
581 	btrfs_mark_buffer_dirty(path->nodes[0]);
582 	btrfs_release_path(path);
583 	return 0;
584 }
585 
586 /*
587  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
588  * to the src data we are copying out.
589  *
590  * root is the tree we are copying into, and path is a scratch
591  * path for use in this function (it should be released on entry and
592  * will be released on exit).
593  *
594  * If the key is already in the destination tree the existing item is
595  * overwritten.  If the existing item isn't big enough, it is extended.
596  * If it is too large, it is truncated.
597  *
598  * If the key isn't in the destination yet, a new item is inserted.
599  */
600 static int overwrite_item(struct btrfs_trans_handle *trans,
601 			  struct btrfs_root *root,
602 			  struct btrfs_path *path,
603 			  struct extent_buffer *eb, int slot,
604 			  struct btrfs_key *key)
605 {
606 	int ret;
607 
608 	/* Look for the key in the destination tree. */
609 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
610 	if (ret < 0)
611 		return ret;
612 
613 	return do_overwrite_item(trans, root, path, eb, slot, key);
614 }
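
/*
 * Typical replay-time use (replay_one_extent() below is a real call site):
 * copy one item from a log tree leaf (eb/slot/key) into the subvolume tree.
 *
 *	ret = overwrite_item(trans, root, path, eb, slot, key);
 *	if (ret)
 *		goto out;
 */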
615 
616 /*
617  * simple helper to read an inode off the disk from a given root.
618  * This can only be called for subvolume roots and not for the log
619  */
620 static noinline struct inode *read_one_inode(struct btrfs_root *root,
621 					     u64 objectid)
622 {
623 	struct inode *inode;
624 
625 	inode = btrfs_iget(root->fs_info->sb, objectid, root);
626 	if (IS_ERR(inode))
627 		inode = NULL;
628 	return inode;
629 }
630 
631 /* replays a single extent in 'eb' at 'slot' with 'key' into the
632  * subvolume 'root'.  path is released on entry and should be released
633  * on exit.
634  *
635  * extents in the log tree have not been allocated out of the extent
636  * tree yet.  So, this completes the allocation, taking a reference
637  * as required if the extent already exists or creating a new extent
638  * if it isn't in the extent allocation tree yet.
639  *
640  * The extent is inserted into the file, dropping any existing extents
641  * from the file that overlap the new one.
642  */
643 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
644 				      struct btrfs_root *root,
645 				      struct btrfs_path *path,
646 				      struct extent_buffer *eb, int slot,
647 				      struct btrfs_key *key)
648 {
649 	struct btrfs_drop_extents_args drop_args = { 0 };
650 	struct btrfs_fs_info *fs_info = root->fs_info;
651 	int found_type;
652 	u64 extent_end;
653 	u64 start = key->offset;
654 	u64 nbytes = 0;
655 	struct btrfs_file_extent_item *item;
656 	struct inode *inode = NULL;
657 	unsigned long size;
658 	int ret = 0;
659 
660 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
661 	found_type = btrfs_file_extent_type(eb, item);
662 
663 	if (found_type == BTRFS_FILE_EXTENT_REG ||
664 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
665 		nbytes = btrfs_file_extent_num_bytes(eb, item);
666 		extent_end = start + nbytes;
667 
668 		/*
669 		 * We don't add to the inode's nbytes if we are prealloc or a
670 		 * hole.
671 		 */
672 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
673 			nbytes = 0;
674 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
675 		size = btrfs_file_extent_ram_bytes(eb, item);
676 		nbytes = size;
677 		extent_end = ALIGN(start + size,
678 				   fs_info->sectorsize);
679 	} else {
680 		ret = 0;
681 		goto out;
682 	}
683 
684 	inode = read_one_inode(root, key->objectid);
685 	if (!inode) {
686 		ret = -EIO;
687 		goto out;
688 	}
689 
690 	/*
691 	 * first check to see if we already have this extent in the
692 	 * file.  This must be done before btrfs_drop_extents runs
693 	 * so we don't try to drop this extent.
694 	 */
695 	ret = btrfs_lookup_file_extent(trans, root, path,
696 			btrfs_ino(BTRFS_I(inode)), start, 0);
697 
698 	if (ret == 0 &&
699 	    (found_type == BTRFS_FILE_EXTENT_REG ||
700 	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
701 		struct btrfs_file_extent_item cmp1;
702 		struct btrfs_file_extent_item cmp2;
703 		struct btrfs_file_extent_item *existing;
704 		struct extent_buffer *leaf;
705 
706 		leaf = path->nodes[0];
707 		existing = btrfs_item_ptr(leaf, path->slots[0],
708 					  struct btrfs_file_extent_item);
709 
710 		read_extent_buffer(eb, &cmp1, (unsigned long)item,
711 				   sizeof(cmp1));
712 		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
713 				   sizeof(cmp2));
714 
715 		/*
716 		 * we already have a pointer to this exact extent,
717 		 * we don't have to do anything
718 		 */
719 		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
720 			btrfs_release_path(path);
721 			goto out;
722 		}
723 	}
724 	btrfs_release_path(path);
725 
726 	/* drop any overlapping extents */
727 	drop_args.start = start;
728 	drop_args.end = extent_end;
729 	drop_args.drop_cache = true;
730 	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
731 	if (ret)
732 		goto out;
733 
734 	if (found_type == BTRFS_FILE_EXTENT_REG ||
735 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
736 		u64 offset;
737 		unsigned long dest_offset;
738 		struct btrfs_key ins;
739 
740 		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
741 		    btrfs_fs_incompat(fs_info, NO_HOLES))
742 			goto update_inode;
743 
744 		ret = btrfs_insert_empty_item(trans, root, path, key,
745 					      sizeof(*item));
746 		if (ret)
747 			goto out;
748 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
749 						    path->slots[0]);
750 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
751 				(unsigned long)item,  sizeof(*item));
752 
753 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
754 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
755 		ins.type = BTRFS_EXTENT_ITEM_KEY;
756 		offset = key->offset - btrfs_file_extent_offset(eb, item);
757 
758 		/*
759 		 * Manually record the extent as dirty for qgroup: we did a
760 		 * shallow copy of the file extent item and skipped the normal
761 		 * backref update, modifying the extent tree all by ourselves.
762 		 * The owner of the file extent changed from the log tree
763 		 * (which doesn't affect qgroups) to the fs/file tree (which
764 		 * does affect qgroups), so qgroup must be told explicitly.
765 		 */
766 		ret = btrfs_qgroup_trace_extent(trans,
767 				btrfs_file_extent_disk_bytenr(eb, item),
768 				btrfs_file_extent_disk_num_bytes(eb, item),
769 				GFP_NOFS);
770 		if (ret < 0)
771 			goto out;
772 
773 		if (ins.objectid > 0) {
774 			struct btrfs_ref ref = { 0 };
775 			u64 csum_start;
776 			u64 csum_end;
777 			LIST_HEAD(ordered_sums);
778 
779 			/*
780 			 * is this extent already allocated in the extent
781 			 * allocation tree?  If so, just add a reference
782 			 */
783 			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
784 						ins.offset);
785 			if (ret < 0) {
786 				goto out;
787 			} else if (ret == 0) {
788 				btrfs_init_generic_ref(&ref,
789 						BTRFS_ADD_DELAYED_REF,
790 						ins.objectid, ins.offset, 0);
791 				btrfs_init_data_ref(&ref,
792 						root->root_key.objectid,
793 						key->objectid, offset, 0, false);
794 				ret = btrfs_inc_extent_ref(trans, &ref);
795 				if (ret)
796 					goto out;
797 			} else {
798 				/*
799 				 * insert the extent pointer in the extent
800 				 * allocation tree
801 				 */
802 				ret = btrfs_alloc_logged_file_extent(trans,
803 						root->root_key.objectid,
804 						key->objectid, offset, &ins);
805 				if (ret)
806 					goto out;
807 			}
808 			btrfs_release_path(path);
809 
810 			if (btrfs_file_extent_compression(eb, item)) {
811 				csum_start = ins.objectid;
812 				csum_end = csum_start + ins.offset;
813 			} else {
814 				csum_start = ins.objectid +
815 					btrfs_file_extent_offset(eb, item);
816 				csum_end = csum_start +
817 					btrfs_file_extent_num_bytes(eb, item);
818 			}
819 
820 			ret = btrfs_lookup_csums_range(root->log_root,
821 						csum_start, csum_end - 1,
822 						&ordered_sums, 0);
823 			if (ret)
824 				goto out;
825 			/*
826 			 * Now delete all existing csums in the csum root that
827 			 * cover our range. We do this because we can have an
828 			 * extent that is completely referenced by one file
829 			 * extent item and partially referenced by another
830 			 * file extent item (like after using the clone or
831 			 * extent_same ioctls). In this case if we end up doing
832 			 * the replay of the one that partially references the
833 			 * extent first, and we do not do the csum deletion
834 			 * below, we can get 2 csum items in the csum tree that
835 			 * overlap each other. For example, imagine our log has
836 			 * the two following file extent items:
837 			 *
838 			 * key (257 EXTENT_DATA 409600)
839 			 *     extent data disk byte 12845056 nr 102400
840 			 *     extent data offset 20480 nr 20480 ram 102400
841 			 *
842 			 * key (257 EXTENT_DATA 819200)
843 			 *     extent data disk byte 12845056 nr 102400
844 			 *     extent data offset 0 nr 102400 ram 102400
845 			 *
846 			 * Where the second one fully references the 100K extent
847 			 * that starts at disk byte 12845056, and the log tree
848 			 * has a single csum item that covers the entire range
849 			 * of the extent:
850 			 *
851 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
852 			 *
853 			 * After the first file extent item is replayed, the
854 			 * csum tree gets the following csum item:
855 			 *
856 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
857 			 *
858 			 * Which covers the 20K sub-range starting at offset 20K
859 			 * of our extent. Now when we replay the second file
860 			 * extent item, if we do not delete existing csum items
861 			 * that cover any of its blocks, we end up getting two
862 			 * csum items in our csum tree that overlap each other:
863 			 *
864 			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
865 			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
866 			 *
867 			 * Which is a problem, because after this anyone trying
868 			 * to look up the checksum of any block of our
869 			 * extent starting at an offset of 40K or higher, will
870 			 * end up looking at the second csum item only, which
871 			 * does not contain the checksum for any block starting
872 			 * at offset 40K or higher of our extent.
873 			 */
874 			while (!list_empty(&ordered_sums)) {
875 				struct btrfs_ordered_sum *sums;
876 				struct btrfs_root *csum_root;
877 
878 				sums = list_entry(ordered_sums.next,
879 						struct btrfs_ordered_sum,
880 						list);
881 				csum_root = btrfs_csum_root(fs_info,
882 							    sums->bytenr);
883 				if (!ret)
884 					ret = btrfs_del_csums(trans, csum_root,
885 							      sums->bytenr,
886 							      sums->len);
887 				if (!ret)
888 					ret = btrfs_csum_file_blocks(trans,
889 								     csum_root,
890 								     sums);
891 				list_del(&sums->list);
892 				kfree(sums);
893 			}
894 			if (ret)
895 				goto out;
896 		} else {
897 			btrfs_release_path(path);
898 		}
899 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
900 		/* inline extents are easy, we just overwrite them */
901 		ret = overwrite_item(trans, root, path, eb, slot, key);
902 		if (ret)
903 			goto out;
904 	}
905 
906 	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
907 						extent_end - start);
908 	if (ret)
909 		goto out;
910 
911 update_inode:
912 	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
913 	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
914 out:
915 	if (inode)
916 		iput(inode);
917 	return ret;
918 }
919 
920 /*
921  * when cleaning up conflicts between the directory names in the
922  * subvolume, directory names in the log and directory names in the
923  * inode back references, we may have to unlink inodes from directories.
924  *
925  * This is a helper function to do the unlink of a specific directory
926  * item
927  */
928 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
929 				      struct btrfs_path *path,
930 				      struct btrfs_inode *dir,
931 				      struct btrfs_dir_item *di)
932 {
933 	struct btrfs_root *root = dir->root;
934 	struct inode *inode;
935 	char *name;
936 	int name_len;
937 	struct extent_buffer *leaf;
938 	struct btrfs_key location;
939 	int ret;
940 
941 	leaf = path->nodes[0];
942 
943 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
944 	name_len = btrfs_dir_name_len(leaf, di);
945 	name = kmalloc(name_len, GFP_NOFS);
946 	if (!name)
947 		return -ENOMEM;
948 
949 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
950 	btrfs_release_path(path);
951 
952 	inode = read_one_inode(root, location.objectid);
953 	if (!inode) {
954 		ret = -EIO;
955 		goto out;
956 	}
957 
958 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
959 	if (ret)
960 		goto out;
961 
962 	ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
963 			name_len);
964 	if (ret)
965 		goto out;
966 	else
967 		ret = btrfs_run_delayed_items(trans);
968 out:
969 	kfree(name);
970 	iput(inode);
971 	return ret;
972 }
973 
974 /*
975  * See if a given name and sequence number found in an inode back reference are
976  * already in a directory and correctly point to this inode.
977  *
978  * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
979  * exists.
980  */
981 static noinline int inode_in_dir(struct btrfs_root *root,
982 				 struct btrfs_path *path,
983 				 u64 dirid, u64 objectid, u64 index,
984 				 const char *name, int name_len)
985 {
986 	struct btrfs_dir_item *di;
987 	struct btrfs_key location;
988 	int ret = 0;
989 
990 	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
991 					 index, name, name_len, 0);
992 	if (IS_ERR(di)) {
993 		ret = PTR_ERR(di);
994 		goto out;
995 	} else if (di) {
996 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
997 		if (location.objectid != objectid)
998 			goto out;
999 	} else {
1000 		goto out;
1001 	}
1002 
1003 	btrfs_release_path(path);
1004 	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
1005 	if (IS_ERR(di)) {
1006 		ret = PTR_ERR(di);
1007 		goto out;
1008 	} else if (di) {
1009 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1010 		if (location.objectid == objectid)
1011 			ret = 1;
1012 	}
1013 out:
1014 	btrfs_release_path(path);
1015 	return ret;
1016 }
1017 
1018 /*
1019  * helper function to check a log tree for a named back reference in
1020  * an inode.  This is used to decide if a back reference that is
1021  * found in the subvolume conflicts with what we find in the log.
1022  *
1023  * inode backreferences may have multiple refs in a single item;
1024  * during replay we process one reference at a time, and we don't
1025  * want to delete valid links to a file from the subvolume if that
1026  * link is also in the log.
1027  */
1028 static noinline int backref_in_log(struct btrfs_root *log,
1029 				   struct btrfs_key *key,
1030 				   u64 ref_objectid,
1031 				   const char *name, int namelen)
1032 {
1033 	struct btrfs_path *path;
1034 	int ret;
1035 
1036 	path = btrfs_alloc_path();
1037 	if (!path)
1038 		return -ENOMEM;
1039 
1040 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1041 	if (ret < 0) {
1042 		goto out;
1043 	} else if (ret == 1) {
1044 		ret = 0;
1045 		goto out;
1046 	}
1047 
1048 	if (key->type == BTRFS_INODE_EXTREF_KEY)
1049 		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1050 						       path->slots[0],
1051 						       ref_objectid,
1052 						       name, namelen);
1053 	else
1054 		ret = !!btrfs_find_name_in_backref(path->nodes[0],
1055 						   path->slots[0],
1056 						   name, namelen);
1057 out:
1058 	btrfs_free_path(path);
1059 	return ret;
1060 }
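
/*
 * Callers build the search key to mirror how refs are keyed on disk: old
 * style refs use the parent directory's objectid as the offset, extended
 * refs use a hash of the parent and the name. This is exactly what
 * __add_inode_ref() below does:
 *
 *	key.objectid = inode_objectid;
 *	key.type = BTRFS_INODE_REF_KEY;
 *	key.offset = parent_objectid;
 *
 * versus, for extended refs:
 *
 *	key.type = BTRFS_INODE_EXTREF_KEY;
 *	key.offset = btrfs_extref_hash(parent_objectid, name, namelen);
 */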
1061 
1062 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1063 				  struct btrfs_root *root,
1064 				  struct btrfs_path *path,
1065 				  struct btrfs_root *log_root,
1066 				  struct btrfs_inode *dir,
1067 				  struct btrfs_inode *inode,
1068 				  u64 inode_objectid, u64 parent_objectid,
1069 				  u64 ref_index, char *name, int namelen,
1070 				  int *search_done)
1071 {
1072 	int ret;
1073 	char *victim_name;
1074 	int victim_name_len;
1075 	struct extent_buffer *leaf;
1076 	struct btrfs_dir_item *di;
1077 	struct btrfs_key search_key;
1078 	struct btrfs_inode_extref *extref;
1079 
1080 again:
1081 	/* Search old style refs */
1082 	search_key.objectid = inode_objectid;
1083 	search_key.type = BTRFS_INODE_REF_KEY;
1084 	search_key.offset = parent_objectid;
1085 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1086 	if (ret == 0) {
1087 		struct btrfs_inode_ref *victim_ref;
1088 		unsigned long ptr;
1089 		unsigned long ptr_end;
1090 
1091 		leaf = path->nodes[0];
1092 
1093 		/* are we trying to overwrite a back ref for the root directory?
1094 		 * If so, just jump out, we're done
1095 		 */
1096 		if (search_key.objectid == search_key.offset)
1097 			return 1;
1098 
1099 		/* check all the names in this back reference to see
1100 		 * if they are in the log.  If so, we allow them to stay;
1101 		 * otherwise they must be unlinked as a conflict
1102 		 */
1103 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1104 		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
1105 		while (ptr < ptr_end) {
1106 			victim_ref = (struct btrfs_inode_ref *)ptr;
1107 			victim_name_len = btrfs_inode_ref_name_len(leaf,
1108 								   victim_ref);
1109 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
1110 			if (!victim_name)
1111 				return -ENOMEM;
1112 
1113 			read_extent_buffer(leaf, victim_name,
1114 					   (unsigned long)(victim_ref + 1),
1115 					   victim_name_len);
1116 
1117 			ret = backref_in_log(log_root, &search_key,
1118 					     parent_objectid, victim_name,
1119 					     victim_name_len);
1120 			if (ret < 0) {
1121 				kfree(victim_name);
1122 				return ret;
1123 			} else if (!ret) {
1124 				inc_nlink(&inode->vfs_inode);
1125 				btrfs_release_path(path);
1126 
1127 				ret = btrfs_unlink_inode(trans, dir, inode,
1128 						victim_name, victim_name_len);
1129 				kfree(victim_name);
1130 				if (ret)
1131 					return ret;
1132 				ret = btrfs_run_delayed_items(trans);
1133 				if (ret)
1134 					return ret;
1135 				*search_done = 1;
1136 				goto again;
1137 			}
1138 			kfree(victim_name);
1139 
1140 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1141 		}
1142 
1143 		/*
1144 		 * NOTE: we have searched the root tree and checked the
1145 		 * corresponding ref; there is no need to check it again.
1146 		 */
1147 		*search_done = 1;
1148 	}
1149 	btrfs_release_path(path);
1150 
1151 	/* Same search but for extended refs */
1152 	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1153 					   inode_objectid, parent_objectid, 0,
1154 					   0);
1155 	if (!IS_ERR_OR_NULL(extref)) {
1156 		u32 item_size;
1157 		u32 cur_offset = 0;
1158 		unsigned long base;
1159 		struct inode *victim_parent;
1160 
1161 		leaf = path->nodes[0];
1162 
1163 		item_size = btrfs_item_size(leaf, path->slots[0]);
1164 		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1165 
1166 		while (cur_offset < item_size) {
1167 			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1168 
1169 			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1170 
1171 			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1172 				goto next;
1173 
1174 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
1175 			if (!victim_name)
1176 				return -ENOMEM;
1177 			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1178 					   victim_name_len);
1179 
1180 			search_key.objectid = inode_objectid;
1181 			search_key.type = BTRFS_INODE_EXTREF_KEY;
1182 			search_key.offset = btrfs_extref_hash(parent_objectid,
1183 							      victim_name,
1184 							      victim_name_len);
1185 			ret = backref_in_log(log_root, &search_key,
1186 					     parent_objectid, victim_name,
1187 					     victim_name_len);
1188 			if (ret < 0) {
1189 				kfree(victim_name);
1190 				return ret;
1191 			} else if (!ret) {
1192 				ret = -ENOENT;
1193 				victim_parent = read_one_inode(root,
1194 						parent_objectid);
1195 				if (victim_parent) {
1196 					inc_nlink(&inode->vfs_inode);
1197 					btrfs_release_path(path);
1198 
1199 					ret = btrfs_unlink_inode(trans,
1200 							BTRFS_I(victim_parent),
1201 							inode,
1202 							victim_name,
1203 							victim_name_len);
1204 					if (!ret)
1205 						ret = btrfs_run_delayed_items(
1206 								  trans);
1207 				}
1208 				iput(victim_parent);
1209 				kfree(victim_name);
1210 				if (ret)
1211 					return ret;
1212 				*search_done = 1;
1213 				goto again;
1214 			}
1215 			kfree(victim_name);
1216 next:
1217 			cur_offset += victim_name_len + sizeof(*extref);
1218 		}
1219 		*search_done = 1;
1220 	}
1221 	btrfs_release_path(path);
1222 
1223 	/* look for a conflicting sequence number */
1224 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1225 					 ref_index, name, namelen, 0);
1226 	if (IS_ERR(di)) {
1227 		return PTR_ERR(di);
1228 	} else if (di) {
1229 		ret = drop_one_dir_item(trans, path, dir, di);
1230 		if (ret)
1231 			return ret;
1232 	}
1233 	btrfs_release_path(path);
1234 
1235 	/* look for a conflicting name */
1236 	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1237 				   name, namelen, 0);
1238 	if (IS_ERR(di)) {
1239 		return PTR_ERR(di);
1240 	} else if (di) {
1241 		ret = drop_one_dir_item(trans, path, dir, di);
1242 		if (ret)
1243 			return ret;
1244 	}
1245 	btrfs_release_path(path);
1246 
1247 	return 0;
1248 }
1249 
1250 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1251 			     u32 *namelen, char **name, u64 *index,
1252 			     u64 *parent_objectid)
1253 {
1254 	struct btrfs_inode_extref *extref;
1255 
1256 	extref = (struct btrfs_inode_extref *)ref_ptr;
1257 
1258 	*namelen = btrfs_inode_extref_name_len(eb, extref);
1259 	*name = kmalloc(*namelen, GFP_NOFS);
1260 	if (*name == NULL)
1261 		return -ENOMEM;
1262 
1263 	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1264 			   *namelen);
1265 
1266 	if (index)
1267 		*index = btrfs_inode_extref_index(eb, extref);
1268 	if (parent_objectid)
1269 		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1270 
1271 	return 0;
1272 }
1273 
1274 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1275 			  u32 *namelen, char **name, u64 *index)
1276 {
1277 	struct btrfs_inode_ref *ref;
1278 
1279 	ref = (struct btrfs_inode_ref *)ref_ptr;
1280 
1281 	*namelen = btrfs_inode_ref_name_len(eb, ref);
1282 	*name = kmalloc(*namelen, GFP_NOFS);
1283 	if (*name == NULL)
1284 		return -ENOMEM;
1285 
1286 	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1287 
1288 	if (index)
1289 		*index = btrfs_inode_ref_index(eb, ref);
1290 
1291 	return 0;
1292 }
1293 
1294 /*
1295  * Take an inode reference item from the log tree and iterate all names from the
1296  * inode reference item in the subvolume tree with the same key (if it exists).
1297  * For any name that is not in the inode reference item from the log tree, do a
1298  * proper unlink of that name (that is, remove its entry from the inode
1299  * reference item and both dir index keys).
1300  */
1301 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1302 				 struct btrfs_root *root,
1303 				 struct btrfs_path *path,
1304 				 struct btrfs_inode *inode,
1305 				 struct extent_buffer *log_eb,
1306 				 int log_slot,
1307 				 struct btrfs_key *key)
1308 {
1309 	int ret;
1310 	unsigned long ref_ptr;
1311 	unsigned long ref_end;
1312 	struct extent_buffer *eb;
1313 
1314 again:
1315 	btrfs_release_path(path);
1316 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1317 	if (ret > 0) {
1318 		ret = 0;
1319 		goto out;
1320 	}
1321 	if (ret < 0)
1322 		goto out;
1323 
1324 	eb = path->nodes[0];
1325 	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1326 	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
1327 	while (ref_ptr < ref_end) {
1328 		char *name = NULL;
1329 		int namelen;
1330 		u64 parent_id;
1331 
1332 		if (key->type == BTRFS_INODE_EXTREF_KEY) {
1333 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1334 						NULL, &parent_id);
1335 		} else {
1336 			parent_id = key->offset;
1337 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1338 					     NULL);
1339 		}
1340 		if (ret)
1341 			goto out;
1342 
1343 		if (key->type == BTRFS_INODE_EXTREF_KEY)
1344 			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1345 							       parent_id, name,
1346 							       namelen);
1347 		else
1348 			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
1349 							   name, namelen);
1350 
1351 		if (!ret) {
1352 			struct inode *dir;
1353 
1354 			btrfs_release_path(path);
1355 			dir = read_one_inode(root, parent_id);
1356 			if (!dir) {
1357 				ret = -ENOENT;
1358 				kfree(name);
1359 				goto out;
1360 			}
1361 			ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
1362 						 inode, name, namelen);
1363 			kfree(name);
1364 			iput(dir);
1365 			/*
1366 			 * Whenever we need to check if a name exists or not, we
1367 			 * check the subvolume tree. So after an unlink we must
1368 			 * run delayed items, so that future checks for a name
1369 			 * during log replay see that the name does not exist
1370 			 * anymore.
1371 			 */
1372 			if (!ret)
1373 				ret = btrfs_run_delayed_items(trans);
1374 			if (ret)
1375 				goto out;
1376 			goto again;
1377 		}
1378 
1379 		kfree(name);
1380 		ref_ptr += namelen;
1381 		if (key->type == BTRFS_INODE_EXTREF_KEY)
1382 			ref_ptr += sizeof(struct btrfs_inode_extref);
1383 		else
1384 			ref_ptr += sizeof(struct btrfs_inode_ref);
1385 	}
1386 	ret = 0;
1387  out:
1388 	btrfs_release_path(path);
1389 	return ret;
1390 }
1391 
1392 static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
1393 				  const u8 ref_type, const char *name,
1394 				  const int namelen)
1395 {
1396 	struct btrfs_key key;
1397 	struct btrfs_path *path;
1398 	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
1399 	int ret;
1400 
1401 	path = btrfs_alloc_path();
1402 	if (!path)
1403 		return -ENOMEM;
1404 
1405 	key.objectid = btrfs_ino(BTRFS_I(inode));
1406 	key.type = ref_type;
1407 	if (key.type == BTRFS_INODE_REF_KEY)
1408 		key.offset = parent_id;
1409 	else
1410 		key.offset = btrfs_extref_hash(parent_id, name, namelen);
1411 
1412 	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
1413 	if (ret < 0)
1414 		goto out;
1415 	if (ret > 0) {
1416 		ret = 0;
1417 		goto out;
1418 	}
1419 	if (key.type == BTRFS_INODE_EXTREF_KEY)
1420 		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1421 				path->slots[0], parent_id, name, namelen);
1422 	else
1423 		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
1424 						   name, namelen);
1425 
1426 out:
1427 	btrfs_free_path(path);
1428 	return ret;
1429 }
1430 
1431 static int add_link(struct btrfs_trans_handle *trans,
1432 		    struct inode *dir, struct inode *inode, const char *name,
1433 		    int namelen, u64 ref_index)
1434 {
1435 	struct btrfs_root *root = BTRFS_I(dir)->root;
1436 	struct btrfs_dir_item *dir_item;
1437 	struct btrfs_key key;
1438 	struct btrfs_path *path;
1439 	struct inode *other_inode = NULL;
1440 	int ret;
1441 
1442 	path = btrfs_alloc_path();
1443 	if (!path)
1444 		return -ENOMEM;
1445 
1446 	dir_item = btrfs_lookup_dir_item(NULL, root, path,
1447 					 btrfs_ino(BTRFS_I(dir)),
1448 					 name, namelen, 0);
1449 	if (!dir_item) {
1450 		btrfs_release_path(path);
1451 		goto add_link;
1452 	} else if (IS_ERR(dir_item)) {
1453 		ret = PTR_ERR(dir_item);
1454 		goto out;
1455 	}
1456 
1457 	/*
1458 	 * Our inode's dentry collides with the dentry of another inode which is
1459 	 * in the log but not yet processed since it has a higher inode number.
1460 	 * So delete that other dentry.
1461 	 */
1462 	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
1463 	btrfs_release_path(path);
1464 	other_inode = read_one_inode(root, key.objectid);
1465 	if (!other_inode) {
1466 		ret = -ENOENT;
1467 		goto out;
1468 	}
1469 	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode),
1470 				 name, namelen);
1471 	if (ret)
1472 		goto out;
1473 	/*
1474 	 * If we dropped the link count to 0, bump it so that later the iput()
1475 	 * on the inode will not free it. We will fixup the link count later.
1476 	 */
1477 	if (other_inode->i_nlink == 0)
1478 		inc_nlink(other_inode);
1479 
1480 	ret = btrfs_run_delayed_items(trans);
1481 	if (ret)
1482 		goto out;
1483 add_link:
1484 	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1485 			     name, namelen, 0, ref_index);
1486 out:
1487 	iput(other_inode);
1488 	btrfs_free_path(path);
1489 
1490 	return ret;
1491 }
1492 
1493 /*
1494  * replay one inode back reference item found in the log tree.
1495  * eb, slot and key refer to the buffer and key found in the log tree.
1496  * root is the destination we are replaying into, and path is for temp
1497  * use by this function.  (it should be released on return).
1498  */
1499 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1500 				  struct btrfs_root *root,
1501 				  struct btrfs_root *log,
1502 				  struct btrfs_path *path,
1503 				  struct extent_buffer *eb, int slot,
1504 				  struct btrfs_key *key)
1505 {
1506 	struct inode *dir = NULL;
1507 	struct inode *inode = NULL;
1508 	unsigned long ref_ptr;
1509 	unsigned long ref_end;
1510 	char *name = NULL;
1511 	int namelen;
1512 	int ret;
1513 	int search_done = 0;
1514 	int log_ref_ver = 0;
1515 	u64 parent_objectid;
1516 	u64 inode_objectid;
1517 	u64 ref_index = 0;
1518 	int ref_struct_size;
1519 
1520 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1521 	ref_end = ref_ptr + btrfs_item_size(eb, slot);
1522 
1523 	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1524 		struct btrfs_inode_extref *r;
1525 
1526 		ref_struct_size = sizeof(struct btrfs_inode_extref);
1527 		log_ref_ver = 1;
1528 		r = (struct btrfs_inode_extref *)ref_ptr;
1529 		parent_objectid = btrfs_inode_extref_parent(eb, r);
1530 	} else {
1531 		ref_struct_size = sizeof(struct btrfs_inode_ref);
1532 		parent_objectid = key->offset;
1533 	}
1534 	inode_objectid = key->objectid;
1535 
1536 	/*
1537 	 * it is possible that we didn't log all the parent directories
1538 	 * for a given inode.  If we don't find the dir, just don't
1539 	 * copy the back ref in.  The link count fixup code will take
1540 	 * care of the rest
1541 	 */
1542 	dir = read_one_inode(root, parent_objectid);
1543 	if (!dir) {
1544 		ret = -ENOENT;
1545 		goto out;
1546 	}
1547 
1548 	inode = read_one_inode(root, inode_objectid);
1549 	if (!inode) {
1550 		ret = -EIO;
1551 		goto out;
1552 	}
1553 
1554 	while (ref_ptr < ref_end) {
1555 		if (log_ref_ver) {
1556 			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1557 						&ref_index, &parent_objectid);
1558 			/*
1559 			 * parent object can change from one array
1560 			 * item to another.
1561 			 */
1562 			if (!dir)
1563 				dir = read_one_inode(root, parent_objectid);
1564 			if (!dir) {
1565 				ret = -ENOENT;
1566 				goto out;
1567 			}
1568 		} else {
1569 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1570 					     &ref_index);
1571 		}
1572 		if (ret)
1573 			goto out;
1574 
1575 		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1576 				   btrfs_ino(BTRFS_I(inode)), ref_index,
1577 				   name, namelen);
1578 		if (ret < 0) {
1579 			goto out;
1580 		} else if (ret == 0) {
1581 			/*
1582 			 * look for a conflicting back reference in the
1583 			 * metadata. If we find one we have to unlink that name
1584 			 * of the file before we add our new link.  Later on, we
1585 			 * overwrite any existing back reference, and we don't
1586 			 * want to create dangling pointers in the directory.
1587 			 */
1588 
1589 			if (!search_done) {
1590 				ret = __add_inode_ref(trans, root, path, log,
1591 						      BTRFS_I(dir),
1592 						      BTRFS_I(inode),
1593 						      inode_objectid,
1594 						      parent_objectid,
1595 						      ref_index, name, namelen,
1596 						      &search_done);
1597 				if (ret) {
1598 					if (ret == 1)
1599 						ret = 0;
1600 					goto out;
1601 				}
1602 			}
1603 
1604 			/*
1605 			 * If a reference item already exists for this inode
1606 			 * with the same parent and name, but different index,
1607 			 * drop it and the corresponding directory index entries
1608 			 * from the parent before adding the new reference item
1609 			 * and dir index entries, otherwise we would fail with
1610 			 * -EEXIST returned from btrfs_add_link() below.
1611 			 */
1612 			ret = btrfs_inode_ref_exists(inode, dir, key->type,
1613 						     name, namelen);
1614 			if (ret > 0) {
1615 				ret = btrfs_unlink_inode(trans,
1616 							 BTRFS_I(dir),
1617 							 BTRFS_I(inode),
1618 							 name, namelen);
1619 				/*
1620 				 * If we dropped the link count to 0, bump it so
1621 				 * that later the iput() on the inode will not
1622 				 * free it. We will fixup the link count later.
1623 				 */
1624 				if (!ret && inode->i_nlink == 0)
1625 					inc_nlink(inode);
1626 				/*
1627 				 * Whenever we need to check if a name exists or
1628 				 * not, we check the subvolume tree. So after an
1629 				 * unlink we must run delayed items, so that future
1630 				 * checks for a name during log replay see that the
1631 				 * name does not exist anymore.
1632 				 */
1633 				if (!ret)
1634 					ret = btrfs_run_delayed_items(trans);
1635 			}
1636 			if (ret < 0)
1637 				goto out;
1638 
1639 			/* insert our name */
1640 			ret = add_link(trans, dir, inode, name, namelen,
1641 				       ref_index);
1642 			if (ret)
1643 				goto out;
1644 
1645 			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1646 			if (ret)
1647 				goto out;
1648 		}
1649 		/* Else, ret == 1, we already have a perfect match, we're done. */
1650 
1651 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1652 		kfree(name);
1653 		name = NULL;
1654 		if (log_ref_ver) {
1655 			iput(dir);
1656 			dir = NULL;
1657 		}
1658 	}
1659 
1660 	/*
1661 	 * Before we overwrite the inode reference item in the subvolume tree
1662 	 * with the item from the log tree, we must unlink all names from the
1663 	 * parent directory that are in the subvolume's tree inode reference
1664 	 * item, otherwise we end up with an inconsistent subvolume tree where
1665 	 * dir index entries exist for a name but there is no inode reference
1666 	 * item with the same name.
1667 	 */
1668 	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1669 				    key);
1670 	if (ret)
1671 		goto out;
1672 
1673 	/* finally write the back reference in the inode */
1674 	ret = overwrite_item(trans, root, path, eb, slot, key);
1675 out:
1676 	btrfs_release_path(path);
1677 	kfree(name);
1678 	iput(dir);
1679 	iput(inode);
1680 	return ret;
1681 }
1682 
1683 static int count_inode_extrefs(struct btrfs_root *root,
1684 		struct btrfs_inode *inode, struct btrfs_path *path)
1685 {
1686 	int ret = 0;
1687 	int name_len;
1688 	unsigned int nlink = 0;
1689 	u32 item_size;
1690 	u32 cur_offset = 0;
1691 	u64 inode_objectid = btrfs_ino(inode);
1692 	u64 offset = 0;
1693 	unsigned long ptr;
1694 	struct btrfs_inode_extref *extref;
1695 	struct extent_buffer *leaf;
1696 
1697 	while (1) {
1698 		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1699 					    &extref, &offset);
1700 		if (ret)
1701 			break;
1702 
1703 		leaf = path->nodes[0];
1704 		item_size = btrfs_item_size(leaf, path->slots[0]);
1705 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1706 		cur_offset = 0;
1707 
1708 		while (cur_offset < item_size) {
1709 			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1710 			name_len = btrfs_inode_extref_name_len(leaf, extref);
1711 
1712 			nlink++;
1713 
1714 			cur_offset += name_len + sizeof(*extref);
1715 		}
1716 
1717 		offset++;
1718 		btrfs_release_path(path);
1719 	}
1720 	btrfs_release_path(path);
1721 
1722 	if (ret < 0 && ret != -ENOENT)
1723 		return ret;
1724 	return nlink;
1725 }
1726 
1727 static int count_inode_refs(struct btrfs_root *root,
1728 			struct btrfs_inode *inode, struct btrfs_path *path)
1729 {
1730 	int ret;
1731 	struct btrfs_key key;
1732 	unsigned int nlink = 0;
1733 	unsigned long ptr;
1734 	unsigned long ptr_end;
1735 	int name_len;
1736 	u64 ino = btrfs_ino(inode);
1737 
1738 	key.objectid = ino;
1739 	key.type = BTRFS_INODE_REF_KEY;
1740 	key.offset = (u64)-1;
1741 
1742 	while (1) {
1743 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1744 		if (ret < 0)
1745 			break;
1746 		if (ret > 0) {
1747 			if (path->slots[0] == 0)
1748 				break;
1749 			path->slots[0]--;
1750 		}
1751 process_slot:
1752 		btrfs_item_key_to_cpu(path->nodes[0], &key,
1753 				      path->slots[0]);
1754 		if (key.objectid != ino ||
1755 		    key.type != BTRFS_INODE_REF_KEY)
1756 			break;
1757 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1758 		ptr_end = ptr + btrfs_item_size(path->nodes[0],
1759 						   path->slots[0]);
1760 		while (ptr < ptr_end) {
1761 			struct btrfs_inode_ref *ref;
1762 
1763 			ref = (struct btrfs_inode_ref *)ptr;
1764 			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1765 							    ref);
1766 			ptr = (unsigned long)(ref + 1) + name_len;
1767 			nlink++;
1768 		}
1769 
1770 		if (key.offset == 0)
1771 			break;
1772 		if (path->slots[0] > 0) {
1773 			path->slots[0]--;
1774 			goto process_slot;
1775 		}
1776 		key.offset--;
1777 		btrfs_release_path(path);
1778 	}
1779 	btrfs_release_path(path);
1780 
1781 	return nlink;
1782 }
1783 
1784 /*
1785  * There are a few corners where the link count of the file can't
1786  * be properly maintained during replay.  So, instead of adding
1787  * lots of complexity to the log code, we just scan the backrefs
1788  * for any file that has been through replay.
1789  *
1790  * The scan will update the link count on the inode to reflect the
1791  * number of back refs found.  If it goes down to zero, the iput
1792  * will free the inode.
1793  */
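/*
 * For example (inode numbers and counts are illustrative only): if replay
 * recreated two names for an inode whose logged inode item still has
 * i_nlink == 1, the scan finds two backrefs and bumps the link count to 2;
 * if no backrefs remain, the count drops to zero and the inode is recorded
 * as an orphan so that it gets deleted.
 */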
1794 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1795 					   struct btrfs_root *root,
1796 					   struct inode *inode)
1797 {
1798 	struct btrfs_path *path;
1799 	int ret;
1800 	u64 nlink = 0;
1801 	u64 ino = btrfs_ino(BTRFS_I(inode));
1802 
1803 	path = btrfs_alloc_path();
1804 	if (!path)
1805 		return -ENOMEM;
1806 
1807 	ret = count_inode_refs(root, BTRFS_I(inode), path);
1808 	if (ret < 0)
1809 		goto out;
1810 
1811 	nlink = ret;
1812 
1813 	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1814 	if (ret < 0)
1815 		goto out;
1816 
1817 	nlink += ret;
1818 
1819 	ret = 0;
1820 
1821 	if (nlink != inode->i_nlink) {
1822 		set_nlink(inode, nlink);
1823 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1824 		if (ret)
1825 			goto out;
1826 	}
1827 	BTRFS_I(inode)->index_cnt = (u64)-1;
1828 
1829 	if (inode->i_nlink == 0) {
1830 		if (S_ISDIR(inode->i_mode)) {
1831 			ret = replay_dir_deletes(trans, root, NULL, path,
1832 						 ino, 1);
1833 			if (ret)
1834 				goto out;
1835 		}
1836 		ret = btrfs_insert_orphan_item(trans, root, ino);
1837 		if (ret == -EEXIST)
1838 			ret = 0;
1839 	}
1840 
1841 out:
1842 	btrfs_free_path(path);
1843 	return ret;
1844 }
1845 
1846 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1847 					    struct btrfs_root *root,
1848 					    struct btrfs_path *path)
1849 {
1850 	int ret;
1851 	struct btrfs_key key;
1852 	struct inode *inode;
1853 
1854 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1855 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1856 	key.offset = (u64)-1;
1857 	while (1) {
1858 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1859 		if (ret < 0)
1860 			break;
1861 
1862 		if (ret == 1) {
1863 			ret = 0;
1864 			if (path->slots[0] == 0)
1865 				break;
1866 			path->slots[0]--;
1867 		}
1868 
1869 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1870 		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1871 		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1872 			break;
1873 
1874 		ret = btrfs_del_item(trans, root, path);
1875 		if (ret)
1876 			break;
1877 
1878 		btrfs_release_path(path);
1879 		inode = read_one_inode(root, key.offset);
1880 		if (!inode) {
1881 			ret = -EIO;
1882 			break;
1883 		}
1884 
1885 		ret = fixup_inode_link_count(trans, root, inode);
1886 		iput(inode);
1887 		if (ret)
1888 			break;
1889 
1890 		/*
1891 		 * fixup on a directory may create new entries,
1892 		 * make sure we always look for the highest possible
1893 		 * offset
1894 		 */
1895 		key.offset = (u64)-1;
1896 	}
1897 	btrfs_release_path(path);
1898 	return ret;
1899 }
1900 
1901 
1902 /*
1903  * record a given inode in the fixup dir so we can check its link
1904  * count when replay is done.  The link count is incremented here
1905  * so the inode won't go away until we check it
1906  */
1907 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1908 				      struct btrfs_root *root,
1909 				      struct btrfs_path *path,
1910 				      u64 objectid)
1911 {
1912 	struct btrfs_key key;
1913 	int ret = 0;
1914 	struct inode *inode;
1915 
1916 	inode = read_one_inode(root, objectid);
1917 	if (!inode)
1918 		return -EIO;
1919 
1920 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1921 	key.type = BTRFS_ORPHAN_ITEM_KEY;
1922 	key.offset = objectid;
1923 
1924 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1925 
1926 	btrfs_release_path(path);
1927 	if (ret == 0) {
1928 		if (!inode->i_nlink)
1929 			set_nlink(inode, 1);
1930 		else
1931 			inc_nlink(inode);
1932 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1933 	} else if (ret == -EEXIST) {
1934 		ret = 0;
1935 	}
1936 	iput(inode);
1937 
1938 	return ret;
1939 }
1940 
1941 /*
1942  * when replaying the log for a directory, we only insert names
1943  * for inodes that actually exist.  This means an fsync on a directory
1944  * does not implicitly fsync all the new files in it
1945  */
1946 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1947 				    struct btrfs_root *root,
1948 				    u64 dirid, u64 index,
1949 				    char *name, int name_len,
1950 				    struct btrfs_key *location)
1951 {
1952 	struct inode *inode;
1953 	struct inode *dir;
1954 	int ret;
1955 
1956 	inode = read_one_inode(root, location->objectid);
1957 	if (!inode)
1958 		return -ENOENT;
1959 
1960 	dir = read_one_inode(root, dirid);
1961 	if (!dir) {
1962 		iput(inode);
1963 		return -EIO;
1964 	}
1965 
1966 	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1967 			name_len, 1, index);
1968 
1969 	/* FIXME, put inode into FIXUP list */
1970 
1971 	iput(inode);
1972 	iput(dir);
1973 	return ret;
1974 }
1975 
1976 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
1977 					struct btrfs_inode *dir,
1978 					struct btrfs_path *path,
1979 					struct btrfs_dir_item *dst_di,
1980 					const struct btrfs_key *log_key,
1981 					u8 log_type,
1982 					bool exists)
1983 {
1984 	struct btrfs_key found_key;
1985 
1986 	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1987 	/* The existing dentry points to the same inode, don't delete it. */
1988 	if (found_key.objectid == log_key->objectid &&
1989 	    found_key.type == log_key->type &&
1990 	    found_key.offset == log_key->offset &&
1991 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type)
1992 		return 1;
1993 
1994 	/*
1995 	 * Don't drop the conflicting directory entry if the inode for the new
1996 	 * entry doesn't exist.
1997 	 */
1998 	if (!exists)
1999 		return 0;
2000 
2001 	return drop_one_dir_item(trans, path, dir, dst_di);
2002 }
2003 
2004 /*
2005  * take a single entry in a log directory item and replay it into
2006  * the subvolume.
2007  *
2008  * if a conflicting item exists in the subdirectory already,
2009  * the inode it points to is unlinked and put into the link count
2010  * fix up tree.
2011  *
2012  * If a name from the log points to a file or directory that does
2013  * not exist in the FS, it is skipped.  Fsyncs on directories
2014  * do not force down the inodes inside that directory, just the
2015  * changes to names or unlinks done in that directory.
2016  *
2017  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
2018  * non-existing inode) and 1 if the name was replayed.
2019  */
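/*
 * A sketch with made up inode numbers: if the subvolume already has a
 * dentry "foo" pointing to inode 258 but the log says "foo" must point to
 * inode 260, the old dentry is dropped (sending inode 258 through the link
 * count fixup tree) before the logged name is inserted.
 */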
2020 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
2021 				    struct btrfs_root *root,
2022 				    struct btrfs_path *path,
2023 				    struct extent_buffer *eb,
2024 				    struct btrfs_dir_item *di,
2025 				    struct btrfs_key *key)
2026 {
2027 	char *name;
2028 	int name_len;
2029 	struct btrfs_dir_item *dir_dst_di;
2030 	struct btrfs_dir_item *index_dst_di;
2031 	bool dir_dst_matches = false;
2032 	bool index_dst_matches = false;
2033 	struct btrfs_key log_key;
2034 	struct btrfs_key search_key;
2035 	struct inode *dir;
2036 	u8 log_type;
2037 	bool exists;
2038 	int ret;
2039 	bool update_size = true;
2040 	bool name_added = false;
2041 
2042 	dir = read_one_inode(root, key->objectid);
2043 	if (!dir)
2044 		return -EIO;
2045 
2046 	name_len = btrfs_dir_name_len(eb, di);
2047 	name = kmalloc(name_len, GFP_NOFS);
2048 	if (!name) {
2049 		ret = -ENOMEM;
2050 		goto out;
2051 	}
2052 
2053 	log_type = btrfs_dir_type(eb, di);
2054 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
2055 		   name_len);
2056 
2057 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
2058 	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
2059 	btrfs_release_path(path);
2060 	if (ret < 0)
2061 		goto out;
2062 	exists = (ret == 0);
2063 	ret = 0;
2064 
2065 	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
2066 					   name, name_len, 1);
2067 	if (IS_ERR(dir_dst_di)) {
2068 		ret = PTR_ERR(dir_dst_di);
2069 		goto out;
2070 	} else if (dir_dst_di) {
2071 		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
2072 						   dir_dst_di, &log_key, log_type,
2073 						   exists);
2074 		if (ret < 0)
2075 			goto out;
2076 		dir_dst_matches = (ret == 1);
2077 	}
2078 
2079 	btrfs_release_path(path);
2080 
2081 	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
2082 						   key->objectid, key->offset,
2083 						   name, name_len, 1);
2084 	if (IS_ERR(index_dst_di)) {
2085 		ret = PTR_ERR(index_dst_di);
2086 		goto out;
2087 	} else if (index_dst_di) {
2088 		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
2089 						   index_dst_di, &log_key,
2090 						   log_type, exists);
2091 		if (ret < 0)
2092 			goto out;
2093 		index_dst_matches = (ret == 1);
2094 	}
2095 
2096 	btrfs_release_path(path);
2097 
2098 	if (dir_dst_matches && index_dst_matches) {
2099 		ret = 0;
2100 		update_size = false;
2101 		goto out;
2102 	}
2103 
2104 	/*
2105 	 * Check if the inode reference exists in the log for the given name,
2106 	 * inode and parent inode
2107 	 */
2108 	search_key.objectid = log_key.objectid;
2109 	search_key.type = BTRFS_INODE_REF_KEY;
2110 	search_key.offset = key->objectid;
2111 	ret = backref_in_log(root->log_root, &search_key, 0, name, name_len);
2112 	if (ret < 0) {
2113 	        goto out;
2114 		goto out;
2115 	} else if (ret) {
2116 		/* The dentry will be added later. */
2117 		ret = 0;
2118 		update_size = false;
2119 		goto out;
2120 
2121 	search_key.objectid = log_key.objectid;
2122 	search_key.type = BTRFS_INODE_EXTREF_KEY;
2123 	search_key.offset = key->objectid;
2124 	ret = backref_in_log(root->log_root, &search_key, key->objectid, name,
2125 			     name_len);
2126 	if (ret < 0) {
2127 		goto out;
2128 	} else if (ret) {
2129 		/* The dentry will be added later. */
2130 		ret = 0;
2131 		update_size = false;
2132 		goto out;
2133 	}
2134 	btrfs_release_path(path);
2135 	ret = insert_one_name(trans, root, key->objectid, key->offset,
2136 			      name, name_len, &log_key);
2137 	if (ret && ret != -ENOENT && ret != -EEXIST)
2138 		goto out;
2139 	if (!ret)
2140 		name_added = true;
2141 	update_size = false;
2142 	ret = 0;
2143 
2144 out:
2145 	if (!ret && update_size) {
2146 		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
2147 		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
2148 	}
2149 	kfree(name);
2150 	iput(dir);
2151 	if (!ret && name_added)
2152 		ret = 1;
2153 	return ret;
2154 }
2155 
2156 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
2157 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2158 					struct btrfs_root *root,
2159 					struct btrfs_path *path,
2160 					struct extent_buffer *eb, int slot,
2161 					struct btrfs_key *key)
2162 {
2163 	int ret;
2164 	struct btrfs_dir_item *di;
2165 
2166 	/* We only log dir index keys, which only contain a single dir item. */
2167 	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
2168 
2169 	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2170 	ret = replay_one_name(trans, root, path, eb, di, key);
2171 	if (ret < 0)
2172 		return ret;
2173 
2174 	/*
2175 	 * If this entry refers to a non-directory (directories cannot have a
2176 	 * link count > 1) and it was added in a transaction that was not
2177 	 * committed, make sure we fixup the link count of the inode the entry
2178 	 * points to. Otherwise something like the following would result in a
2179 	 * directory pointing to an inode whose link count does not account
2180 	 * for this dir entry:
2181 	 *
2182 	 * mkdir testdir
2183 	 * touch testdir/foo
2184 	 * touch testdir/bar
2185 	 * sync
2186 	 *
2187 	 * ln testdir/bar testdir/bar_link
2188 	 * ln testdir/foo testdir/foo_link
2189 	 * xfs_io -c "fsync" testdir/bar
2190 	 *
2191 	 * <power failure>
2192 	 *
2193 	 * mount fs, log replay happens
2194 	 *
2195 	 * File foo would remain with a link count of 1 when it has two entries
2196 	 * pointing to it in the directory testdir. This would make it impossible
2197 	 * to ever delete the parent directory as it would result in stale
2198 	 * dentries that can never be deleted.
2199 	 */
2200 	if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2201 		struct btrfs_path *fixup_path;
2202 		struct btrfs_key di_key;
2203 
2204 		fixup_path = btrfs_alloc_path();
2205 		if (!fixup_path)
2206 			return -ENOMEM;
2207 
2208 		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2209 		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2210 		btrfs_free_path(fixup_path);
2211 	}
2212 
2213 	return ret;
2214 }
2215 
2216 /*
2217  * directory replay has two parts.  There are the standard directory
2218  * items in the log copied from the subvolume, and range items
2219  * created in the log while the subvolume was logged.
2220  *
2221  * The range items tell us which parts of the key space the log
2222  * is authoritative for.  During replay, if a key in the subvolume
2223  * directory is in a logged range item, but not actually in the log
2224  * that means it was deleted from the directory before the fsync
2225  * and should be removed.
2226  */
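/*
 * For example (illustrative values): a range item covering indexes
 * [2, 10] of a directory means the log knows about every entry with an
 * index in that range, so an entry at index 7 found in the subvolume but
 * not in the log must have been deleted before the fsync.
 */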
2227 static noinline int find_dir_range(struct btrfs_root *root,
2228 				   struct btrfs_path *path,
2229 				   u64 dirid,
2230 				   u64 *start_ret, u64 *end_ret)
2231 {
2232 	struct btrfs_key key;
2233 	u64 found_end;
2234 	struct btrfs_dir_log_item *item;
2235 	int ret;
2236 	int nritems;
2237 
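	/* We have exhausted the index key space, we're done. */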
2238 	if (*start_ret == (u64)-1)
2239 		return 1;
2240 
2241 	key.objectid = dirid;
2242 	key.type = BTRFS_DIR_LOG_INDEX_KEY;
2243 	key.offset = *start_ret;
2244 
2245 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2246 	if (ret < 0)
2247 		goto out;
2248 	if (ret > 0) {
2249 		if (path->slots[0] == 0)
2250 			goto out;
2251 		path->slots[0]--;
2252 	}
2253 	if (ret != 0)
2254 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2255 
2256 	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2257 		ret = 1;
2258 		goto next;
2259 	}
2260 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2261 			      struct btrfs_dir_log_item);
2262 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2263 
2264 	if (*start_ret >= key.offset && *start_ret <= found_end) {
2265 		ret = 0;
2266 		*start_ret = key.offset;
2267 		*end_ret = found_end;
2268 		goto out;
2269 	}
2270 	ret = 1;
2271 next:
2272 	/* check the next slot in the tree to see if it is a valid item */
2273 	nritems = btrfs_header_nritems(path->nodes[0]);
2274 	path->slots[0]++;
2275 	if (path->slots[0] >= nritems) {
2276 		ret = btrfs_next_leaf(root, path);
2277 		if (ret)
2278 			goto out;
2279 	}
2280 
2281 	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2282 
2283 	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2284 		ret = 1;
2285 		goto out;
2286 	}
2287 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2288 			      struct btrfs_dir_log_item);
2289 	found_end = btrfs_dir_log_end(path->nodes[0], item);
2290 	*start_ret = key.offset;
2291 	*end_ret = found_end;
2292 	ret = 0;
2293 out:
2294 	btrfs_release_path(path);
2295 	return ret;
2296 }
2297 
2298 /*
2299  * this looks for a given directory item in the log.  If the directory
2300  * item is not in the log, the item is removed and the inode it points
2301  * to is unlinked
2302  */
2303 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2304 				      struct btrfs_root *log,
2305 				      struct btrfs_path *path,
2306 				      struct btrfs_path *log_path,
2307 				      struct inode *dir,
2308 				      struct btrfs_key *dir_key)
2309 {
2310 	struct btrfs_root *root = BTRFS_I(dir)->root;
2311 	int ret;
2312 	struct extent_buffer *eb;
2313 	int slot;
2314 	struct btrfs_dir_item *di;
2315 	int name_len;
2316 	char *name;
2317 	struct inode *inode = NULL;
2318 	struct btrfs_key location;
2319 
2320 	/*
2321 	 * Currently we only log dir index keys. Even if we replay a log created
2322 	 * by an older kernel that logged both dir index and dir item keys, all
2323 	 * we need to do is process the dir index keys, we (and our caller) can
2324 	 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2325 	 */
2326 	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2327 
2328 	eb = path->nodes[0];
2329 	slot = path->slots[0];
2330 	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2331 	name_len = btrfs_dir_name_len(eb, di);
2332 	name = kmalloc(name_len, GFP_NOFS);
2333 	if (!name) {
2334 		ret = -ENOMEM;
2335 		goto out;
2336 	}
2337 
2338 	read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len);
2339 
2340 	if (log) {
2341 		struct btrfs_dir_item *log_di;
2342 
2343 		log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2344 						     dir_key->objectid,
2345 						     dir_key->offset,
2346 						     name, name_len, 0);
2347 		if (IS_ERR(log_di)) {
2348 			ret = PTR_ERR(log_di);
2349 			goto out;
2350 		} else if (log_di) {
2351 			/* The dentry exists in the log, we have nothing to do. */
2352 			ret = 0;
2353 			goto out;
2354 		}
2355 	}
2356 
2357 	btrfs_dir_item_key_to_cpu(eb, di, &location);
2358 	btrfs_release_path(path);
2359 	btrfs_release_path(log_path);
2360 	inode = read_one_inode(root, location.objectid);
2361 	if (!inode) {
2362 		ret = -EIO;
2363 		goto out;
2364 	}
2365 
2366 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
2367 	if (ret)
2368 		goto out;
2369 
2370 	inc_nlink(inode);
2371 	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name,
2372 				 name_len);
2373 	if (ret)
2374 		goto out;
2375 
2376 	ret = btrfs_run_delayed_items(trans);
2377 	if (ret)
2378 		goto out;
2379 
2380 	/*
2381 	 * Unlike dir item keys, dir index keys can only have one name (entry) in
2382 	 * them, as there are no key collisions since each key has a unique offset
2383 	 * (an index number), so we're done.
2384 	 */
2385 out:
2386 	btrfs_release_path(path);
2387 	btrfs_release_path(log_path);
2388 	kfree(name);
2389 	iput(inode);
2390 	return ret;
2391 }
2392 
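/*
 * The log is authoritative for the full set of xattrs of a logged inode,
 * so scan the subvolume's xattr items of @ino and delete every name that
 * does not also exist in the log tree.
 */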
2393 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2394 			      struct btrfs_root *root,
2395 			      struct btrfs_root *log,
2396 			      struct btrfs_path *path,
2397 			      const u64 ino)
2398 {
2399 	struct btrfs_key search_key;
2400 	struct btrfs_path *log_path;
2401 	int i;
2402 	int nritems;
2403 	int ret;
2404 
2405 	log_path = btrfs_alloc_path();
2406 	if (!log_path)
2407 		return -ENOMEM;
2408 
2409 	search_key.objectid = ino;
2410 	search_key.type = BTRFS_XATTR_ITEM_KEY;
2411 	search_key.offset = 0;
2412 again:
2413 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2414 	if (ret < 0)
2415 		goto out;
2416 process_leaf:
2417 	nritems = btrfs_header_nritems(path->nodes[0]);
2418 	for (i = path->slots[0]; i < nritems; i++) {
2419 		struct btrfs_key key;
2420 		struct btrfs_dir_item *di;
2421 		struct btrfs_dir_item *log_di;
2422 		u32 total_size;
2423 		u32 cur;
2424 
2425 		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2426 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2427 			ret = 0;
2428 			goto out;
2429 		}
2430 
2431 		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2432 		total_size = btrfs_item_size(path->nodes[0], i);
2433 		cur = 0;
2434 		while (cur < total_size) {
2435 			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2436 			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2437 			u32 this_len = sizeof(*di) + name_len + data_len;
2438 			char *name;
2439 
2440 			name = kmalloc(name_len, GFP_NOFS);
2441 			if (!name) {
2442 				ret = -ENOMEM;
2443 				goto out;
2444 			}
2445 			read_extent_buffer(path->nodes[0], name,
2446 					   (unsigned long)(di + 1), name_len);
2447 
2448 			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2449 						    name, name_len, 0);
2450 			btrfs_release_path(log_path);
2451 			if (!log_di) {
2452 				/* Doesn't exist in log tree, so delete it. */
2453 				btrfs_release_path(path);
2454 				di = btrfs_lookup_xattr(trans, root, path, ino,
2455 							name, name_len, -1);
2456 				kfree(name);
2457 				if (IS_ERR(di)) {
2458 					ret = PTR_ERR(di);
2459 					goto out;
2460 				}
2461 				ASSERT(di);
2462 				ret = btrfs_delete_one_dir_name(trans, root,
2463 								path, di);
2464 				if (ret)
2465 					goto out;
2466 				btrfs_release_path(path);
2467 				search_key = key;
2468 				goto again;
2469 			}
2470 			kfree(name);
2471 			if (IS_ERR(log_di)) {
2472 				ret = PTR_ERR(log_di);
2473 				goto out;
2474 			}
2475 			cur += this_len;
2476 			di = (struct btrfs_dir_item *)((char *)di + this_len);
2477 		}
2478 	}
2479 	ret = btrfs_next_leaf(root, path);
2480 	if (ret > 0)
2481 		ret = 0;
2482 	else if (ret == 0)
2483 		goto process_leaf;
2484 out:
2485 	btrfs_free_path(log_path);
2486 	btrfs_release_path(path);
2487 	return ret;
2488 }
2489 
2490 
2491 /*
2492  * deletion replay happens before we copy any new directory items
2493  * out of the log or out of backreferences from inodes.  It
2494  * scans the log to find ranges of keys that log is authoritative for,
2495  * and then scans the directory to find items in those ranges that are
2496  * not present in the log.
2497  *
2498  * Anything we don't find in the log is unlinked and removed from the
2499  * directory.
2500  */
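/*
 * When @del_all is set (used when fixing up a directory left with a link
 * count of zero), no log ranges are consulted and all remaining entries
 * of the directory are removed - this is how an "rm -rf" of a logged
 * directory gets replayed.
 */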
2501 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2502 				       struct btrfs_root *root,
2503 				       struct btrfs_root *log,
2504 				       struct btrfs_path *path,
2505 				       u64 dirid, int del_all)
2506 {
2507 	u64 range_start;
2508 	u64 range_end;
2509 	int ret = 0;
2510 	struct btrfs_key dir_key;
2511 	struct btrfs_key found_key;
2512 	struct btrfs_path *log_path;
2513 	struct inode *dir;
2514 
2515 	dir_key.objectid = dirid;
2516 	dir_key.type = BTRFS_DIR_INDEX_KEY;
2517 	log_path = btrfs_alloc_path();
2518 	if (!log_path)
2519 		return -ENOMEM;
2520 
2521 	dir = read_one_inode(root, dirid);
2522 	/* it isn't an error if the inode isn't there, that can happen
2523 	 * because we replay the deletes before we copy in the inode item
2524 	 * from the log
2525 	 */
2526 	if (!dir) {
2527 		btrfs_free_path(log_path);
2528 		return 0;
2529 	}
2530 
2531 	range_start = 0;
2532 	range_end = 0;
2533 	while (1) {
2534 		if (del_all)
2535 			range_end = (u64)-1;
2536 		else {
2537 			ret = find_dir_range(log, path, dirid,
2538 					     &range_start, &range_end);
2539 			if (ret < 0)
2540 				goto out;
2541 			else if (ret > 0)
2542 				break;
2543 		}
2544 
2545 		dir_key.offset = range_start;
2546 		while (1) {
2547 			int nritems;
2548 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2549 						0, 0);
2550 			if (ret < 0)
2551 				goto out;
2552 
2553 			nritems = btrfs_header_nritems(path->nodes[0]);
2554 			if (path->slots[0] >= nritems) {
2555 				ret = btrfs_next_leaf(root, path);
2556 				if (ret == 1)
2557 					break;
2558 				else if (ret < 0)
2559 					goto out;
2560 			}
2561 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2562 					      path->slots[0]);
2563 			if (found_key.objectid != dirid ||
2564 			    found_key.type != dir_key.type) {
2565 				ret = 0;
2566 				goto out;
2567 			}
2568 
2569 			if (found_key.offset > range_end)
2570 				break;
2571 
2572 			ret = check_item_in_log(trans, log, path,
2573 						log_path, dir,
2574 						&found_key);
2575 			if (ret)
2576 				goto out;
2577 			if (found_key.offset == (u64)-1)
2578 				break;
2579 			dir_key.offset = found_key.offset + 1;
2580 		}
2581 		btrfs_release_path(path);
2582 		if (range_end == (u64)-1)
2583 			break;
2584 		range_start = range_end + 1;
2585 	}
2586 	ret = 0;
2587 out:
2588 	btrfs_release_path(path);
2589 	btrfs_free_path(log_path);
2590 	iput(dir);
2591 	return ret;
2592 }
2593 
2594 /*
2595  * the process_func used to replay items from the log tree.  This
2596  * gets called in two different stages.  The first stage just looks
2597  * for inodes and makes sure they are all copied into the subvolume.
2598  *
2599  * The second stage copies all the other item types from the log into
2600  * the subvolume.  The two stage approach is slower, but gets rid of
2601  * lots of complexity around inodes referencing other inodes that exist
2602  * only in the log (references come from either directory items or inode
2603  * back refs).
2604  */
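/*
 * Rough ordering for each leaf item below: inode items are copied in
 * during LOG_WALK_REPLAY_INODES (replaying xattr and dir entry deletions
 * first), dir index items during LOG_WALK_REPLAY_DIR_INDEX, and all the
 * remaining types (refs, extents, xattrs) during LOG_WALK_REPLAY_ALL.
 */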
2605 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2606 			     struct walk_control *wc, u64 gen, int level)
2607 {
2608 	int nritems;
2609 	struct btrfs_path *path;
2610 	struct btrfs_root *root = wc->replay_dest;
2611 	struct btrfs_key key;
2612 	int i;
2613 	int ret;
2614 
2615 	ret = btrfs_read_buffer(eb, gen, level, NULL);
2616 	if (ret)
2617 		return ret;
2618 
2619 	level = btrfs_header_level(eb);
2620 
2621 	if (level != 0)
2622 		return 0;
2623 
2624 	path = btrfs_alloc_path();
2625 	if (!path)
2626 		return -ENOMEM;
2627 
2628 	nritems = btrfs_header_nritems(eb);
2629 	for (i = 0; i < nritems; i++) {
2630 		btrfs_item_key_to_cpu(eb, &key, i);
2631 
2632 		/* inode keys are done during the first stage */
2633 		if (key.type == BTRFS_INODE_ITEM_KEY &&
2634 		    wc->stage == LOG_WALK_REPLAY_INODES) {
2635 			struct btrfs_inode_item *inode_item;
2636 			u32 mode;
2637 
2638 			inode_item = btrfs_item_ptr(eb, i,
2639 					    struct btrfs_inode_item);
2640 			/*
2641 			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2642 			 * and never got linked before the fsync, skip it, as
2643 			 * replaying it is pointless since it would be deleted
2644 			 * later. We skip logging tmpfiles, but it's always
2645 			 * possible we are replaying a log created with a kernel
2646 			 * that used to log tmpfiles.
2647 			 */
2648 			if (btrfs_inode_nlink(eb, inode_item) == 0) {
2649 				wc->ignore_cur_inode = true;
2650 				continue;
2651 			} else {
2652 				wc->ignore_cur_inode = false;
2653 			}
2654 			ret = replay_xattr_deletes(wc->trans, root, log,
2655 						   path, key.objectid);
2656 			if (ret)
2657 				break;
2658 			mode = btrfs_inode_mode(eb, inode_item);
2659 			if (S_ISDIR(mode)) {
2660 				ret = replay_dir_deletes(wc->trans,
2661 					 root, log, path, key.objectid, 0);
2662 				if (ret)
2663 					break;
2664 			}
2665 			ret = overwrite_item(wc->trans, root, path,
2666 					     eb, i, &key);
2667 			if (ret)
2668 				break;
2669 
2670 			/*
2671 			 * Before replaying extents, truncate the inode to its
2672 			 * size. We need to do it now and not after log replay
2673 			 * because before an fsync we can have prealloc extents
2674 			 * added beyond the inode's i_size. If we did it after,
2675 			 * through orphan cleanup for example, we would drop
2676 			 * those prealloc extents just after replaying them.
2677 			 */
2678 			if (S_ISREG(mode)) {
2679 				struct btrfs_drop_extents_args drop_args = { 0 };
2680 				struct inode *inode;
2681 				u64 from;
2682 
2683 				inode = read_one_inode(root, key.objectid);
2684 				if (!inode) {
2685 					ret = -EIO;
2686 					break;
2687 				}
2688 				from = ALIGN(i_size_read(inode),
2689 					     root->fs_info->sectorsize);
2690 				drop_args.start = from;
2691 				drop_args.end = (u64)-1;
2692 				drop_args.drop_cache = true;
2693 				ret = btrfs_drop_extents(wc->trans, root,
2694 							 BTRFS_I(inode),
2695 							 &drop_args);
2696 				if (!ret) {
2697 					inode_sub_bytes(inode,
2698 							drop_args.bytes_found);
2699 					/* Update the inode's nbytes. */
2700 					ret = btrfs_update_inode(wc->trans,
2701 							root, BTRFS_I(inode));
2702 				}
2703 				iput(inode);
2704 				if (ret)
2705 					break;
2706 			}
2707 
2708 			ret = link_to_fixup_dir(wc->trans, root,
2709 						path, key.objectid);
2710 			if (ret)
2711 				break;
2712 		}
2713 
2714 		if (wc->ignore_cur_inode)
2715 			continue;
2716 
2717 		if (key.type == BTRFS_DIR_INDEX_KEY &&
2718 		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2719 			ret = replay_one_dir_item(wc->trans, root, path,
2720 						  eb, i, &key);
2721 			if (ret)
2722 				break;
2723 		}
2724 
2725 		if (wc->stage < LOG_WALK_REPLAY_ALL)
2726 			continue;
2727 
2728 		/* these keys are simply copied */
2729 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2730 			ret = overwrite_item(wc->trans, root, path,
2731 					     eb, i, &key);
2732 			if (ret)
2733 				break;
2734 		} else if (key.type == BTRFS_INODE_REF_KEY ||
2735 			   key.type == BTRFS_INODE_EXTREF_KEY) {
2736 			ret = add_inode_ref(wc->trans, root, log, path,
2737 					    eb, i, &key);
2738 			if (ret && ret != -ENOENT)
2739 				break;
2740 			ret = 0;
2741 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2742 			ret = replay_one_extent(wc->trans, root, path,
2743 						eb, i, &key);
2744 			if (ret)
2745 				break;
2746 		}
2747 		/*
2748 		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2749 		 * BTRFS_DIR_INDEX_KEY items which we use to derive the
2750 		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2751 		 * older kernel with such keys, ignore them.
2752 		 */
2753 	}
2754 	btrfs_free_path(path);
2755 	return ret;
2756 }
2757 
2758 /*
2759  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2760  */
2761 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2762 {
2763 	struct btrfs_block_group *cache;
2764 
2765 	cache = btrfs_lookup_block_group(fs_info, start);
2766 	if (!cache) {
2767 		btrfs_err(fs_info, "unable to find block group for %llu", start);
2768 		return;
2769 	}
2770 
2771 	spin_lock(&cache->space_info->lock);
2772 	spin_lock(&cache->lock);
2773 	cache->reserved -= fs_info->nodesize;
2774 	cache->space_info->bytes_reserved -= fs_info->nodesize;
2775 	spin_unlock(&cache->lock);
2776 	spin_unlock(&cache->space_info->lock);
2777 
2778 	btrfs_put_block_group(cache);
2779 }
2780 
2781 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2782 				   struct btrfs_root *root,
2783 				   struct btrfs_path *path, int *level,
2784 				   struct walk_control *wc)
2785 {
2786 	struct btrfs_fs_info *fs_info = root->fs_info;
2787 	u64 bytenr;
2788 	u64 ptr_gen;
2789 	struct extent_buffer *next;
2790 	struct extent_buffer *cur;
2791 	u32 blocksize;
2792 	int ret = 0;
2793 
2794 	while (*level > 0) {
2795 		struct btrfs_key first_key;
2796 
2797 		cur = path->nodes[*level];
2798 
2799 		WARN_ON(btrfs_header_level(cur) != *level);
2800 
2801 		if (path->slots[*level] >=
2802 		    btrfs_header_nritems(cur))
2803 			break;
2804 
2805 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2806 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2807 		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2808 		blocksize = fs_info->nodesize;
2809 
2810 		next = btrfs_find_create_tree_block(fs_info, bytenr,
2811 						    btrfs_header_owner(cur),
2812 						    *level - 1);
2813 		if (IS_ERR(next))
2814 			return PTR_ERR(next);
2815 
2816 		if (*level == 1) {
2817 			ret = wc->process_func(root, next, wc, ptr_gen,
2818 					       *level - 1);
2819 			if (ret) {
2820 				free_extent_buffer(next);
2821 				return ret;
2822 			}
2823 
2824 			path->slots[*level]++;
2825 			if (wc->free) {
2826 				ret = btrfs_read_buffer(next, ptr_gen,
2827 							*level - 1, &first_key);
2828 				if (ret) {
2829 					free_extent_buffer(next);
2830 					return ret;
2831 				}
2832 
2833 				if (trans) {
2834 					btrfs_tree_lock(next);
2835 					btrfs_clean_tree_block(next);
2836 					btrfs_wait_tree_block_writeback(next);
2837 					btrfs_tree_unlock(next);
2838 					ret = btrfs_pin_reserved_extent(trans,
2839 							bytenr, blocksize);
2840 					if (ret) {
2841 						free_extent_buffer(next);
2842 						return ret;
2843 					}
2844 					btrfs_redirty_list_add(
2845 						trans->transaction, next);
2846 				} else {
2847 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2848 						clear_extent_buffer_dirty(next);
2849 					unaccount_log_buffer(fs_info, bytenr);
2850 				}
2851 			}
2852 			free_extent_buffer(next);
2853 			continue;
2854 		}
2855 		ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2856 		if (ret) {
2857 			free_extent_buffer(next);
2858 			return ret;
2859 		}
2860 
2861 		if (path->nodes[*level-1])
2862 			free_extent_buffer(path->nodes[*level-1]);
2863 		path->nodes[*level-1] = next;
2864 		*level = btrfs_header_level(next);
2865 		path->slots[*level] = 0;
2866 		cond_resched();
2867 	}
2868 	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2869 
2870 	cond_resched();
2871 	return 0;
2872 }
2873 
2874 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2875 				 struct btrfs_root *root,
2876 				 struct btrfs_path *path, int *level,
2877 				 struct walk_control *wc)
2878 {
2879 	struct btrfs_fs_info *fs_info = root->fs_info;
2880 	int i;
2881 	int slot;
2882 	int ret;
2883 
2884 	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2885 		slot = path->slots[i];
2886 		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2887 			path->slots[i]++;
2888 			*level = i;
2889 			WARN_ON(*level == 0);
2890 			return 0;
2891 		} else {
2892 			ret = wc->process_func(root, path->nodes[*level], wc,
2893 				 btrfs_header_generation(path->nodes[*level]),
2894 				 *level);
2895 			if (ret)
2896 				return ret;
2897 
2898 			if (wc->free) {
2899 				struct extent_buffer *next;
2900 
2901 				next = path->nodes[*level];
2902 
2903 				if (trans) {
2904 					btrfs_tree_lock(next);
2905 					btrfs_clean_tree_block(next);
2906 					btrfs_wait_tree_block_writeback(next);
2907 					btrfs_tree_unlock(next);
2908 					ret = btrfs_pin_reserved_extent(trans,
2909 						     path->nodes[*level]->start,
2910 						     path->nodes[*level]->len);
2911 					if (ret)
2912 						return ret;
2913 					btrfs_redirty_list_add(trans->transaction,
2914 							       next);
2915 				} else {
2916 					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2917 						clear_extent_buffer_dirty(next);
2918 
2919 					unaccount_log_buffer(fs_info,
2920 						path->nodes[*level]->start);
2921 				}
2922 			}
2923 			free_extent_buffer(path->nodes[*level]);
2924 			path->nodes[*level] = NULL;
2925 			*level = i + 1;
2926 		}
2927 	}
2928 	return 1;
2929 }
2930 
2931 /*
2932  * drop the reference count on the tree rooted at 'log'.  This traverses
2933  * the tree freeing any blocks that have a ref count of zero after being
2934  * decremented.
2935  */
2936 static int walk_log_tree(struct btrfs_trans_handle *trans,
2937 			 struct btrfs_root *log, struct walk_control *wc)
2938 {
2939 	struct btrfs_fs_info *fs_info = log->fs_info;
2940 	int ret = 0;
2941 	int wret;
2942 	int level;
2943 	struct btrfs_path *path;
2944 	int orig_level;
2945 
2946 	path = btrfs_alloc_path();
2947 	if (!path)
2948 		return -ENOMEM;
2949 
2950 	level = btrfs_header_level(log->node);
2951 	orig_level = level;
2952 	path->nodes[level] = log->node;
2953 	atomic_inc(&log->node->refs);
2954 	path->slots[level] = 0;
2955 
2956 	while (1) {
2957 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2958 		if (wret > 0)
2959 			break;
2960 		if (wret < 0) {
2961 			ret = wret;
2962 			goto out;
2963 		}
2964 
2965 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2966 		if (wret > 0)
2967 			break;
2968 		if (wret < 0) {
2969 			ret = wret;
2970 			goto out;
2971 		}
2972 	}
2973 
2974 	/* was the root node processed? if not, catch it here */
2975 	if (path->nodes[orig_level]) {
2976 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2977 			 btrfs_header_generation(path->nodes[orig_level]),
2978 			 orig_level);
2979 		if (ret)
2980 			goto out;
2981 		if (wc->free) {
2982 			struct extent_buffer *next;
2983 
2984 			next = path->nodes[orig_level];
2985 
2986 			if (trans) {
2987 				btrfs_tree_lock(next);
2988 				btrfs_clean_tree_block(next);
2989 				btrfs_wait_tree_block_writeback(next);
2990 				btrfs_tree_unlock(next);
2991 				ret = btrfs_pin_reserved_extent(trans,
2992 						next->start, next->len);
2993 				if (ret)
2994 					goto out;
2995 				btrfs_redirty_list_add(trans->transaction, next);
2996 			} else {
2997 				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2998 					clear_extent_buffer_dirty(next);
2999 				unaccount_log_buffer(fs_info, next->start);
3000 			}
3001 		}
3002 	}
3003 
3004 out:
3005 	btrfs_free_path(path);
3006 	return ret;
3007 }
3008 
3009 /*
3010  * helper function to update the item for a given subvolumes log root
3011  * in the tree of log roots
3012  */
3013 static int update_log_root(struct btrfs_trans_handle *trans,
3014 			   struct btrfs_root *log,
3015 			   struct btrfs_root_item *root_item)
3016 {
3017 	struct btrfs_fs_info *fs_info = log->fs_info;
3018 	int ret;
3019 
3020 	if (log->log_transid == 1) {
3021 		/* insert root item on the first sync */
3022 		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
3023 				&log->root_key, root_item);
3024 	} else {
3025 		ret = btrfs_update_root(trans, fs_info->log_root_tree,
3026 				&log->root_key, root_item);
3027 	}
3028 	return ret;
3029 }
3030 
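/*
 * Log commit state lives in two slots (log_commit[0] and log_commit[1])
 * indexed by transid % 2, since at most two log transactions can be in
 * flight: e.g. log transid 5 uses slot 1 while transid 6, if already
 * started, uses slot 0.
 */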
3031 static void wait_log_commit(struct btrfs_root *root, int transid)
3032 {
3033 	DEFINE_WAIT(wait);
3034 	int index = transid % 2;
3035 
3036 	/*
3037 	 * we only allow two pending log transactions at a time,
3038 	 * so we know that if ours is more than 2 older than the
3039 	 * current transaction, we're done
3040 	 */
3041 	for (;;) {
3042 		prepare_to_wait(&root->log_commit_wait[index],
3043 				&wait, TASK_UNINTERRUPTIBLE);
3044 
3045 		if (!(root->log_transid_committed < transid &&
3046 		      atomic_read(&root->log_commit[index])))
3047 			break;
3048 
3049 		mutex_unlock(&root->log_mutex);
3050 		schedule();
3051 		mutex_lock(&root->log_mutex);
3052 	}
3053 	finish_wait(&root->log_commit_wait[index], &wait);
3054 }
3055 
3056 static void wait_for_writer(struct btrfs_root *root)
3057 {
3058 	DEFINE_WAIT(wait);
3059 
3060 	for (;;) {
3061 		prepare_to_wait(&root->log_writer_wait, &wait,
3062 				TASK_UNINTERRUPTIBLE);
3063 		if (!atomic_read(&root->log_writers))
3064 			break;
3065 
3066 		mutex_unlock(&root->log_mutex);
3067 		schedule();
3068 		mutex_lock(&root->log_mutex);
3069 	}
3070 	finish_wait(&root->log_writer_wait, &wait);
3071 }
3072 
3073 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3074 					struct btrfs_log_ctx *ctx)
3075 {
3076 	mutex_lock(&root->log_mutex);
3077 	list_del_init(&ctx->list);
3078 	mutex_unlock(&root->log_mutex);
3079 }
3080 
3081 /*
3082  * Invoked in log mutex context, or the caller must otherwise make sure
3083  * that no other task can access the list.
3084  */
3085 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3086 					     int index, int error)
3087 {
3088 	struct btrfs_log_ctx *ctx;
3089 	struct btrfs_log_ctx *safe;
3090 
3091 	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3092 		list_del_init(&ctx->list);
3093 		ctx->log_ret = error;
3094 	}
3095 }
3096 
3097 /*
3098  * btrfs_sync_log sends a given tree log down to the disk and
3099  * updates the super blocks to record it.  When this call is done,
3100  * you know that any inodes previously logged are safely on disk only
3101  * if it returns 0.
3102  *
3103  * Any other return value means you need to call btrfs_commit_transaction.
3104  * Some of the edge cases for fsyncing directories that have had unlinks
3105  * or renames done in the past mean that sometimes the only safe
3106  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
3107  * that has happened.
3108  */
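/*
 * A sketch of the expected caller pattern, roughly what the fsync path
 * (btrfs_sync_file()) does:
 *
 *	ret = btrfs_sync_log(trans, root, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_end_transaction(trans);
 *	else
 *		ret = btrfs_commit_transaction(trans);
 *
 * i.e. any nonzero return must fall back to a full transaction commit.
 */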
3109 int btrfs_sync_log(struct btrfs_trans_handle *trans,
3110 		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3111 {
3112 	int index1;
3113 	int index2;
3114 	int mark;
3115 	int ret;
3116 	struct btrfs_fs_info *fs_info = root->fs_info;
3117 	struct btrfs_root *log = root->log_root;
3118 	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3119 	struct btrfs_root_item new_root_item;
3120 	int log_transid = 0;
3121 	struct btrfs_log_ctx root_log_ctx;
3122 	struct blk_plug plug;
3123 	u64 log_root_start;
3124 	u64 log_root_level;
3125 
3126 	mutex_lock(&root->log_mutex);
3127 	log_transid = ctx->log_transid;
3128 	if (root->log_transid_committed >= log_transid) {
3129 		mutex_unlock(&root->log_mutex);
3130 		return ctx->log_ret;
3131 	}
3132 
3133 	index1 = log_transid % 2;
3134 	if (atomic_read(&root->log_commit[index1])) {
3135 		wait_log_commit(root, log_transid);
3136 		mutex_unlock(&root->log_mutex);
3137 		return ctx->log_ret;
3138 	}
3139 	ASSERT(log_transid == root->log_transid);
3140 	atomic_set(&root->log_commit[index1], 1);
3141 
3142 	/* wait for previous tree log sync to complete */
3143 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3144 		wait_log_commit(root, log_transid - 1);
3145 
3146 	while (1) {
3147 		int batch = atomic_read(&root->log_batch);
3148 		/* when we're on an ssd, just kick the log commit out */
3149 		if (!btrfs_test_opt(fs_info, SSD) &&
3150 		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3151 			mutex_unlock(&root->log_mutex);
3152 			schedule_timeout_uninterruptible(1);
3153 			mutex_lock(&root->log_mutex);
3154 		}
3155 		wait_for_writer(root);
3156 		if (batch == atomic_read(&root->log_batch))
3157 			break;
3158 	}
3159 
3160 	/* bail out if we need to do a full commit */
3161 	if (btrfs_need_log_full_commit(trans)) {
3162 		ret = -EAGAIN;
3163 		mutex_unlock(&root->log_mutex);
3164 		goto out;
3165 	}
3166 
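	/*
	 * Alternate the extent bit used to track this log transid's dirty
	 * pages, so writeback of the previous log transaction (which used
	 * the other bit) can be waited for independently.
	 */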
3167 	if (log_transid % 2 == 0)
3168 		mark = EXTENT_DIRTY;
3169 	else
3170 		mark = EXTENT_NEW;
3171 
3172 	/* we start IO on all the marked extents here, but we don't actually
3173 	 * wait for them until later.
3174 	 */
3175 	blk_start_plug(&plug);
3176 	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3177 	/*
3178 	 * -EAGAIN happens when someone, e.g. a concurrent transaction
3179 	 * commit, writes a dirty extent in this tree-log commit. This
3180 	 * concurrent write creates a hole in the extents being written out,
3181 	 * and we cannot proceed on a zoned filesystem, which requires
3182 	 * sequential writing. While we could bail out to a full commit
3183 	 * here, we continue instead, hoping the concurrent writing fills
3184 	 * the hole.
3185 	 */
3186 	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
3187 		ret = 0;
3188 	if (ret) {
3189 		blk_finish_plug(&plug);
3190 		btrfs_abort_transaction(trans, ret);
3191 		btrfs_set_log_full_commit(trans);
3192 		mutex_unlock(&root->log_mutex);
3193 		goto out;
3194 	}
3195 
3196 	/*
3197 	 * We _must_ update under the root->log_mutex in order to make sure we
3198 	 * have a consistent view of the log root we are trying to commit at
3199 	 * this moment.
3200 	 *
3201 	 * We _must_ copy this into a local copy, because we are not holding the
3202 	 * log_root_tree->log_mutex yet.  This is important because when we
3203 	 * commit the log_root_tree we must have a consistent view of the
3204 	 * log_root_tree when we update the super block to point at the
3205 	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3206 	 * with the commit and possibly point at the new block which we may not
3207 	 * have written out.
3208 	 */
3209 	btrfs_set_root_node(&log->root_item, log->node);
3210 	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3211 
3212 	root->log_transid++;
3213 	log->log_transid = root->log_transid;
3214 	root->log_start_pid = 0;
3215 	/*
3216 	 * IO has been started, blocks of the log tree have the WRITTEN flag
3217 	 * set in their headers. New modifications of the log will be written
3218 	 * to new positions, so it's safe to allow log writers to go in.
3219 	 */
3220 	mutex_unlock(&root->log_mutex);
3221 
3222 	if (btrfs_is_zoned(fs_info)) {
3223 		mutex_lock(&fs_info->tree_root->log_mutex);
3224 		if (!log_root_tree->node) {
3225 			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3226 			if (ret) {
3227 				mutex_unlock(&fs_info->tree_root->log_mutex);
3228 				goto out;
3229 			}
3230 		}
3231 		mutex_unlock(&fs_info->tree_root->log_mutex);
3232 	}
3233 
3234 	btrfs_init_log_ctx(&root_log_ctx, NULL);
3235 
3236 	mutex_lock(&log_root_tree->log_mutex);
3237 
3238 	index2 = log_root_tree->log_transid % 2;
3239 	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3240 	root_log_ctx.log_transid = log_root_tree->log_transid;
3241 
3242 	/*
3243 	 * Now we are safe to update the log_root_tree because we're under the
3244 	 * log_mutex, and we're a current writer so we're holding the commit
3245 	 * open until we drop the log_mutex.
3246 	 */
3247 	ret = update_log_root(trans, log, &new_root_item);
3248 	if (ret) {
3249 		if (!list_empty(&root_log_ctx.list))
3250 			list_del_init(&root_log_ctx.list);
3251 
3252 		blk_finish_plug(&plug);
3253 		btrfs_set_log_full_commit(trans);
3254 
3255 		if (ret != -ENOSPC) {
3256 			btrfs_abort_transaction(trans, ret);
3257 			mutex_unlock(&log_root_tree->log_mutex);
3258 			goto out;
3259 		}
3260 		btrfs_wait_tree_log_extents(log, mark);
3261 		mutex_unlock(&log_root_tree->log_mutex);
3262 		ret = -EAGAIN;
3263 		goto out;
3264 	}
3265 
3266 	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3267 		blk_finish_plug(&plug);
3268 		list_del_init(&root_log_ctx.list);
3269 		mutex_unlock(&log_root_tree->log_mutex);
3270 		ret = root_log_ctx.log_ret;
3271 		goto out;
3272 	}
3273 
3274 	index2 = root_log_ctx.log_transid % 2;
3275 	if (atomic_read(&log_root_tree->log_commit[index2])) {
3276 		blk_finish_plug(&plug);
3277 		ret = btrfs_wait_tree_log_extents(log, mark);
3278 		wait_log_commit(log_root_tree,
3279 				root_log_ctx.log_transid);
3280 		mutex_unlock(&log_root_tree->log_mutex);
3281 		if (!ret)
3282 			ret = root_log_ctx.log_ret;
3283 		goto out;
3284 	}
3285 	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3286 	atomic_set(&log_root_tree->log_commit[index2], 1);
3287 
3288 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3289 		wait_log_commit(log_root_tree,
3290 				root_log_ctx.log_transid - 1);
3291 	}
3292 
3293 	/*
3294 	 * now that we've moved on to the tree of log tree roots,
3295 	 * check the full commit flag again
3296 	 */
3297 	if (btrfs_need_log_full_commit(trans)) {
3298 		blk_finish_plug(&plug);
3299 		btrfs_wait_tree_log_extents(log, mark);
3300 		mutex_unlock(&log_root_tree->log_mutex);
3301 		ret = -EAGAIN;
3302 		goto out_wake_log_root;
3303 	}
3304 
3305 	ret = btrfs_write_marked_extents(fs_info,
3306 					 &log_root_tree->dirty_log_pages,
3307 					 EXTENT_DIRTY | EXTENT_NEW);
3308 	blk_finish_plug(&plug);
3309 	/*
3310 	 * As described above, -EAGAIN indicates a hole in the extents. We
3311 	 * cannot wait for these writeouts since waiting for them would cause a
3312 	 * deadlock. Bail out to the full commit instead.
3313 	 */
3314 	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3315 		btrfs_set_log_full_commit(trans);
3316 		btrfs_wait_tree_log_extents(log, mark);
3317 		mutex_unlock(&log_root_tree->log_mutex);
3318 		goto out_wake_log_root;
3319 	} else if (ret) {
3320 		btrfs_set_log_full_commit(trans);
3321 		btrfs_abort_transaction(trans, ret);
3322 		mutex_unlock(&log_root_tree->log_mutex);
3323 		goto out_wake_log_root;
3324 	}
3325 	ret = btrfs_wait_tree_log_extents(log, mark);
3326 	if (!ret)
3327 		ret = btrfs_wait_tree_log_extents(log_root_tree,
3328 						  EXTENT_NEW | EXTENT_DIRTY);
3329 	if (ret) {
3330 		btrfs_set_log_full_commit(trans);
3331 		mutex_unlock(&log_root_tree->log_mutex);
3332 		goto out_wake_log_root;
3333 	}
3334 
3335 	log_root_start = log_root_tree->node->start;
3336 	log_root_level = btrfs_header_level(log_root_tree->node);
3337 	log_root_tree->log_transid++;
3338 	mutex_unlock(&log_root_tree->log_mutex);
3339 
3340 	/*
3341 	 * Here we are guaranteed that nobody is going to write the superblock
3342 	 * for the current transaction before us, and that we will not write
3343 	 * our superblock before the previous transaction finishes its commit
3344 	 * and writes its superblock, because:
3345 	 *
3346 	 * 1) We are holding a handle on the current transaction, so nobody
3347 	 *    can commit it until we release the handle;
3348 	 *
3349 	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3350 	 *    if the previous transaction is still committing, and hasn't yet
3351 	 *    written its superblock, we wait for it to do it, because a
3352 	 *    transaction commit acquires the tree_log_mutex when the commit
3353 	 *    begins and releases it only after writing its superblock.
3354 	 */
3355 	mutex_lock(&fs_info->tree_log_mutex);
3356 
3357 	/*
3358 	 * The previous transaction writeout phase could have failed, and thus
3359 	 * marked the fs in an error state.  We must not commit here, as we
3360 	 * could have updated our generation in the super_for_commit and
3361 	 * writing the super here would result in transid mismatches.  If there
3362 	 * is an error here just bail.
3363 	 */
3364 	if (BTRFS_FS_ERROR(fs_info)) {
3365 		ret = -EIO;
3366 		btrfs_set_log_full_commit(trans);
3367 		btrfs_abort_transaction(trans, ret);
3368 		mutex_unlock(&fs_info->tree_log_mutex);
3369 		goto out_wake_log_root;
3370 	}
3371 
3372 	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3373 	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3374 	ret = write_all_supers(fs_info, 1);
3375 	mutex_unlock(&fs_info->tree_log_mutex);
3376 	if (ret) {
3377 		btrfs_set_log_full_commit(trans);
3378 		btrfs_abort_transaction(trans, ret);
3379 		goto out_wake_log_root;
3380 	}
3381 
3382 	/*
3383 	 * We know there can only be one task here, since we have not yet set
3384 	 * root->log_commit[index1] to 0 and any task attempting to sync the
3385 	 * log must wait for the previous log transaction to commit if it's
3386 	 * still in progress or wait for the current log transaction commit if
3387 	 * someone else already started it. We use <= and not < because the
3388 	 * first log transaction has an ID of 0.
3389 	 */
3390 	ASSERT(root->last_log_commit <= log_transid);
3391 	root->last_log_commit = log_transid;
3392 
3393 out_wake_log_root:
3394 	mutex_lock(&log_root_tree->log_mutex);
3395 	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3396 
3397 	log_root_tree->log_transid_committed++;
3398 	atomic_set(&log_root_tree->log_commit[index2], 0);
3399 	mutex_unlock(&log_root_tree->log_mutex);
3400 
3401 	/*
3402 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3403 	 * all the updates above are seen by the woken threads. It might not be
3404 	 * necessary, but proving that seems to be hard.
3405 	 */
3406 	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3407 out:
3408 	mutex_lock(&root->log_mutex);
3409 	btrfs_remove_all_log_ctxs(root, index1, ret);
3410 	root->log_transid_committed++;
3411 	atomic_set(&root->log_commit[index1], 0);
3412 	mutex_unlock(&root->log_mutex);
3413 
3414 	/*
3415 	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3416 	 * all the updates above are seen by the woken threads. It might not be
3417 	 * necessary, but proving that seems to be hard.
3418 	 */
3419 	cond_wake_up(&root->log_commit_wait[index1]);
3420 	return ret;
3421 }
3422 
3423 static void free_log_tree(struct btrfs_trans_handle *trans,
3424 			  struct btrfs_root *log)
3425 {
3426 	int ret;
3427 	struct walk_control wc = {
3428 		.free = 1,
3429 		.process_func = process_one_buffer
3430 	};
3431 
3432 	if (log->node) {
3433 		ret = walk_log_tree(trans, log, &wc);
3434 		if (ret) {
3435 			/*
3436 			 * We weren't able to traverse the entire log tree, the
3437 			 * typical scenario is getting an -EIO when reading an
3438 			 * extent buffer of the tree, due to a previous writeback
3439 			 * failure of it.
3440 			 */
3441 			set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3442 				&log->fs_info->fs_state);
3443 
3444 			/*
3445 			 * Some extent buffers of the log tree may still be dirty
3446 			 * and not yet written back to storage, because we may
3447 			 * have updates to a log tree without syncing a log tree,
3448 			 * such as during rename and link operations. So flush
3449 			 * them out and wait for their writeback to complete, so
3450 			 * that we properly cleanup their state and pages.
3451 			 */
3452 			btrfs_write_marked_extents(log->fs_info,
3453 						   &log->dirty_log_pages,
3454 						   EXTENT_DIRTY | EXTENT_NEW);
3455 			btrfs_wait_tree_log_extents(log,
3456 						    EXTENT_DIRTY | EXTENT_NEW);
3457 
3458 			if (trans)
3459 				btrfs_abort_transaction(trans, ret);
3460 			else
3461 				btrfs_handle_fs_error(log->fs_info, ret, NULL);
3462 		}
3463 	}
3464 
3465 	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3466 			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3467 	extent_io_tree_release(&log->log_csum_range);
3468 
3469 	btrfs_put_root(log);
3470 }
3471 
3472 /*
3473  * free all the extents used by the tree log.  This should be called
3474  * at commit time of the full transaction
3475  */
3476 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3477 {
3478 	if (root->log_root) {
3479 		free_log_tree(trans, root->log_root);
3480 		root->log_root = NULL;
3481 		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3482 	}
3483 	return 0;
3484 }
3485 
3486 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3487 			     struct btrfs_fs_info *fs_info)
3488 {
3489 	if (fs_info->log_root_tree) {
3490 		free_log_tree(trans, fs_info->log_root_tree);
3491 		fs_info->log_root_tree = NULL;
3492 		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3493 	}
3494 	return 0;
3495 }
3496 
3497 /*
3498  * Check if an inode was logged in the current transaction. This may often
3499  * return some false positives, because logged_trans is an in memory only field,
3500  * not persisted anywhere. This is meant to be used in contexts where a false
3501  * positive has no functional consequences.
3502  */
3503 static bool inode_logged(struct btrfs_trans_handle *trans,
3504 			 struct btrfs_inode *inode)
3505 {
3506 	if (inode->logged_trans == trans->transid)
3507 		return true;
3508 
3509 	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
3510 		return false;
3511 
3512 	/*
3513 	 * The inode's logged_trans is always 0 when we load it (because it is
3514 	 * not persisted in the inode item or elsewhere). So if it is 0 and the
3515 	 * inode was last modified in the current transaction, the inode may have
3516 	 * been logged before in the current transaction, then evicted and loaded
3517 	 * again in the current transaction - or it may have never been logged in
3518 	 * the current transaction at all. Since we cannot be sure, we have to
3519 	 * assume it was, otherwise our callers can leave an inconsistent log.
3520 	 */
3521 	if (inode->logged_trans == 0 &&
3522 	    inode->last_trans == trans->transid &&
3523 	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3524 		return true;
3525 
3526 	return false;
3527 }
3528 
3529 /*
3530  * If both a file and directory are logged, and unlinks or renames are
3531  * mixed in, we have a few interesting corners:
3532  *
3533  * create file X in dir Y
3534  * link file X to X.link in dir Y
3535  * fsync file X
3536  * unlink file X but leave X.link
3537  * fsync dir Y
3538  *
3539  * After a crash we would expect only X.link to exist.  But file X
3540  * didn't get fsync'd again so the log has back refs for X and X.link.
3541  *
3542  * We solve this by removing directory entries and inode backrefs from the
3543  * log when a file that was logged in the current transaction is
3544  * unlinked.  Any later fsync will include the updated log entries, and
3545  * we'll be able to reconstruct the proper directory items from backrefs.
3546  *
3547  * This optimization allows us to avoid relogging the entire inode
3548  * or the entire directory.
3549  */
3550 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3551 				  struct btrfs_root *root,
3552 				  const char *name, int name_len,
3553 				  struct btrfs_inode *dir, u64 index)
3554 {
3555 	struct btrfs_root *log;
3556 	struct btrfs_dir_item *di;
3557 	struct btrfs_path *path;
3558 	int ret;
3559 	int err = 0;
3560 	u64 dir_ino = btrfs_ino(dir);
3561 
3562 	if (!inode_logged(trans, dir))
3563 		return;
3564 
3565 	ret = join_running_log_trans(root);
3566 	if (ret)
3567 		return;
3568 
3569 	mutex_lock(&dir->log_mutex);
3570 
3571 	log = root->log_root;
3572 	path = btrfs_alloc_path();
3573 	if (!path) {
3574 		err = -ENOMEM;
3575 		goto out_unlock;
3576 	}
3577 
3578 	/*
3579 	 * We only log dir index items of a directory, so we don't need to look
3580 	 * for dir item keys.
3581 	 */
3582 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3583 					 index, name, name_len, -1);
3584 	if (IS_ERR(di)) {
3585 		err = PTR_ERR(di);
3586 		goto fail;
3587 	}
3588 	if (di) {
3589 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3590 		if (ret) {
3591 			err = ret;
3592 			goto fail;
3593 		}
3594 	}
3595 
3596 	/*
3597 	 * We do not need to update the size field of the directory's inode item
3598 	 * because on log replay we update the field to reflect all existing
3599 	 * entries in the directory (see overwrite_item()).
3600 	 */
3601 fail:
3602 	btrfs_free_path(path);
3603 out_unlock:
3604 	mutex_unlock(&dir->log_mutex);
3605 	if (err < 0)
3606 		btrfs_set_log_full_commit(trans);
3607 	btrfs_end_log_trans(root);
3608 }
3609 
3610 /* see comments for btrfs_del_dir_entries_in_log */
3611 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3612 				struct btrfs_root *root,
3613 				const char *name, int name_len,
3614 				struct btrfs_inode *inode, u64 dirid)
3615 {
3616 	struct btrfs_root *log;
3617 	u64 index;
3618 	int ret;
3619 
3620 	if (!inode_logged(trans, inode))
3621 		return;
3622 
3623 	ret = join_running_log_trans(root);
3624 	if (ret)
3625 		return;
3626 	log = root->log_root;
3627 	mutex_lock(&inode->log_mutex);
3628 
3629 	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3630 				  dirid, &index);
3631 	mutex_unlock(&inode->log_mutex);
3632 	if (ret < 0 && ret != -ENOENT)
3633 		btrfs_set_log_full_commit(trans);
3634 	btrfs_end_log_trans(root);
3635 }
3636 
3637 /*
3638  * creates a range item in the log for 'dirid'.  first_offset and
3639  * last_offset tell us which parts of the key space the log should
3640  * be considered authoritative for.
3641  */
3642 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3643 				       struct btrfs_root *log,
3644 				       struct btrfs_path *path,
3645 				       u64 dirid,
3646 				       u64 first_offset, u64 last_offset)
3647 {
3648 	int ret;
3649 	struct btrfs_key key;
3650 	struct btrfs_dir_log_item *item;
3651 
3652 	key.objectid = dirid;
3653 	key.offset = first_offset;
3654 	key.type = BTRFS_DIR_LOG_INDEX_KEY;
3655 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3656 	if (ret)
3657 		return ret;
3658 
3659 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3660 			      struct btrfs_dir_log_item);
3661 	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3662 	btrfs_mark_buffer_dirty(path->nodes[0]);
3663 	btrfs_release_path(path);
3664 	return 0;
3665 }
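
/*
 * Illustration of the range item created above (hypothetical numbers): if we
 * log dir index offsets 3 to 7 for directory inode 256, we end up with a
 * single item with key (256 BTRFS_DIR_LOG_INDEX_KEY 3) and a dir_log_end
 * field of 7, and on replay the log is then treated as authoritative only
 * for index offsets in the range [3, 7] of that directory.
 */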
3666 
3667 static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3668 				 struct btrfs_root *log,
3669 				 struct extent_buffer *src,
3670 				 struct btrfs_path *dst_path,
3671 				 int start_slot,
3672 				 int count)
3673 {
3674 	char *ins_data = NULL;
3675 	struct btrfs_item_batch batch;
3676 	struct extent_buffer *dst;
3677 	unsigned long src_offset;
3678 	unsigned long dst_offset;
3679 	struct btrfs_key key;
3680 	u32 item_size;
3681 	int ret;
3682 	int i;
3683 
3684 	ASSERT(count > 0);
3685 	batch.nr = count;
3686 
3687 	if (count == 1) {
3688 		btrfs_item_key_to_cpu(src, &key, start_slot);
3689 		item_size = btrfs_item_size(src, start_slot);
3690 		batch.keys = &key;
3691 		batch.data_sizes = &item_size;
3692 		batch.total_data_size = item_size;
3693 	} else {
3694 		struct btrfs_key *ins_keys;
3695 		u32 *ins_sizes;
3696 
3697 		ins_data = kmalloc(count * sizeof(u32) +
3698 				   count * sizeof(struct btrfs_key), GFP_NOFS);
3699 		if (!ins_data)
3700 			return -ENOMEM;
3701 
3702 		ins_sizes = (u32 *)ins_data;
3703 		ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3704 		batch.keys = ins_keys;
3705 		batch.data_sizes = ins_sizes;
3706 		batch.total_data_size = 0;
3707 
3708 		for (i = 0; i < count; i++) {
3709 			const int slot = start_slot + i;
3710 
3711 			btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3712 			ins_sizes[i] = btrfs_item_size(src, slot);
3713 			batch.total_data_size += ins_sizes[i];
3714 		}
3715 	}
3716 
3717 	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3718 	if (ret)
3719 		goto out;
3720 
3721 	dst = dst_path->nodes[0];
3722 	/*
3723 	 * Copy all the items in bulk, in a single copy operation. Item data is
3724 	 * organized such that it's placed at the end of a leaf and from right
3725 	 * to left. For example, the data for the second item ends at an offset
3726 	 * that matches the offset where the data for the first item starts, the
3727 	 * data for the third item ends at an offset that matches the offset
3728 	 * where the data of the second item starts, and so on.
3729 	 * Therefore our source and destination start offsets for copy match the
3730 	 * offsets of the last items (highest slots).
3731 	 */
3732 	dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3733 	src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3734 	copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3735 	btrfs_release_path(dst_path);
3736 out:
3737 	kfree(ins_data);
3738 
3739 	return ret;
3740 }
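
/*
 * Worked example for the bulk copy in flush_dir_items_batch() (hypothetical
 * sizes): for three items with data sizes of 10, 20 and 30 bytes,
 * total_data_size is 60. Because leaf item data is laid out from the end of
 * the leaf towards its start, the data of the third (highest slot) item sits
 * at the lowest offset, so a single copy_extent_buffer() of 60 bytes starting
 * at that item's data offset moves the data of all three items at once.
 */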
3741 
3742 static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3743 				  struct btrfs_inode *inode,
3744 				  struct btrfs_path *path,
3745 				  struct btrfs_path *dst_path,
3746 				  struct btrfs_log_ctx *ctx)
3747 {
3748 	struct btrfs_root *log = inode->root->log_root;
3749 	struct extent_buffer *src = path->nodes[0];
3750 	const int nritems = btrfs_header_nritems(src);
3751 	const u64 ino = btrfs_ino(inode);
3752 	const bool inode_logged_before = inode_logged(trans, inode);
3753 	bool last_found = false;
3754 	int batch_start = 0;
3755 	int batch_size = 0;
3756 	int i;
3757 
3758 	for (i = path->slots[0]; i < nritems; i++) {
3759 		struct btrfs_key key;
3760 		int ret;
3761 
3762 		btrfs_item_key_to_cpu(src, &key, i);
3763 
3764 		if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3765 			last_found = true;
3766 			break;
3767 		}
3768 
3769 		ctx->last_dir_item_offset = key.offset;
3770 		/*
3771 		 * We must make sure that when we log a directory entry, the
3772 		 * corresponding inode, after log replay, has a matching link
3773 		 * count. For example:
3774 		 *
3775 		 * touch foo
3776 		 * mkdir mydir
3777 		 * sync
3778 		 * ln foo mydir/bar
3779 		 * xfs_io -c "fsync" mydir
3780 		 * <crash>
3781 		 * <mount fs and log replay>
3782 		 *
3783 		 * This would result in an fsync log that, when replayed, leaves
3784 		 * our file inode with a link count of 1 while two directory
3785 		 * entries point to the same inode. After removing one of the
3786 		 * names, it would not be possible to remove the other name, which
3787 		 * always resulted in stale file handle errors, and it would not
3788 		 * be possible to rmdir the parent directory, since its i_size
3789 		 * could never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3790 		 * resulting in -ENOTEMPTY errors.
3791 		 */
3792 		if (!ctx->log_new_dentries) {
3793 			struct btrfs_dir_item *di;
3794 			struct btrfs_key di_key;
3795 
3796 			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3797 			btrfs_dir_item_key_to_cpu(src, di, &di_key);
3798 			if ((btrfs_dir_transid(src, di) == trans->transid ||
3799 			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3800 			    di_key.type != BTRFS_ROOT_ITEM_KEY)
3801 				ctx->log_new_dentries = true;
3802 		}
3803 
3804 		if (!inode_logged_before)
3805 			goto add_to_batch;
3806 
3807 		/*
3808 		 * If the inode was logged before and has logged dir items, we can
3809 		 * skip checking whether any item with a key offset larger than the
3810 		 * last one we logged is in the log tree, saving time and avoiding
3811 		 * adding contention on the log tree.
3812 		 */
3813 		if (key.offset > inode->last_dir_index_offset)
3814 			goto add_to_batch;
3815 		/*
3816 		 * Check if the key was already logged before. If not we can add
3817 		 * it to a batch for bulk insertion.
3818 		 */
3819 		ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
3820 		if (ret < 0) {
3821 			return ret;
3822 		} else if (ret > 0) {
3823 			btrfs_release_path(dst_path);
3824 			goto add_to_batch;
3825 		}
3826 
3827 		/*
3828 		 * Item exists in the log. Overwrite the item in the log if it
3829 		 * has different content or do nothing if it has exactly the same
3830 		 * content. And then flush the current batch if any - do it after
3831 		 * overwriting the current item, or we would deadlock otherwise,
3832 		 * since we are holding a path for the existing item.
3833 		 */
3834 		ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
3835 		if (ret < 0)
3836 			return ret;
3837 
3838 		if (batch_size > 0) {
3839 			ret = flush_dir_items_batch(trans, log, src, dst_path,
3840 						    batch_start, batch_size);
3841 			if (ret < 0)
3842 				return ret;
3843 			batch_size = 0;
3844 		}
3845 		continue;
3846 add_to_batch:
3847 		if (batch_size == 0)
3848 			batch_start = i;
3849 		batch_size++;
3850 	}
3851 
3852 	if (batch_size > 0) {
3853 		int ret;
3854 
3855 		ret = flush_dir_items_batch(trans, log, src, dst_path,
3856 					    batch_start, batch_size);
3857 		if (ret < 0)
3858 			return ret;
3859 	}
3860 
3861 	return last_found ? 1 : 0;
3862 }
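
/*
 * Note on the return convention of process_dir_items_leaf(): 1 means we found
 * a key beyond the directory's dir index items (nothing left to log in this
 * key range), 0 means the caller should continue with the next leaf, and a
 * negative value is an error.
 */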
3863 
3864 /*
3865  * log all the items included in the current transaction for a given
3866  * directory.  This also creates the range items in the log tree required
3867  * to replay anything deleted before the fsync
3868  */
3869 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3870 			  struct btrfs_inode *inode,
3871 			  struct btrfs_path *path,
3872 			  struct btrfs_path *dst_path,
3873 			  struct btrfs_log_ctx *ctx,
3874 			  u64 min_offset, u64 *last_offset_ret)
3875 {
3876 	struct btrfs_key min_key;
3877 	struct btrfs_root *root = inode->root;
3878 	struct btrfs_root *log = root->log_root;
3879 	int err = 0;
3880 	int ret;
3881 	u64 first_offset = min_offset;
3882 	u64 last_offset = (u64)-1;
3883 	u64 ino = btrfs_ino(inode);
3884 
3885 	min_key.objectid = ino;
3886 	min_key.type = BTRFS_DIR_INDEX_KEY;
3887 	min_key.offset = min_offset;
3888 
3889 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3890 
3891 	/*
3892 	 * we didn't find anything from this transaction, see if there
3893 	 * is anything at all
3894 	 */
3895 	if (ret != 0 || min_key.objectid != ino ||
3896 	    min_key.type != BTRFS_DIR_INDEX_KEY) {
3897 		min_key.objectid = ino;
3898 		min_key.type = BTRFS_DIR_INDEX_KEY;
3899 		min_key.offset = (u64)-1;
3900 		btrfs_release_path(path);
3901 		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3902 		if (ret < 0) {
3903 			btrfs_release_path(path);
3904 			return ret;
3905 		}
3906 		ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3907 
3908 		/*
3909 		 * If ret == 0 there are items for this type, create a range to tell
3910 		 * us the last key of this type. Otherwise, there are no items in this
3911 		 * directory after *min_offset, and we create a range to indicate that.
3912 		 */
3913 		if (ret == 0) {
3914 			struct btrfs_key tmp;
3915 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3916 					      path->slots[0]);
3917 			if (tmp.type == BTRFS_DIR_INDEX_KEY)
3918 				first_offset = max(min_offset, tmp.offset) + 1;
3919 		}
3920 		goto done;
3921 	}
3922 
3923 	/* go backward to find any previous key */
3924 	ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3925 	if (ret == 0) {
3926 		struct btrfs_key tmp;
3927 		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3928 		if (tmp.type == BTRFS_DIR_INDEX_KEY) {
3929 			first_offset = tmp.offset;
3930 			ret = overwrite_item(trans, log, dst_path,
3931 					     path->nodes[0], path->slots[0],
3932 					     &tmp);
3933 			if (ret) {
3934 				err = ret;
3935 				goto done;
3936 			}
3937 		}
3938 	}
3939 	btrfs_release_path(path);
3940 
3941 	/*
3942 	 * Find the first key from this transaction again.  See the note for
3943 	 * log_new_dir_dentries, if we're logging a directory recursively we
3944 	 * won't be holding its i_mutex, which means we can modify the directory
3945 	 * while we're logging it.  If we remove an entry between our first
3946 	 * search and this search we'll not find the key again and can just
3947 	 * bail.
3948 	 */
3949 search:
3950 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3951 	if (ret != 0)
3952 		goto done;
3953 
3954 	/*
3955 	 * we have a block from this transaction, log every item in it
3956 	 * from our directory
3957 	 */
3958 	while (1) {
3959 		ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx);
3960 		if (ret != 0) {
3961 			if (ret < 0)
3962 				err = ret;
3963 			goto done;
3964 		}
3965 		path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3966 
3967 		/*
3968 		 * look ahead to the next item and see if it is also
3969 		 * from this directory and from this transaction
3970 		 */
3971 		ret = btrfs_next_leaf(root, path);
3972 		if (ret) {
3973 			if (ret == 1)
3974 				last_offset = (u64)-1;
3975 			else
3976 				err = ret;
3977 			goto done;
3978 		}
3979 		btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
3980 		if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3981 			last_offset = (u64)-1;
3982 			goto done;
3983 		}
3984 		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3985 			ctx->last_dir_item_offset = min_key.offset;
3986 			ret = overwrite_item(trans, log, dst_path,
3987 					     path->nodes[0], path->slots[0],
3988 					     &min_key);
3989 			if (ret)
3990 				err = ret;
3991 			else
3992 				last_offset = min_key.offset;
3993 			goto done;
3994 		}
3995 		if (need_resched()) {
3996 			btrfs_release_path(path);
3997 			cond_resched();
3998 			goto search;
3999 		}
4000 	}
4001 done:
4002 	btrfs_release_path(path);
4003 	btrfs_release_path(dst_path);
4004 
4005 	if (err == 0) {
4006 		*last_offset_ret = last_offset;
4007 		/*
4008 		 * insert the log range keys to indicate where the log
4009 		 * is valid
4010 		 */
4011 		ret = insert_dir_log_key(trans, log, path, ino, first_offset,
4012 					 last_offset);
4013 		if (ret)
4014 			err = ret;
4015 	}
4016 	return err;
4017 }
4018 
4019 /*
4020  * Logging directories is very similar to logging inodes. We find all the items
4021  * from the current transaction and write them to the log.
4022  *
4023  * The recovery code scans the directory in the subvolume, and if it finds a
4024  * key in the range logged that is not present in the log tree, then it means
4025  * that dir entry was unlinked during the transaction.
4026  *
4027  * In order for that scan to work, we must include one key smaller than
4028  * the smallest key logged by this transaction and one key larger than the largest
4029  * key logged by this transaction.
4030  */
4031 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4032 			  struct btrfs_inode *inode,
4033 			  struct btrfs_path *path,
4034 			  struct btrfs_path *dst_path,
4035 			  struct btrfs_log_ctx *ctx)
4036 {
4037 	u64 min_key;
4038 	u64 max_key;
4039 	int ret;
4040 
4041 	/*
4042 	 * If this is the first time we are being logged in the current
4043 	 * transaction, or we were logged before but the inode was evicted and
4044 	 * reloaded later, in which case its logged_trans is 0, reset the value
4045 	 * of the last logged key offset. Note that we don't use the helper
4046 	 * function inode_logged() here - that is because the function returns
4047 	 * true after an inode eviction, assuming the worst case as it can not
4048 	 * know for sure if the inode was logged before. So we can not skip key
4049 	 * searches in the case the inode was evicted, because it may not have
4050 	 * been logged in this transaction and may have been logged in a past
4051 	 * transaction, so we need to reset the last dir index offset to (u64)-1.
4052 	 */
4053 	if (inode->logged_trans != trans->transid)
4054 		inode->last_dir_index_offset = (u64)-1;
4055 
4056 	min_key = 0;
4057 	max_key = 0;
4058 	ctx->last_dir_item_offset = inode->last_dir_index_offset;
4059 
4060 	while (1) {
4061 		ret = log_dir_items(trans, inode, path, dst_path,
4062 				ctx, min_key, &max_key);
4063 		if (ret)
4064 			return ret;
4065 		if (max_key == (u64)-1)
4066 			break;
4067 		min_key = max_key + 1;
4068 	}
4069 
4070 	inode->last_dir_index_offset = ctx->last_dir_item_offset;
4071 
4072 	return 0;
4073 }
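
/*
 * Illustrative walk of the loop in log_directory_changes() (hypothetical
 * offsets): a first call to log_dir_items() may log dir index offsets
 * [0, 100] and set max_key to 100, the next iteration then starts at
 * min_key == 101, and we stop once a call sets max_key to (u64)-1, meaning
 * the log is now authoritative up to the end of the directory's index key
 * space.
 */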
4074 
4075 /*
4076  * a helper function to drop items from the log before we relog an
4077  * inode.  max_key_type indicates the highest item type to remove.
4078  * This cannot be run for file data extents because it does not
4079  * free the extents they point to.
4080  */
4081 static int drop_inode_items(struct btrfs_trans_handle *trans,
4082 				  struct btrfs_root *log,
4083 				  struct btrfs_path *path,
4084 				  struct btrfs_inode *inode,
4085 				  int max_key_type)
4086 {
4087 	int ret;
4088 	struct btrfs_key key;
4089 	struct btrfs_key found_key;
4090 	int start_slot;
4091 
4092 	if (!inode_logged(trans, inode))
4093 		return 0;
4094 
4095 	key.objectid = btrfs_ino(inode);
4096 	key.type = max_key_type;
4097 	key.offset = (u64)-1;
4098 
4099 	while (1) {
4100 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4101 		BUG_ON(ret == 0); /* Logic error */
4102 		if (ret < 0)
4103 			break;
4104 
4105 		if (path->slots[0] == 0)
4106 			break;
4107 
4108 		path->slots[0]--;
4109 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4110 				      path->slots[0]);
4111 
4112 		if (found_key.objectid != key.objectid)
4113 			break;
4114 
4115 		found_key.offset = 0;
4116 		found_key.type = 0;
4117 		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
4118 		if (ret < 0)
4119 			break;
4120 
4121 		ret = btrfs_del_items(trans, log, path, start_slot,
4122 				      path->slots[0] - start_slot + 1);
4123 		/*
4124 		 * If start slot isn't 0 then we don't need to re-search, since
4125 		 * we've found the last item with this objectid in the tree.
4126 		 */
4127 		if (ret || start_slot != 0)
4128 			break;
4129 		btrfs_release_path(path);
4130 	}
4131 	btrfs_release_path(path);
4132 	if (ret > 0)
4133 		ret = 0;
4134 	return ret;
4135 }
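
/*
 * Sketch of how the loop in drop_inode_items() walks the log tree: searching
 * for key (objectid, max_key_type, (u64)-1) always lands just past the last
 * matching item, so we step back one slot, binary search the leaf for the
 * inode's smallest possible key (offset 0, type 0) to find start_slot, and
 * delete the whole [start_slot, slot] range at once. If start_slot isn't 0,
 * an item of a different objectid precedes it in the leaf, so no earlier
 * leaf can contain items for this inode and we can stop.
 */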
4136 
4137 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4138 				struct btrfs_root *log_root,
4139 				struct btrfs_inode *inode,
4140 				u64 new_size, u32 min_type)
4141 {
4142 	struct btrfs_truncate_control control = {
4143 		.new_size = new_size,
4144 		.ino = btrfs_ino(inode),
4145 		.min_type = min_type,
4146 		.skip_ref_updates = true,
4147 	};
4148 
4149 	return btrfs_truncate_inode_items(trans, log_root, &control);
4150 }
4151 
4152 static void fill_inode_item(struct btrfs_trans_handle *trans,
4153 			    struct extent_buffer *leaf,
4154 			    struct btrfs_inode_item *item,
4155 			    struct inode *inode, int log_inode_only,
4156 			    u64 logged_isize)
4157 {
4158 	struct btrfs_map_token token;
4159 	u64 flags;
4160 
4161 	btrfs_init_map_token(&token, leaf);
4162 
4163 	if (log_inode_only) {
4164 		/*
4165 		 * Set the generation to zero so the recovery code can tell the
4166 		 * difference between logging just to say 'this inode exists'
4167 		 * and logging to say 'update this inode with these values'.
4168 		 */
4169 		btrfs_set_token_inode_generation(&token, item, 0);
4170 		btrfs_set_token_inode_size(&token, item, logged_isize);
4171 	} else {
4172 		btrfs_set_token_inode_generation(&token, item,
4173 						 BTRFS_I(inode)->generation);
4174 		btrfs_set_token_inode_size(&token, item, inode->i_size);
4175 	}
4176 
4177 	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4178 	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4179 	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4180 	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4181 
4182 	btrfs_set_token_timespec_sec(&token, &item->atime,
4183 				     inode->i_atime.tv_sec);
4184 	btrfs_set_token_timespec_nsec(&token, &item->atime,
4185 				      inode->i_atime.tv_nsec);
4186 
4187 	btrfs_set_token_timespec_sec(&token, &item->mtime,
4188 				     inode->i_mtime.tv_sec);
4189 	btrfs_set_token_timespec_nsec(&token, &item->mtime,
4190 				      inode->i_mtime.tv_nsec);
4191 
4192 	btrfs_set_token_timespec_sec(&token, &item->ctime,
4193 				     inode->i_ctime.tv_sec);
4194 	btrfs_set_token_timespec_nsec(&token, &item->ctime,
4195 				      inode->i_ctime.tv_nsec);
4196 
4197 	/*
4198 	 * We do not need to set the nbytes field. In fact, during a fast fsync
4199 	 * its value may not even be correct, since a fast fsync does not wait
4200 	 * for ordered extent completion (which is where we update nbytes), it
4201 	 * only waits for writeback to complete. During log replay, as we find
4202 	 * file extent items and replay them, we adjust the nbytes field of the
4203 	 * inode item in the subvolume tree as needed (see overwrite_item()).
4204 	 */
4205 
4206 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4207 	btrfs_set_token_inode_transid(&token, item, trans->transid);
4208 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4209 	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4210 					  BTRFS_I(inode)->ro_flags);
4211 	btrfs_set_token_inode_flags(&token, item, flags);
4212 	btrfs_set_token_inode_block_group(&token, item, 0);
4213 }
4214 
4215 static int log_inode_item(struct btrfs_trans_handle *trans,
4216 			  struct btrfs_root *log, struct btrfs_path *path,
4217 			  struct btrfs_inode *inode, bool inode_item_dropped)
4218 {
4219 	struct btrfs_inode_item *inode_item;
4220 	int ret;
4221 
4222 	/*
4223 	 * If we are doing a fast fsync and the inode was logged before in the
4224 	 * current transaction, then we know the inode was previously logged and
4225 	 * it exists in the log tree. For performance reasons, in this case use
4226 	 * btrfs_search_slot() directly with ins_len set to 0 so that we never
4227 	 * attempt a write lock on the leaf's parent, which adds unnecessary lock
4228 	 * contention in case there are concurrent fsyncs for other inodes of the
4229 	 * same subvolume. Using btrfs_insert_empty_item() when the inode item
4230 	 * already exists can also result in unnecessarily splitting a leaf.
4231 	 */
4232 	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4233 		ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4234 		ASSERT(ret <= 0);
4235 		if (ret > 0)
4236 			ret = -ENOENT;
4237 	} else {
4238 		/*
4239 		 * This means it is the first fsync in the current transaction,
4240 		 * so the inode item is not in the log and we need to insert it.
4241 		 * We can never get -EEXIST because we are only called for a fast
4242 		 * fsync and in case an inode eviction happens after the inode was
4243 		 * logged before in the current transaction, when we load again
4244 		 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4245 		 * flags and set ->logged_trans to 0.
4246 		 */
4247 		ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4248 					      sizeof(*inode_item));
4249 		ASSERT(ret != -EEXIST);
4250 	}
4251 	if (ret)
4252 		return ret;
4253 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4254 				    struct btrfs_inode_item);
4255 	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4256 			0, 0);
4257 	btrfs_release_path(path);
4258 	return 0;
4259 }
4260 
4261 static int log_csums(struct btrfs_trans_handle *trans,
4262 		     struct btrfs_inode *inode,
4263 		     struct btrfs_root *log_root,
4264 		     struct btrfs_ordered_sum *sums)
4265 {
4266 	const u64 lock_end = sums->bytenr + sums->len - 1;
4267 	struct extent_state *cached_state = NULL;
4268 	int ret;
4269 
4270 	/*
4271 	 * If this inode was not used for reflink operations in the current
4272 	 * transaction with new extents, then do the fast path, no need to
4273 	 * worry about logging checksum items with overlapping ranges.
4274 	 */
4275 	if (inode->last_reflink_trans < trans->transid)
4276 		return btrfs_csum_file_blocks(trans, log_root, sums);
4277 
4278 	/*
4279 	 * Serialize logging for checksums. This is to avoid racing with the
4280 	 * same checksum being logged by another task that is logging another
4281 	 * file which happens to refer to the same extent as well. Such races
4282 	 * can leave checksum items in the log with overlapping ranges.
4283 	 */
4284 	ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
4285 			       lock_end, &cached_state);
4286 	if (ret)
4287 		return ret;
4288 	/*
4289 	 * Due to extent cloning, we might have logged a csum item that covers a
4290 	 * subrange of a cloned extent, and later we can end up logging a csum
4291 	 * item for a larger subrange of the same extent or the entire range.
4292 	 * This would leave csum items in the log tree that cover the same range
4293 	 * and break the searches for checksums in the log tree, resulting in
4294 	 * some checksums missing in the fs/subvolume tree. So just delete (or
4295 	 * trim and adjust) any existing csum items in the log for this range.
4296 	 */
4297 	ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
4298 	if (!ret)
4299 		ret = btrfs_csum_file_blocks(trans, log_root, sums);
4300 
4301 	unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
4302 			     &cached_state);
4303 
4304 	return ret;
4305 }
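
/*
 * Illustrative case for the csum deletion in log_csums() (hypothetical
 * ranges): if a previous fsync logged a csum item for disk range
 * [X, X + 64K) of a cloned extent, and we now need to log csums for
 * [X, X + 128K) of the same extent, deleting (or trimming) the existing
 * item first avoids ending up with two csum items in the log that both
 * cover [X, X + 64K).
 */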
4306 
4307 static noinline int copy_items(struct btrfs_trans_handle *trans,
4308 			       struct btrfs_inode *inode,
4309 			       struct btrfs_path *dst_path,
4310 			       struct btrfs_path *src_path,
4311 			       int start_slot, int nr, int inode_only,
4312 			       u64 logged_isize)
4313 {
4314 	struct btrfs_fs_info *fs_info = trans->fs_info;
4315 	unsigned long src_offset;
4316 	unsigned long dst_offset;
4317 	struct btrfs_root *log = inode->root->log_root;
4318 	struct btrfs_file_extent_item *extent;
4319 	struct btrfs_inode_item *inode_item;
4320 	struct extent_buffer *src = src_path->nodes[0];
4321 	int ret;
4322 	struct btrfs_key *ins_keys;
4323 	u32 *ins_sizes;
4324 	struct btrfs_item_batch batch;
4325 	char *ins_data;
4326 	int i;
4327 	struct list_head ordered_sums;
4328 	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
4329 
4330 	INIT_LIST_HEAD(&ordered_sums);
4331 
4332 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4333 			   nr * sizeof(u32), GFP_NOFS);
4334 	if (!ins_data)
4335 		return -ENOMEM;
4336 
4337 	ins_sizes = (u32 *)ins_data;
4338 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4339 	batch.keys = ins_keys;
4340 	batch.data_sizes = ins_sizes;
4341 	batch.total_data_size = 0;
4342 	batch.nr = nr;
4343 
4344 	for (i = 0; i < nr; i++) {
4345 		ins_sizes[i] = btrfs_item_size(src, i + start_slot);
4346 		batch.total_data_size += ins_sizes[i];
4347 		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
4348 	}
4349 	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4350 	if (ret) {
4351 		kfree(ins_data);
4352 		return ret;
4353 	}
4354 
4355 	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
4356 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
4357 						   dst_path->slots[0]);
4358 
4359 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
4360 
4361 		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
4362 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
4363 						    dst_path->slots[0],
4364 						    struct btrfs_inode_item);
4365 			fill_inode_item(trans, dst_path->nodes[0], inode_item,
4366 					&inode->vfs_inode,
4367 					inode_only == LOG_INODE_EXISTS,
4368 					logged_isize);
4369 		} else {
4370 			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4371 					   src_offset, ins_sizes[i]);
4372 		}
4373 
4374 		/*
4375 		 * Take a reference on file data extents so that truncates or
4376 		 * deletes of this inode don't have to relog the inode again.
4377 		 */
4378 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
4379 		    !skip_csum) {
4380 			int found_type;
4381 			extent = btrfs_item_ptr(src, start_slot + i,
4382 						struct btrfs_file_extent_item);
4383 
4384 			if (btrfs_file_extent_generation(src, extent) < trans->transid)
4385 				continue;
4386 
4387 			found_type = btrfs_file_extent_type(src, extent);
4388 			if (found_type == BTRFS_FILE_EXTENT_REG) {
4389 				struct btrfs_root *csum_root;
4390 				u64 ds, dl, cs, cl;
4391 				ds = btrfs_file_extent_disk_bytenr(src,
4392 								extent);
4393 				/* ds == 0 is a hole */
4394 				if (ds == 0)
4395 					continue;
4396 
4397 				dl = btrfs_file_extent_disk_num_bytes(src,
4398 								extent);
4399 				cs = btrfs_file_extent_offset(src, extent);
4400 				cl = btrfs_file_extent_num_bytes(src,
4401 								extent);
4402 				if (btrfs_file_extent_compression(src,
4403 								  extent)) {
4404 					cs = 0;
4405 					cl = dl;
4406 				}
4407 
4408 				csum_root = btrfs_csum_root(fs_info, ds);
4409 				ret = btrfs_lookup_csums_range(csum_root,
4410 						ds + cs, ds + cs + cl - 1,
4411 						&ordered_sums, 0);
4412 				if (ret)
4413 					break;
4414 			}
4415 		}
4416 	}
4417 
4418 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4419 	btrfs_release_path(dst_path);
4420 	kfree(ins_data);
4421 
4422 	/*
4423 	 * we have to do this after the loop above to avoid changing the
4424 	 * log tree while trying to change the log tree.
4425 	 */
4426 	while (!list_empty(&ordered_sums)) {
4427 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4428 						   struct btrfs_ordered_sum,
4429 						   list);
4430 		if (!ret)
4431 			ret = log_csums(trans, inode, log, sums);
4432 		list_del(&sums->list);
4433 		kfree(sums);
4434 	}
4435 
4436 	return ret;
4437 }
4438 
4439 static int extent_cmp(void *priv, const struct list_head *a,
4440 		      const struct list_head *b)
4441 {
4442 	const struct extent_map *em1, *em2;
4443 
4444 	em1 = list_entry(a, struct extent_map, list);
4445 	em2 = list_entry(b, struct extent_map, list);
4446 
4447 	if (em1->start < em2->start)
4448 		return -1;
4449 	else if (em1->start > em2->start)
4450 		return 1;
4451 	return 0;
4452 }
4453 
4454 static int log_extent_csums(struct btrfs_trans_handle *trans,
4455 			    struct btrfs_inode *inode,
4456 			    struct btrfs_root *log_root,
4457 			    const struct extent_map *em,
4458 			    struct btrfs_log_ctx *ctx)
4459 {
4460 	struct btrfs_ordered_extent *ordered;
4461 	struct btrfs_root *csum_root;
4462 	u64 csum_offset;
4463 	u64 csum_len;
4464 	u64 mod_start = em->mod_start;
4465 	u64 mod_len = em->mod_len;
4466 	LIST_HEAD(ordered_sums);
4467 	int ret = 0;
4468 
4469 	if (inode->flags & BTRFS_INODE_NODATASUM ||
4470 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4471 	    em->block_start == EXTENT_MAP_HOLE)
4472 		return 0;
4473 
4474 	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4475 		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4476 		const u64 mod_end = mod_start + mod_len;
4477 		struct btrfs_ordered_sum *sums;
4478 
4479 		if (mod_len == 0)
4480 			break;
4481 
4482 		if (ordered_end <= mod_start)
4483 			continue;
4484 		if (mod_end <= ordered->file_offset)
4485 			break;
4486 
4487 		/*
4488 		 * We are going to copy all the csums on this ordered extent, so
4489 		 * go ahead and adjust mod_start and mod_len in case this ordered
4490 		 * extent has already been logged.
4491 		 */
4492 		if (ordered->file_offset > mod_start) {
4493 			if (ordered_end >= mod_end)
4494 				mod_len = ordered->file_offset - mod_start;
4495 			/*
4496 			 * If we have this case
4497 			 *
4498 			 * |--------- logged extent ---------|
4499 			 *       |----- ordered extent ----|
4500 			 *
4501 			 * Just don't mess with mod_start and mod_len, we'll
4502 			 * just end up logging more csums than we need and it
4503 			 * will be ok.
4504 			 */
4505 		} else {
4506 			if (ordered_end < mod_end) {
4507 				mod_len = mod_end - ordered_end;
4508 				mod_start = ordered_end;
4509 			} else {
4510 				mod_len = 0;
4511 			}
4512 		}
4513 
4514 		/*
4515 		 * To keep us from looping for the above case of an ordered
4516 		 * extent that falls inside of the logged extent.
4517 		 */
4518 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4519 			continue;
4520 
4521 		list_for_each_entry(sums, &ordered->list, list) {
4522 			ret = log_csums(trans, inode, log_root, sums);
4523 			if (ret)
4524 				return ret;
4525 		}
4526 	}
4527 
4528 	/* We're done, found all csums in the ordered extents. */
4529 	if (mod_len == 0)
4530 		return 0;
4531 
4532 	/* If we're compressed we have to save the entire range of csums. */
4533 	if (em->compress_type) {
4534 		csum_offset = 0;
4535 		csum_len = max(em->block_len, em->orig_block_len);
4536 	} else {
4537 		csum_offset = mod_start - em->start;
4538 		csum_len = mod_len;
4539 	}
4540 
4541 	/* block start is already adjusted for the file extent offset. */
4542 	csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
4543 	ret = btrfs_lookup_csums_range(csum_root,
4544 				       em->block_start + csum_offset,
4545 				       em->block_start + csum_offset +
4546 				       csum_len - 1, &ordered_sums, 0);
4547 	if (ret)
4548 		return ret;
4549 
4550 	while (!list_empty(&ordered_sums)) {
4551 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4552 						   struct btrfs_ordered_sum,
4553 						   list);
4554 		if (!ret)
4555 			ret = log_csums(trans, inode, log_root, sums);
4556 		list_del(&sums->list);
4557 		kfree(sums);
4558 	}
4559 
4560 	return ret;
4561 }
4562 
4563 static int log_one_extent(struct btrfs_trans_handle *trans,
4564 			  struct btrfs_inode *inode,
4565 			  const struct extent_map *em,
4566 			  struct btrfs_path *path,
4567 			  struct btrfs_log_ctx *ctx)
4568 {
4569 	struct btrfs_drop_extents_args drop_args = { 0 };
4570 	struct btrfs_root *log = inode->root->log_root;
4571 	struct btrfs_file_extent_item *fi;
4572 	struct extent_buffer *leaf;
4573 	struct btrfs_map_token token;
4574 	struct btrfs_key key;
4575 	u64 extent_offset = em->start - em->orig_start;
4576 	u64 block_len;
4577 	int ret;
4578 
4579 	ret = log_extent_csums(trans, inode, log, em, ctx);
4580 	if (ret)
4581 		return ret;
4582 
4583 	/*
4584 	 * If this is the first time we are logging the inode in the current
4585 	 * transaction, we can avoid btrfs_drop_extents(), which is expensive
4586 	 * because it does a deletion search, which always acquires write locks
4587 	 * for extent buffers at levels 2, 1 and 0. This not only wastes time
4588 	 * but also adds significant contention in a log tree, since log trees
4589 	 * are small, with a root at level 2 or 3 at most, due to their short
4590 	 * life span.
4591 	 */
4592 	if (inode_logged(trans, inode)) {
4593 		drop_args.path = path;
4594 		drop_args.start = em->start;
4595 		drop_args.end = em->start + em->len;
4596 		drop_args.replace_extent = true;
4597 		drop_args.extent_item_size = sizeof(*fi);
4598 		ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4599 		if (ret)
4600 			return ret;
4601 	}
4602 
4603 	if (!drop_args.extent_inserted) {
4604 		key.objectid = btrfs_ino(inode);
4605 		key.type = BTRFS_EXTENT_DATA_KEY;
4606 		key.offset = em->start;
4607 
4608 		ret = btrfs_insert_empty_item(trans, log, path, &key,
4609 					      sizeof(*fi));
4610 		if (ret)
4611 			return ret;
4612 	}
4613 	leaf = path->nodes[0];
4614 	btrfs_init_map_token(&token, leaf);
4615 	fi = btrfs_item_ptr(leaf, path->slots[0],
4616 			    struct btrfs_file_extent_item);
4617 
4618 	btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
4619 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4620 		btrfs_set_token_file_extent_type(&token, fi,
4621 						 BTRFS_FILE_EXTENT_PREALLOC);
4622 	else
4623 		btrfs_set_token_file_extent_type(&token, fi,
4624 						 BTRFS_FILE_EXTENT_REG);
4625 
4626 	block_len = max(em->block_len, em->orig_block_len);
4627 	if (em->compress_type != BTRFS_COMPRESS_NONE) {
4628 		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4629 							em->block_start);
4630 		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4631 	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4632 		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4633 							em->block_start -
4634 							extent_offset);
4635 		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4636 	} else {
4637 		btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
4638 		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
4639 	}
4640 
4641 	btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
4642 	btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
4643 	btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
4644 	btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
4645 	btrfs_set_token_file_extent_encryption(&token, fi, 0);
4646 	btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
4647 	btrfs_mark_buffer_dirty(leaf);
4648 
4649 	btrfs_release_path(path);
4650 
4651 	return ret;
4652 }
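
/*
 * Note on the disk_bytenr cases in log_one_extent(), with an illustrative
 * regular (uncompressed) extent: when the file range starts in the middle of
 * the on-disk extent (em->start > em->orig_start), extent_offset is that
 * difference and em->block_start already accounts for it, so we subtract it
 * back to record the start of the on-disk extent in the file extent item.
 * For compressed extents, block_start already is the start of the on-disk
 * extent, and for holes and inline extents there is no on-disk extent at
 * all, so both disk_bytenr and disk_num_bytes are set to 0.
 */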
4653 
4654 /*
4655  * Log all prealloc extents beyond the inode's i_size to make sure we do not
4656  * lose them after doing a full/fast fsync and replaying the log. We scan the
4657  * subvolume's root instead of iterating the inode's extent map tree because
4658  * otherwise we can log incorrect extent items based on extent map conversion.
4659  * That can happen because extent maps are merged when they are not in the
4660  * extent map tree's list of modified extents.
4661  */
4662 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4663 				      struct btrfs_inode *inode,
4664 				      struct btrfs_path *path)
4665 {
4666 	struct btrfs_root *root = inode->root;
4667 	struct btrfs_key key;
4668 	const u64 i_size = i_size_read(&inode->vfs_inode);
4669 	const u64 ino = btrfs_ino(inode);
4670 	struct btrfs_path *dst_path = NULL;
4671 	bool dropped_extents = false;
4672 	u64 truncate_offset = i_size;
4673 	struct extent_buffer *leaf;
4674 	int slot;
4675 	int ins_nr = 0;
4676 	int start_slot;
4677 	int ret;
4678 
4679 	if (!(inode->flags & BTRFS_INODE_PREALLOC))
4680 		return 0;
4681 
4682 	key.objectid = ino;
4683 	key.type = BTRFS_EXTENT_DATA_KEY;
4684 	key.offset = i_size;
4685 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4686 	if (ret < 0)
4687 		goto out;
4688 
4689 	/*
4690 	 * We must check if there is a prealloc extent that starts before the
4691 	 * i_size and crosses the i_size boundary. This is to ensure later we
4692 	 * truncate down to the end of that extent and not to the i_size, as
4693 	 * otherwise we end up losing part of the prealloc extent after a log
4694 	 * replay and with an implicit hole if there is another prealloc extent
4695 	 * that starts at an offset beyond i_size.
4696 	 */
4697 	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4698 	if (ret < 0)
4699 		goto out;
4700 
4701 	if (ret == 0) {
4702 		struct btrfs_file_extent_item *ei;
4703 
4704 		leaf = path->nodes[0];
4705 		slot = path->slots[0];
4706 		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4707 
4708 		if (btrfs_file_extent_type(leaf, ei) ==
4709 		    BTRFS_FILE_EXTENT_PREALLOC) {
4710 			u64 extent_end;
4711 
4712 			btrfs_item_key_to_cpu(leaf, &key, slot);
4713 			extent_end = key.offset +
4714 				btrfs_file_extent_num_bytes(leaf, ei);
4715 
4716 			if (extent_end > i_size)
4717 				truncate_offset = extent_end;
4718 		}
4719 	} else {
4720 		ret = 0;
4721 	}
4722 
4723 	while (true) {
4724 		leaf = path->nodes[0];
4725 		slot = path->slots[0];
4726 
4727 		if (slot >= btrfs_header_nritems(leaf)) {
4728 			if (ins_nr > 0) {
4729 				ret = copy_items(trans, inode, dst_path, path,
4730 						 start_slot, ins_nr, 1, 0);
4731 				if (ret < 0)
4732 					goto out;
4733 				ins_nr = 0;
4734 			}
4735 			ret = btrfs_next_leaf(root, path);
4736 			if (ret < 0)
4737 				goto out;
4738 			if (ret > 0) {
4739 				ret = 0;
4740 				break;
4741 			}
4742 			continue;
4743 		}
4744 
4745 		btrfs_item_key_to_cpu(leaf, &key, slot);
4746 		if (key.objectid > ino)
4747 			break;
4748 		if (WARN_ON_ONCE(key.objectid < ino) ||
4749 		    key.type < BTRFS_EXTENT_DATA_KEY ||
4750 		    key.offset < i_size) {
4751 			path->slots[0]++;
4752 			continue;
4753 		}
4754 		if (!dropped_extents) {
4755 			/*
4756 			 * Avoid logging extent items logged in past fsync calls
4757 			 * and leading to duplicate keys in the log tree.
4758 			 */
4759 			ret = truncate_inode_items(trans, root->log_root, inode,
4760 						   truncate_offset,
4761 						   BTRFS_EXTENT_DATA_KEY);
4762 			if (ret)
4763 				goto out;
4764 			dropped_extents = true;
4765 		}
4766 		if (ins_nr == 0)
4767 			start_slot = slot;
4768 		ins_nr++;
4769 		path->slots[0]++;
4770 		if (!dst_path) {
4771 			dst_path = btrfs_alloc_path();
4772 			if (!dst_path) {
4773 				ret = -ENOMEM;
4774 				goto out;
4775 			}
4776 		}
4777 	}
4778 	if (ins_nr > 0)
4779 		ret = copy_items(trans, inode, dst_path, path,
4780 				 start_slot, ins_nr, 1, 0);
4781 out:
4782 	btrfs_release_path(path);
4783 	btrfs_free_path(dst_path);
4784 	return ret;
4785 }
4786 
4787 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4788 				     struct btrfs_inode *inode,
4789 				     struct btrfs_path *path,
4790 				     struct btrfs_log_ctx *ctx)
4791 {
4792 	struct btrfs_ordered_extent *ordered;
4793 	struct btrfs_ordered_extent *tmp;
4794 	struct extent_map *em, *n;
4795 	struct list_head extents;
4796 	struct extent_map_tree *tree = &inode->extent_tree;
4797 	int ret = 0;
4798 	int num = 0;
4799 
4800 	INIT_LIST_HEAD(&extents);
4801 
4802 	write_lock(&tree->lock);
4803 
4804 	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4805 		list_del_init(&em->list);
4806 		/*
4807 		 * Just an arbitrary number. Logging can get really CPU intensive
4808 		 * once we start getting a lot of extents, and once we have that
4809 		 * many extents we just want to commit the transaction since it
4810 		 * will be faster.
4811 		 */
4812 		if (++num > 32768) {
4813 			list_del_init(&tree->modified_extents);
4814 			ret = -EFBIG;
4815 			goto process;
4816 		}
4817 
4818 		if (em->generation < trans->transid)
4819 			continue;
4820 
4821 		/* We log prealloc extents beyond eof later. */
4822 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4823 		    em->start >= i_size_read(&inode->vfs_inode))
4824 			continue;
4825 
4826 		/* Need a ref to keep it from getting evicted from cache */
4827 		refcount_inc(&em->refs);
4828 		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4829 		list_add_tail(&em->list, &extents);
4830 		num++;
4831 	}
4832 
4833 	list_sort(NULL, &extents, extent_cmp);
4834 process:
4835 	while (!list_empty(&extents)) {
4836 		em = list_entry(extents.next, struct extent_map, list);
4837 
4838 		list_del_init(&em->list);
4839 
4840 		/*
4841 		 * If we had an error we just need to delete everybody from our
4842 		 * private list.
4843 		 */
4844 		if (ret) {
4845 			clear_em_logging(tree, em);
4846 			free_extent_map(em);
4847 			continue;
4848 		}
4849 
4850 		write_unlock(&tree->lock);
4851 
4852 		ret = log_one_extent(trans, inode, em, path, ctx);
4853 		write_lock(&tree->lock);
4854 		clear_em_logging(tree, em);
4855 		free_extent_map(em);
4856 	}
4857 	WARN_ON(!list_empty(&extents));
4858 	write_unlock(&tree->lock);
4859 
4860 	btrfs_release_path(path);
4861 	if (!ret)
4862 		ret = btrfs_log_prealloc_extents(trans, inode, path);
4863 	if (ret)
4864 		return ret;
4865 
4866 	/*
4867 	 * We have logged all extents successfully, now make sure the commit of
4868 	 * the current transaction waits for the ordered extents to complete
4869 	 * before it commits and wipes out the log trees, otherwise we would
4870 	 * lose data if an ordered extent completes after the transaction
4871 	 * commits and a power failure happens after the transaction commit.
4872 	 */
4873 	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4874 		list_del_init(&ordered->log_list);
4875 		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4876 
4877 		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4878 			spin_lock_irq(&inode->ordered_tree.lock);
4879 			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4880 				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4881 				atomic_inc(&trans->transaction->pending_ordered);
4882 			}
4883 			spin_unlock_irq(&inode->ordered_tree.lock);
4884 		}
4885 		btrfs_put_ordered_extent(ordered);
4886 	}
4887 
4888 	return 0;
4889 }
4890 
4891 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4892 			     struct btrfs_path *path, u64 *size_ret)
4893 {
4894 	struct btrfs_key key;
4895 	int ret;
4896 
4897 	key.objectid = btrfs_ino(inode);
4898 	key.type = BTRFS_INODE_ITEM_KEY;
4899 	key.offset = 0;
4900 
4901 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4902 	if (ret < 0) {
4903 		return ret;
4904 	} else if (ret > 0) {
4905 		*size_ret = 0;
4906 	} else {
4907 		struct btrfs_inode_item *item;
4908 
4909 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4910 				      struct btrfs_inode_item);
4911 		*size_ret = btrfs_inode_size(path->nodes[0], item);
4912 		/*
4913 		 * If the in-memory inode's i_size is smaller than the inode
4914 		 * size stored in the btree, return the inode's i_size, so
4915 		 * that we get a correct inode size after replaying the log
4916 		 * when, before a power failure, we had a shrinking truncate
4917 		 * followed by the addition of a new name (rename / new hard link).
4918 		 * Otherwise return the inode size from the btree, to avoid
4919 		 * data loss when replaying a log due to previously doing a
4920 		 * write that expands the inode's size and logging a new name
4921 		 * immediately after.
4922 		 */
4923 		if (*size_ret > inode->vfs_inode.i_size)
4924 			*size_ret = inode->vfs_inode.i_size;
4925 	}
4926 
4927 	btrfs_release_path(path);
4928 	return 0;
4929 }
4930 
4931 /*
4932  * At the moment we always log all xattrs. This is to figure out at log replay
4933  * time which xattrs must have their deletion replayed. If an xattr is missing
4934  * in the log tree and exists in the fs/subvol tree, we delete it. This is
4935  * because if an xattr is deleted, the inode is fsynced and a power failure
4936  * happens, causing the log to be replayed the next time the fs is mounted,
4937  * we want the xattr to not exist anymore (same behaviour as other filesystems
4938  * with a journal, ext3/4, xfs, f2fs, etc).
4939  */
4940 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4941 				struct btrfs_inode *inode,
4942 				struct btrfs_path *path,
4943 				struct btrfs_path *dst_path)
4944 {
4945 	struct btrfs_root *root = inode->root;
4946 	int ret;
4947 	struct btrfs_key key;
4948 	const u64 ino = btrfs_ino(inode);
4949 	int ins_nr = 0;
4950 	int start_slot = 0;
4951 	bool found_xattrs = false;
4952 
4953 	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
4954 		return 0;
4955 
4956 	key.objectid = ino;
4957 	key.type = BTRFS_XATTR_ITEM_KEY;
4958 	key.offset = 0;
4959 
4960 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4961 	if (ret < 0)
4962 		return ret;
4963 
4964 	while (true) {
4965 		int slot = path->slots[0];
4966 		struct extent_buffer *leaf = path->nodes[0];
4967 		int nritems = btrfs_header_nritems(leaf);
4968 
4969 		if (slot >= nritems) {
4970 			if (ins_nr > 0) {
4971 				ret = copy_items(trans, inode, dst_path, path,
4972 						 start_slot, ins_nr, 1, 0);
4973 				if (ret < 0)
4974 					return ret;
4975 				ins_nr = 0;
4976 			}
4977 			ret = btrfs_next_leaf(root, path);
4978 			if (ret < 0)
4979 				return ret;
4980 			else if (ret > 0)
4981 				break;
4982 			continue;
4983 		}
4984 
4985 		btrfs_item_key_to_cpu(leaf, &key, slot);
4986 		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4987 			break;
4988 
4989 		if (ins_nr == 0)
4990 			start_slot = slot;
4991 		ins_nr++;
4992 		path->slots[0]++;
4993 		found_xattrs = true;
4994 		cond_resched();
4995 	}
4996 	if (ins_nr > 0) {
4997 		ret = copy_items(trans, inode, dst_path, path,
4998 				 start_slot, ins_nr, 1, 0);
4999 		if (ret < 0)
5000 			return ret;
5001 	}
5002 
5003 	if (!found_xattrs)
5004 		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5005 
5006 	return 0;
5007 }
5008 
5009 /*
5010  * When using the NO_HOLES feature if we punched a hole that causes the
5011  * deletion of entire leafs or all the extent items of the first leaf (the one
5012  * that contains the inode item and references) we may end up not processing
5013  * any extents, because there are no leafs with a generation matching the
5014  * current transaction that have extent items for our inode. So we need to find
5015  * if any holes exist and then log them. We also need to log holes after any
5016  * truncate operation that changes the inode's size.
5017  */
5018 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5019 			   struct btrfs_inode *inode,
5020 			   struct btrfs_path *path)
5021 {
5022 	struct btrfs_root *root = inode->root;
5023 	struct btrfs_fs_info *fs_info = root->fs_info;
5024 	struct btrfs_key key;
5025 	const u64 ino = btrfs_ino(inode);
5026 	const u64 i_size = i_size_read(&inode->vfs_inode);
5027 	u64 prev_extent_end = 0;
5028 	int ret;
5029 
5030 	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5031 		return 0;
5032 
5033 	key.objectid = ino;
5034 	key.type = BTRFS_EXTENT_DATA_KEY;
5035 	key.offset = 0;
5036 
5037 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5038 	if (ret < 0)
5039 		return ret;
5040 
5041 	while (true) {
5042 		struct extent_buffer *leaf = path->nodes[0];
5043 
5044 		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5045 			ret = btrfs_next_leaf(root, path);
5046 			if (ret < 0)
5047 				return ret;
5048 			if (ret > 0) {
5049 				ret = 0;
5050 				break;
5051 			}
5052 			leaf = path->nodes[0];
5053 		}
5054 
5055 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5056 		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5057 			break;
5058 
5059 		/* We have a hole, log it. */
5060 		if (prev_extent_end < key.offset) {
5061 			const u64 hole_len = key.offset - prev_extent_end;
5062 
5063 			/*
5064 			 * Release the path to avoid deadlocks with other code
5065 			 * paths that search the root while holding locks on
5066 			 * leafs from the log root.
5067 			 */
5068 			btrfs_release_path(path);
5069 			ret = btrfs_insert_file_extent(trans, root->log_root,
5070 						       ino, prev_extent_end, 0,
5071 						       0, hole_len, 0, hole_len,
5072 						       0, 0, 0);
5073 			if (ret < 0)
5074 				return ret;
5075 
5076 			/*
5077 			 * Search for the same key again in the root. Since it's
5078 			 * an extent item and we are holding the inode lock, the
5079 			 * key must still exist. If it doesn't, just emit a warning
5080 			 * and return an error so we fall back to a transaction
5081 			 * commit.
5082 			 */
5083 			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5084 			if (ret < 0)
5085 				return ret;
5086 			if (WARN_ON(ret > 0))
5087 				return -ENOENT;
5088 			leaf = path->nodes[0];
5089 		}
5090 
5091 		prev_extent_end = btrfs_file_extent_end(path);
5092 		path->slots[0]++;
5093 		cond_resched();
5094 	}
5095 
5096 	if (prev_extent_end < i_size) {
5097 		u64 hole_len;
5098 
5099 		btrfs_release_path(path);
5100 		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5101 		ret = btrfs_insert_file_extent(trans, root->log_root,
5102 					       ino, prev_extent_end, 0, 0,
5103 					       hole_len, 0, hole_len,
5104 					       0, 0, 0);
5105 		if (ret < 0)
5106 			return ret;
5107 	}
5108 
5109 	return 0;
5110 }
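
/*
 * Example for the hole logging above (hypothetical layout, 4K sectorsize):
 * with extents at file offsets [0, 4K) and [8K, 12K) and an i_size of 15K,
 * we insert one hole file extent item at offset 4K with a length of 4K and,
 * after the loop, a tail hole at offset 12K with a length of 4K (the 3K tail
 * rounded up to the sector size).
 */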
5111 
5112 /*
5113  * When we are logging a new inode X, check if it has a reference that matches
5114  * the reference from some other inode Y that was created in a past transaction
5115  * and renamed in the current transaction. If we don't do this, then at
5116  * log replay time we can lose inode Y (and all its files if it's a directory):
5117  *
5118  * mkdir /mnt/x
5119  * echo "hello world" > /mnt/x/foobar
5120  * sync
5121  * mv /mnt/x /mnt/y
5122  * mkdir /mnt/x                 # or touch /mnt/x
5123  * xfs_io -c fsync /mnt/x
5124  * <power fail>
5125  * mount fs, trigger log replay
5126  *
5127  * After the log replay procedure, we would lose the first directory and all its
5128  * files (file foobar).
5129  * For the case where inode Y is not a directory we simply end up losing it:
5130  *
5131  * echo "123" > /mnt/foo
5132  * sync
5133  * mv /mnt/foo /mnt/bar
5134  * echo "abc" > /mnt/foo
5135  * xfs_io -c fsync /mnt/foo
5136  * <power fail>
5137  *
5138  * We also need this for cases where a snapshot entry is replaced by some other
5139  * entry (file or directory) otherwise we end up with an unreplayable log due to
5140  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5141  * if it were a regular entry:
5142  *
5143  * mkdir /mnt/x
5144  * btrfs subvolume snapshot /mnt /mnt/x/snap
5145  * btrfs subvolume delete /mnt/x/snap
5146  * rmdir /mnt/x
5147  * mkdir /mnt/x
5148  * fsync /mnt/x or fsync some new file inside it
5149  * <power fail>
5150  *
5151  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5152  * the same transaction.
5153  */
5154 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5155 					 const int slot,
5156 					 const struct btrfs_key *key,
5157 					 struct btrfs_inode *inode,
5158 					 u64 *other_ino, u64 *other_parent)
5159 {
5160 	int ret;
5161 	struct btrfs_path *search_path;
5162 	char *name = NULL;
5163 	u32 name_len = 0;
5164 	u32 item_size = btrfs_item_size(eb, slot);
5165 	u32 cur_offset = 0;
5166 	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5167 
5168 	search_path = btrfs_alloc_path();
5169 	if (!search_path)
5170 		return -ENOMEM;
5171 	search_path->search_commit_root = 1;
5172 	search_path->skip_locking = 1;
5173 
5174 	while (cur_offset < item_size) {
5175 		u64 parent;
5176 		u32 this_name_len;
5177 		u32 this_len;
5178 		unsigned long name_ptr;
5179 		struct btrfs_dir_item *di;
5180 
5181 		if (key->type == BTRFS_INODE_REF_KEY) {
5182 			struct btrfs_inode_ref *iref;
5183 
5184 			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5185 			parent = key->offset;
5186 			this_name_len = btrfs_inode_ref_name_len(eb, iref);
5187 			name_ptr = (unsigned long)(iref + 1);
5188 			this_len = sizeof(*iref) + this_name_len;
5189 		} else {
5190 			struct btrfs_inode_extref *extref;
5191 
5192 			extref = (struct btrfs_inode_extref *)(ptr +
5193 							       cur_offset);
5194 			parent = btrfs_inode_extref_parent(eb, extref);
5195 			this_name_len = btrfs_inode_extref_name_len(eb, extref);
5196 			name_ptr = (unsigned long)&extref->name;
5197 			this_len = sizeof(*extref) + this_name_len;
5198 		}
5199 
5200 		if (this_name_len > name_len) {
5201 			char *new_name;
5202 
5203 			new_name = krealloc(name, this_name_len, GFP_NOFS);
5204 			if (!new_name) {
5205 				ret = -ENOMEM;
5206 				goto out;
5207 			}
5208 			name_len = this_name_len;
5209 			name = new_name;
5210 		}
5211 
5212 		read_extent_buffer(eb, name, name_ptr, this_name_len);
5213 		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5214 				parent, name, this_name_len, 0);
5215 		if (di && !IS_ERR(di)) {
5216 			struct btrfs_key di_key;
5217 
5218 			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5219 						  di, &di_key);
5220 			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5221 				if (di_key.objectid != key->objectid) {
5222 					ret = 1;
5223 					*other_ino = di_key.objectid;
5224 					*other_parent = parent;
5225 				} else {
5226 					ret = 0;
5227 				}
5228 			} else {
5229 				ret = -EAGAIN;
5230 			}
5231 			goto out;
5232 		} else if (IS_ERR(di)) {
5233 			ret = PTR_ERR(di);
5234 			goto out;
5235 		}
5236 		btrfs_release_path(search_path);
5237 
5238 		cur_offset += this_len;
5239 	}
5240 	ret = 0;
5241 out:
5242 	btrfs_free_path(search_path);
5243 	kfree(name);
5244 	return ret;
5245 }
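
/*
 * Return convention of the helper above: 1 means a conflicting name was found
 * (with *other_ino and *other_parent set to the conflicting inode and its
 * parent directory), 0 means no conflict, -EAGAIN means the name matched a
 * dir item that points to a root (a snapshot entry, so not an inode item),
 * and any other negative value is an error.
 */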
5246 
5247 struct btrfs_ino_list {
5248 	u64 ino;
5249 	u64 parent;
5250 	struct list_head list;
5251 };
5252 
5253 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5254 				  struct btrfs_root *root,
5255 				  struct btrfs_path *path,
5256 				  struct btrfs_log_ctx *ctx,
5257 				  u64 ino, u64 parent)
5258 {
5259 	struct btrfs_ino_list *ino_elem;
5260 	LIST_HEAD(inode_list);
5261 	int ret = 0;
5262 
5263 	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5264 	if (!ino_elem)
5265 		return -ENOMEM;
5266 	ino_elem->ino = ino;
5267 	ino_elem->parent = parent;
5268 	list_add_tail(&ino_elem->list, &inode_list);
5269 
5270 	while (!list_empty(&inode_list)) {
5271 		struct btrfs_fs_info *fs_info = root->fs_info;
5272 		struct btrfs_key key;
5273 		struct inode *inode;
5274 
5275 		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
5276 					    list);
5277 		ino = ino_elem->ino;
5278 		parent = ino_elem->parent;
5279 		list_del(&ino_elem->list);
5280 		kfree(ino_elem);
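		/*
		 * Once an error happened we only keep looping in order to
		 * drain and free the entries still queued on the list.
		 */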
5281 		if (ret)
5282 			continue;
5283 
5284 		btrfs_release_path(path);
5285 
5286 		inode = btrfs_iget(fs_info->sb, ino, root);
5287 		/*
5288 		 * If the other inode that had a conflicting dir entry was
5289 		 * deleted in the current transaction, we need to log its parent
5290 		 * directory.
5291 		 */
5292 		if (IS_ERR(inode)) {
5293 			ret = PTR_ERR(inode);
5294 			if (ret == -ENOENT) {
5295 				inode = btrfs_iget(fs_info->sb, parent, root);
5296 				if (IS_ERR(inode)) {
5297 					ret = PTR_ERR(inode);
5298 				} else {
5299 					ret = btrfs_log_inode(trans,
5300 						      BTRFS_I(inode),
5301 						      LOG_OTHER_INODE_ALL,
5302 						      ctx);
5303 					btrfs_add_delayed_iput(inode);
5304 				}
5305 			}
5306 			continue;
5307 		}
5308 		/*
5309 		 * If the inode was already logged skip it - otherwise we can
5310 		 * hit an infinite loop. Example:
5311 		 *
5312 		 * From the commit root (previous transaction) we have the
5313 		 * following inodes:
5314 		 *
5315 		 * inode 257 a directory
5316 		 * inode 258 with references "zz" and "zz_link" on inode 257
5317 		 * inode 259 with reference "a" on inode 257
5318 		 *
5319 		 * And in the current (uncommitted) transaction we have:
5320 		 *
5321 		 * inode 257 a directory, unchanged
5322 		 * inode 258 with references "a" and "a2" on inode 257
5323 		 * inode 259 with reference "zz_link" on inode 257
5324 		 * inode 261 with reference "zz" on inode 257
5325 		 *
5326 		 * When logging inode 261 the following infinite loop could
5327 		 * happen if we don't skip already logged inodes:
5328 		 *
5329 		 * - we detect inode 258 as a conflicting inode, with inode 261
5330 		 *   on reference "zz", and log it;
5331 		 *
5332 		 * - we detect inode 259 as a conflicting inode, with inode 258
5333 		 *   on reference "a", and log it;
5334 		 *
5335 		 * - we detect inode 258 as a conflicting inode, with inode 259
5336 		 *   on reference "zz_link", and log it - again! After this we
5337 		 *   repeat the above steps forever.
5338 		 */
5339 		spin_lock(&BTRFS_I(inode)->lock);
5340 		/*
5341 		 * Check the inode's logged_trans only instead of
5342 		 * btrfs_inode_in_log(). This is because the last_log_commit of
5343 		 * the inode is not updated when we only log that it exists (see
5344 		 * btrfs_log_inode()).
5345 		 */
5346 		if (BTRFS_I(inode)->logged_trans == trans->transid) {
5347 			spin_unlock(&BTRFS_I(inode)->lock);
5348 			btrfs_add_delayed_iput(inode);
5349 			continue;
5350 		}
5351 		spin_unlock(&BTRFS_I(inode)->lock);
5352 		/*
5353 		 * We are safe logging the other inode without acquiring its
5354 		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5355 		 * are safe against concurrent renames of the other inode as
5356 		 * well because during a rename we pin the log and update the
5357 		 * log with the new name before we unpin it.
5358 		 */
5359 		ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
5360 		if (ret) {
5361 			btrfs_add_delayed_iput(inode);
5362 			continue;
5363 		}
5364 
5365 		key.objectid = ino;
5366 		key.type = BTRFS_INODE_REF_KEY;
5367 		key.offset = 0;
5368 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5369 		if (ret < 0) {
5370 			btrfs_add_delayed_iput(inode);
5371 			continue;
5372 		}
5373 
5374 		while (true) {
5375 			struct extent_buffer *leaf = path->nodes[0];
5376 			int slot = path->slots[0];
5377 			u64 other_ino = 0;
5378 			u64 other_parent = 0;
5379 
5380 			if (slot >= btrfs_header_nritems(leaf)) {
5381 				ret = btrfs_next_leaf(root, path);
5382 				if (ret < 0) {
5383 					break;
5384 				} else if (ret > 0) {
5385 					ret = 0;
5386 					break;
5387 				}
5388 				continue;
5389 			}
5390 
5391 			btrfs_item_key_to_cpu(leaf, &key, slot);
5392 			if (key.objectid != ino ||
5393 			    (key.type != BTRFS_INODE_REF_KEY &&
5394 			     key.type != BTRFS_INODE_EXTREF_KEY)) {
5395 				ret = 0;
5396 				break;
5397 			}
5398 
5399 			ret = btrfs_check_ref_name_override(leaf, slot, &key,
5400 					BTRFS_I(inode), &other_ino,
5401 					&other_parent);
5402 			if (ret < 0)
5403 				break;
5404 			if (ret > 0) {
5405 				ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5406 				if (!ino_elem) {
5407 					ret = -ENOMEM;
5408 					break;
5409 				}
5410 				ino_elem->ino = other_ino;
5411 				ino_elem->parent = other_parent;
5412 				list_add_tail(&ino_elem->list, &inode_list);
5413 				ret = 0;
5414 			}
5415 			path->slots[0]++;
5416 		}
5417 		btrfs_add_delayed_iput(inode);
5418 	}
5419 
5420 	return ret;
5421 }
5422 
5423 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5424 				   struct btrfs_inode *inode,
5425 				   struct btrfs_key *min_key,
5426 				   const struct btrfs_key *max_key,
5427 				   struct btrfs_path *path,
5428 				   struct btrfs_path *dst_path,
5429 				   const u64 logged_isize,
5430 				   const bool recursive_logging,
5431 				   const int inode_only,
5432 				   struct btrfs_log_ctx *ctx,
5433 				   bool *need_log_inode_item)
5434 {
5435 	const u64 i_size = i_size_read(&inode->vfs_inode);
5436 	struct btrfs_root *root = inode->root;
5437 	int ins_start_slot = 0;
5438 	int ins_nr = 0;
5439 	int ret;
5440 
5441 	while (1) {
5442 		ret = btrfs_search_forward(root, min_key, path, trans->transid);
5443 		if (ret < 0)
5444 			return ret;
5445 		if (ret > 0) {
5446 			ret = 0;
5447 			break;
5448 		}
5449 again:
5450 		/* Note, ins_nr might be > 0 here, cleanup outside the loop */
5451 		if (min_key->objectid != max_key->objectid)
5452 			break;
5453 		if (min_key->type > max_key->type)
5454 			break;
5455 
5456 		if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5457 			*need_log_inode_item = false;
5458 		} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5459 			   min_key->offset >= i_size) {
5460 			/*
5461 			 * Extents at and beyond eof are logged with
5462 			 * btrfs_log_prealloc_extents().
5463 			 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5464 			 * and no keys greater than that, so bail out.
5465 			 */
5466 			break;
5467 		} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5468 			    min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5469 			   inode->generation == trans->transid &&
5470 			   !recursive_logging) {
5471 			u64 other_ino = 0;
5472 			u64 other_parent = 0;
5473 
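			/*
			 * A return value > 0 from the check below means some
			 * other inode currently owns a dir entry with one of
			 * our names under the same parent. In that case flush
			 * the batch of pending items and log the conflicting
			 * inode (which may recursively find further conflicts)
			 * before moving on.
			 */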
5474 			ret = btrfs_check_ref_name_override(path->nodes[0],
5475 					path->slots[0], min_key, inode,
5476 					&other_ino, &other_parent);
5477 			if (ret < 0) {
5478 				return ret;
5479 			} else if (ret > 0 &&
5480 				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5481 				if (ins_nr > 0) {
5482 					ins_nr++;
5483 				} else {
5484 					ins_nr = 1;
5485 					ins_start_slot = path->slots[0];
5486 				}
5487 				ret = copy_items(trans, inode, dst_path, path,
5488 						 ins_start_slot, ins_nr,
5489 						 inode_only, logged_isize);
5490 				if (ret < 0)
5491 					return ret;
5492 				ins_nr = 0;
5493 
5494 				ret = log_conflicting_inodes(trans, root, path,
5495 						ctx, other_ino, other_parent);
5496 				if (ret)
5497 					return ret;
5498 				btrfs_release_path(path);
5499 				goto next_key;
5500 			}
5501 		} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5502 			/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5503 			if (ins_nr == 0)
5504 				goto next_slot;
5505 			ret = copy_items(trans, inode, dst_path, path,
5506 					 ins_start_slot,
5507 					 ins_nr, inode_only, logged_isize);
5508 			if (ret < 0)
5509 				return ret;
5510 			ins_nr = 0;
5511 			goto next_slot;
5512 		}
5513 
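		/*
		 * Batch contiguous leaf slots so that they are copied to the
		 * log tree with a single copy_items() call: grow the current
		 * batch if this slot is adjacent to it, start a new batch if
		 * there is none, otherwise flush the batch and start a new
		 * one at the current slot.
		 */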
5514 		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5515 			ins_nr++;
5516 			goto next_slot;
5517 		} else if (!ins_nr) {
5518 			ins_start_slot = path->slots[0];
5519 			ins_nr = 1;
5520 			goto next_slot;
5521 		}
5522 
5523 		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5524 				 ins_nr, inode_only, logged_isize);
5525 		if (ret < 0)
5526 			return ret;
5527 		ins_nr = 1;
5528 		ins_start_slot = path->slots[0];
5529 next_slot:
5530 		path->slots[0]++;
5531 		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5532 			btrfs_item_key_to_cpu(path->nodes[0], min_key,
5533 					      path->slots[0]);
5534 			goto again;
5535 		}
5536 		if (ins_nr) {
5537 			ret = copy_items(trans, inode, dst_path, path,
5538 					 ins_start_slot, ins_nr, inode_only,
5539 					 logged_isize);
5540 			if (ret < 0)
5541 				return ret;
5542 			ins_nr = 0;
5543 		}
5544 		btrfs_release_path(path);
5545 next_key:
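		/*
		 * Advance to the smallest key greater than the last one we
		 * processed: bump the offset, else move to the next key type
		 * at offset 0, else we went past max_key and are done.
		 */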
5546 		if (min_key->offset < (u64)-1) {
5547 			min_key->offset++;
5548 		} else if (min_key->type < max_key->type) {
5549 			min_key->type++;
5550 			min_key->offset = 0;
5551 		} else {
5552 			break;
5553 		}
5554 	}
5555 	if (ins_nr) {
5556 		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5557 				 ins_nr, inode_only, logged_isize);
5558 		if (ret)
5559 			return ret;
5560 	}
5561 
5562 	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5563 		/*
5564 		 * Release the path because otherwise we might attempt to double
5565 		 * lock the same leaf with btrfs_log_prealloc_extents() below.
5566 		 */
5567 		btrfs_release_path(path);
5568 		ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5569 	}
5570 
5571 	return ret;
5572 }
5573 
5574 /*
 * Log a single inode in the tree log.
5575  * At least one parent directory for this inode must exist in the tree
5576  * or be logged already.
5577  *
5578  * Any items from this inode changed by the current transaction are copied
5579  * to the log tree.  An extra reference is taken on any extents in this
5580  * file, allowing us to avoid a whole pile of corner cases around logging
5581  * blocks that have been removed from the tree.
5582  *
5583  * See LOG_INODE_ALL and related defines for a description of what inode_only
5584  * does.
5585  *
5586  * This handles both files and directories.
5587  */
5588 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5589 			   struct btrfs_inode *inode,
5590 			   int inode_only,
5591 			   struct btrfs_log_ctx *ctx)
5592 {
5593 	struct btrfs_path *path;
5594 	struct btrfs_path *dst_path;
5595 	struct btrfs_key min_key;
5596 	struct btrfs_key max_key;
5597 	struct btrfs_root *log = inode->root->log_root;
5598 	int err = 0;
5599 	int ret = 0;
5600 	bool fast_search = false;
5601 	u64 ino = btrfs_ino(inode);
5602 	struct extent_map_tree *em_tree = &inode->extent_tree;
5603 	u64 logged_isize = 0;
5604 	bool need_log_inode_item = true;
5605 	bool xattrs_logged = false;
5606 	bool recursive_logging = false;
5607 	bool inode_item_dropped = true;
5608 
5609 	path = btrfs_alloc_path();
5610 	if (!path)
5611 		return -ENOMEM;
5612 	dst_path = btrfs_alloc_path();
5613 	if (!dst_path) {
5614 		btrfs_free_path(path);
5615 		return -ENOMEM;
5616 	}
5617 
5618 	min_key.objectid = ino;
5619 	min_key.type = BTRFS_INODE_ITEM_KEY;
5620 	min_key.offset = 0;
5621 
5622 	max_key.objectid = ino;
5623 
5625 	/* today the code can only do partial logging of directories */
5626 	if (S_ISDIR(inode->vfs_inode.i_mode) ||
5627 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5628 		       &inode->runtime_flags) &&
5629 	     inode_only >= LOG_INODE_EXISTS))
5630 		max_key.type = BTRFS_XATTR_ITEM_KEY;
5631 	else
5632 		max_key.type = (u8)-1;
5633 	max_key.offset = (u64)-1;
5634 
5635 	/*
5636 	 * Only run delayed items if we are a directory. We want to make sure
5637 	 * all directory indexes hit the fs/subvolume tree so we can find them
5638 	 * and figure out which index ranges have to be logged.
5639 	 */
5640 	if (S_ISDIR(inode->vfs_inode.i_mode)) {
5641 		err = btrfs_commit_inode_delayed_items(trans, inode);
5642 		if (err)
5643 			goto out;
5644 	}
5645 
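	/*
	 * LOG_OTHER_INODE and LOG_OTHER_INODE_ALL mean we arrived here from
	 * log_conflicting_inodes() while holding the log_mutex of another
	 * inode, so take our own log_mutex with a nested annotation to avoid
	 * a false positive lockdep report, and map the mode back to the
	 * regular LOG_INODE_EXISTS / LOG_INODE_ALL semantics.
	 */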
5646 	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5647 		recursive_logging = true;
5648 		if (inode_only == LOG_OTHER_INODE)
5649 			inode_only = LOG_INODE_EXISTS;
5650 		else
5651 			inode_only = LOG_INODE_ALL;
5652 		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5653 	} else {
5654 		mutex_lock(&inode->log_mutex);
5655 	}
5656 
5657 	/*
5658 	 * This is for cases where logging a directory could result in losing
5659 	 * a file after replaying the log. For example, if we move a file from
5660 	 * directory A to directory B, then fsync directory A, we have no way
5661 	 * to know the file was moved from A to B, so logging just A would
5662 	 * result in losing the file after a log replay.
5663 	 */
5664 	if (S_ISDIR(inode->vfs_inode.i_mode) &&
5665 	    inode_only == LOG_INODE_ALL &&
5666 	    inode->last_unlink_trans >= trans->transid) {
5667 		btrfs_set_log_full_commit(trans);
5668 		err = 1;
5669 		goto out_unlock;
5670 	}
5671 
5672 	/*
5673 	 * a brute force approach to making sure we get the most up-to-date
5674 	 * copies of everything.
5675 	 */
5676 	if (S_ISDIR(inode->vfs_inode.i_mode)) {
5677 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5678 
5679 		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
5680 		if (inode_only == LOG_INODE_EXISTS)
5681 			max_key_type = BTRFS_XATTR_ITEM_KEY;
5682 		ret = drop_inode_items(trans, log, path, inode, max_key_type);
5683 	} else {
5684 		if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
5685 			/*
5686 			 * Make sure the new inode item we write to the log has
5687 			 * the same isize as the current one (if it exists).
5688 			 * This is necessary to prevent data loss after log
5689 			 * replay, and also to prevent doing a wrong expanding
5690 			 * truncate - e.g. create a file, write 4K into offset
5691 			 * 0, fsync, write 4K into offset 4096, add hard link,
5692 			 * fsync some other file (to sync log), power fail - if
5693 			 * we use the inode's current i_size, after log replay
5694 			 * we get an 8Kb file, with the last 4Kb extent as a hole
5695 			 * (zeroes), as if an expanding truncate happened,
5696 			 * instead of getting a file of 4Kb only.
5697 			 */
5698 			err = logged_inode_size(log, inode, path, &logged_isize);
5699 			if (err)
5700 				goto out_unlock;
5701 		}
5702 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5703 			     &inode->runtime_flags)) {
5704 			if (inode_only == LOG_INODE_EXISTS) {
5705 				max_key.type = BTRFS_XATTR_ITEM_KEY;
5706 				ret = drop_inode_items(trans, log, path, inode,
5707 						       max_key.type);
5708 			} else {
5709 				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5710 					  &inode->runtime_flags);
5711 				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5712 					  &inode->runtime_flags);
5713 				if (inode_logged(trans, inode))
5714 					ret = truncate_inode_items(trans, log,
5715 								   inode, 0, 0);
5716 			}
5717 		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5718 					      &inode->runtime_flags) ||
5719 			   inode_only == LOG_INODE_EXISTS) {
5720 			if (inode_only == LOG_INODE_ALL)
5721 				fast_search = true;
5722 			max_key.type = BTRFS_XATTR_ITEM_KEY;
5723 			ret = drop_inode_items(trans, log, path, inode,
5724 					       max_key.type);
5725 		} else {
5726 			if (inode_only == LOG_INODE_ALL)
5727 				fast_search = true;
5728 			inode_item_dropped = false;
5729 			goto log_extents;
5730 		}
5731 
5732 	}
5733 	if (ret) {
5734 		err = ret;
5735 		goto out_unlock;
5736 	}
5737 
5738 	err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5739 				      path, dst_path, logged_isize,
5740 				      recursive_logging, inode_only, ctx,
5741 				      &need_log_inode_item);
5742 	if (err)
5743 		goto out_unlock;
5744 
5745 	btrfs_release_path(path);
5746 	btrfs_release_path(dst_path);
5747 	err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
5748 	if (err)
5749 		goto out_unlock;
5750 	xattrs_logged = true;
5751 	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5752 		btrfs_release_path(path);
5753 		btrfs_release_path(dst_path);
5754 		err = btrfs_log_holes(trans, inode, path);
5755 		if (err)
5756 			goto out_unlock;
5757 	}
5758 log_extents:
5759 	btrfs_release_path(path);
5760 	btrfs_release_path(dst_path);
5761 	if (need_log_inode_item) {
5762 		err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
5763 		if (err)
5764 			goto out_unlock;
5765 		/*
5766 		 * If we are doing a fast fsync and the inode was logged before
5767 		 * in this transaction, we don't need to log the xattrs because
5768 		 * they were logged before. If xattrs were added, changed or
5769 		 * deleted since the last time we logged the inode, then we have
5770 		 * already logged them because the inode had the runtime flag
5771 		 * BTRFS_INODE_COPY_EVERYTHING set.
5772 		 */
5773 		if (!xattrs_logged && inode->logged_trans < trans->transid) {
5774 			err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
5775 			if (err)
5776 				goto out_unlock;
5777 			btrfs_release_path(path);
5778 		}
5779 	}
5780 	if (fast_search) {
5781 		ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
5782 		if (ret) {
5783 			err = ret;
5784 			goto out_unlock;
5785 		}
5786 	} else if (inode_only == LOG_INODE_ALL) {
5787 		struct extent_map *em, *n;
5788 
5789 		write_lock(&em_tree->lock);
5790 		list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
5791 			list_del_init(&em->list);
5792 		write_unlock(&em_tree->lock);
5793 	}
5794 
5795 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5796 		ret = log_directory_changes(trans, inode, path, dst_path, ctx);
5797 		if (ret) {
5798 			err = ret;
5799 			goto out_unlock;
5800 		}
5801 	}
5802 
5803 	spin_lock(&inode->lock);
5804 	inode->logged_trans = trans->transid;
5805 	/*
5806 	 * Don't update last_log_commit if we logged that an inode exists.
5807 	 * We do this for three reasons:
5808 	 *
5809 	 * 1) We might have had buffered writes to this inode that were
5810 	 *    flushed and had their ordered extents completed in this
5811 	 *    transaction, but we did not previously log the inode with
5812 	 *    LOG_INODE_ALL. Later the inode was evicted and after that
5813 	 *    it was loaded again and this LOG_INODE_EXISTS log operation
5814 	 *    happened. We must make sure that if an explicit fsync against
5815 	 *    the inode is performed later, it logs the new extents, an
5816 	 *    updated inode item, etc, and syncs the log. The same logic
5817 	 *    applies to direct IO writes instead of buffered writes.
5818 	 *
5819 	 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
5820 	 *    is logged with an i_size of 0 or whatever value was logged
5821 	 *    before. If later the i_size of the inode is increased by a
5822 	 *    truncate operation, the log is synced through an fsync of
5823 	 *    some other inode and then finally an explicit fsync against
5824 	 *    this inode is made, we must make sure this fsync logs the
5825 	 *    inode with the new i_size, the hole between old i_size and
5826 	 *    the new i_size, and syncs the log.
5827 	 *
5828 	 * 3) If we are logging that an ancestor inode exists as part of
5829 	 *    logging a new name from a link or rename operation, don't update
5830 	 *    its last_log_commit - otherwise if an explicit fsync is made
5831 	 *    against an ancestor, the fsync considers the inode in the log
5832 	 *    and doesn't sync the log, resulting in the ancestor missing after
5833 	 *    a power failure unless the log was synced as part of an fsync
5834 	 *    against any other unrelated inode.
5835 	 */
5836 	if (inode_only != LOG_INODE_EXISTS)
5837 		inode->last_log_commit = inode->last_sub_trans;
5838 	spin_unlock(&inode->lock);
5839 out_unlock:
5840 	mutex_unlock(&inode->log_mutex);
5841 out:
5842 	btrfs_free_path(path);
5843 	btrfs_free_path(dst_path);
5844 	return err;
5845 }
5846 
5847 /*
5848  * Check if we need to log an inode. This is used in contexts where, while
5849  * logging an inode, we need to log another inode (either that it exists or
5850  * in full mode). This is used instead of btrfs_inode_in_log() because the latter
5851  * requires the inode to be in the log and have the log transaction committed,
5852  * while here we do not care if the log transaction was already committed - our
5853  * caller will commit the log later - and we want to avoid logging an inode
5854  * multiple times when multiple tasks have joined the same log transaction.
5855  */
5856 static bool need_log_inode(struct btrfs_trans_handle *trans,
5857 			   struct btrfs_inode *inode)
5858 {
5859 	/*
5860 	 * If a directory was not modified, no dentries added or removed, we can
5861 	 * and should avoid logging it.
5862 	 */
5863 	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5864 		return false;
5865 
5866 	/*
5867 	 * If this inode does not have new/updated/deleted xattrs since the last
5868 	 * time it was logged and is flagged as logged in the current transaction,
5869 	 * we can skip logging it. As for new/deleted names, those are updated in
5870 	 * the log by link/unlink/rename operations.
5871 	 * In case the inode was logged and then evicted and reloaded, its
5872 	 * logged_trans will be 0, in which case we have to fully log it since
5873 	 * logged_trans is a transient field, not persisted.
5874 	 */
5875 	if (inode->logged_trans == trans->transid &&
5876 	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5877 		return false;
5878 
5879 	return true;
5880 }
5881 
5882 struct btrfs_dir_list {
5883 	u64 ino;
5884 	struct list_head list;
5885 };
5886 
5887 /*
5888  * Log the inodes of the new dentries of a directory. See log_dir_items() for
5889  * details about why it is needed.
5890  * This is a recursive operation - if an existing dentry corresponds to a
5891  * directory, that directory's new entries are logged too (same behaviour as
5892  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5893  * the dentries point to we do not lock their i_mutex, otherwise lockdep
5894  * complains about the following circular lock dependency / possible deadlock:
5895  *
5896  *        CPU0                                        CPU1
5897  *        ----                                        ----
5898  * lock(&type->i_mutex_dir_key#3/2);
5899  *                                            lock(sb_internal#2);
5900  *                                            lock(&type->i_mutex_dir_key#3/2);
5901  * lock(&sb->s_type->i_mutex_key#14);
5902  *
5903  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5904  * sb_start_intwrite() in btrfs_start_transaction().
5905  * Not locking i_mutex of the inodes is still safe because:
5906  *
5907  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5908  *    that while logging the inode new references (names) are added or removed
5909  *    from the inode, leaving the logged inode item with a link count that does
5910  *    not match the number of logged inode reference items. This is fine because
5911  *    at log replay time we compute the real number of links and correct the
5912  *    link count in the inode item (see replay_one_buffer() and
5913  *    link_to_fixup_dir());
5914  *
5915  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5916  *    while logging the inode's items new index items (key type
5917  *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5918  *    has a size that doesn't match the sum of the lengths of all the logged
5919  *    names - this is ok, not a problem, because at log replay time we set the
5920  *    directory's i_size to the correct value (see replay_one_name() and
5921  *    do_overwrite_item()).
5922  */
5923 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5924 				struct btrfs_root *root,
5925 				struct btrfs_inode *start_inode,
5926 				struct btrfs_log_ctx *ctx)
5927 {
5928 	struct btrfs_fs_info *fs_info = root->fs_info;
5929 	struct btrfs_root *log = root->log_root;
5930 	struct btrfs_path *path;
5931 	LIST_HEAD(dir_list);
5932 	struct btrfs_dir_list *dir_elem;
5933 	int ret = 0;
5934 
5935 	/*
5936 	 * If we are logging a new name, as part of a link or rename operation,
5937 	 * don't bother logging new dentries, as we just want to log the names
5938 	 * of an inode and that any new parents exist.
5939 	 */
5940 	if (ctx->logging_new_name)
5941 		return 0;
5942 
5943 	path = btrfs_alloc_path();
5944 	if (!path)
5945 		return -ENOMEM;
5946 
5947 	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5948 	if (!dir_elem) {
5949 		btrfs_free_path(path);
5950 		return -ENOMEM;
5951 	}
5952 	dir_elem->ino = btrfs_ino(start_inode);
5953 	list_add_tail(&dir_elem->list, &dir_list);
5954 
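	/*
	 * Breadth-first traversal: log the new dentries of one directory at a
	 * time, and whenever one of them points to a directory whose new
	 * dentries must be logged too, queue that directory so that its own
	 * entries are processed as well.
	 */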
5955 	while (!list_empty(&dir_list)) {
5956 		struct extent_buffer *leaf;
5957 		struct btrfs_key min_key;
5958 		int nritems;
5959 		int i;
5960 
5961 		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5962 					    list);
5963 		if (ret)
5964 			goto next_dir_inode;
5965 
5966 		min_key.objectid = dir_elem->ino;
5967 		min_key.type = BTRFS_DIR_INDEX_KEY;
5968 		min_key.offset = 0;
5969 again:
5970 		btrfs_release_path(path);
5971 		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5972 		if (ret < 0) {
5973 			goto next_dir_inode;
5974 		} else if (ret > 0) {
5975 			ret = 0;
5976 			goto next_dir_inode;
5977 		}
5978 
5979 process_leaf:
5980 		leaf = path->nodes[0];
5981 		nritems = btrfs_header_nritems(leaf);
5982 		for (i = path->slots[0]; i < nritems; i++) {
5983 			struct btrfs_dir_item *di;
5984 			struct btrfs_key di_key;
5985 			struct inode *di_inode;
5986 			struct btrfs_dir_list *new_dir_elem;
5987 			int log_mode = LOG_INODE_EXISTS;
5988 			int type;
5989 
5990 			btrfs_item_key_to_cpu(leaf, &min_key, i);
5991 			if (min_key.objectid != dir_elem->ino ||
5992 			    min_key.type != BTRFS_DIR_INDEX_KEY)
5993 				goto next_dir_inode;
5994 
5995 			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5996 			type = btrfs_dir_type(leaf, di);
5997 			if (btrfs_dir_transid(leaf, di) < trans->transid &&
5998 			    type != BTRFS_FT_DIR)
5999 				continue;
6000 			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
6001 			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
6002 				continue;
6003 
6004 			btrfs_release_path(path);
6005 			di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
6006 			if (IS_ERR(di_inode)) {
6007 				ret = PTR_ERR(di_inode);
6008 				goto next_dir_inode;
6009 			}
6010 
6011 			if (!need_log_inode(trans, BTRFS_I(di_inode))) {
6012 				btrfs_add_delayed_iput(di_inode);
6013 				break;
6014 			}
6015 
6016 			ctx->log_new_dentries = false;
6017 			if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
6018 				log_mode = LOG_INODE_ALL;
6019 			ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
6020 					      log_mode, ctx);
6021 			btrfs_add_delayed_iput(di_inode);
6022 			if (ret)
6023 				goto next_dir_inode;
6024 			if (ctx->log_new_dentries) {
6025 				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
6026 						       GFP_NOFS);
6027 				if (!new_dir_elem) {
6028 					ret = -ENOMEM;
6029 					goto next_dir_inode;
6030 				}
6031 				new_dir_elem->ino = di_key.objectid;
6032 				list_add_tail(&new_dir_elem->list, &dir_list);
6033 			}
6034 			break;
6035 		}
6036 		if (i == nritems) {
6037 			ret = btrfs_next_leaf(log, path);
6038 			if (ret < 0) {
6039 				goto next_dir_inode;
6040 			} else if (ret > 0) {
6041 				ret = 0;
6042 				goto next_dir_inode;
6043 			}
6044 			goto process_leaf;
6045 		}
6046 		if (min_key.offset < (u64)-1) {
6047 			min_key.offset++;
6048 			goto again;
6049 		}
6050 next_dir_inode:
6051 		list_del(&dir_elem->list);
6052 		kfree(dir_elem);
6053 	}
6054 
6055 	btrfs_free_path(path);
6056 	return ret;
6057 }
6058 
6059 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
6060 				 struct btrfs_inode *inode,
6061 				 struct btrfs_log_ctx *ctx)
6062 {
6063 	struct btrfs_fs_info *fs_info = trans->fs_info;
6064 	int ret;
6065 	struct btrfs_path *path;
6066 	struct btrfs_key key;
6067 	struct btrfs_root *root = inode->root;
6068 	const u64 ino = btrfs_ino(inode);
6069 
6070 	path = btrfs_alloc_path();
6071 	if (!path)
6072 		return -ENOMEM;
6073 	path->skip_locking = 1;
6074 	path->search_commit_root = 1;
6075 
6076 	key.objectid = ino;
6077 	key.type = BTRFS_INODE_REF_KEY;
6078 	key.offset = 0;
6079 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6080 	if (ret < 0)
6081 		goto out;
6082 
6083 	while (true) {
6084 		struct extent_buffer *leaf = path->nodes[0];
6085 		int slot = path->slots[0];
6086 		u32 cur_offset = 0;
6087 		u32 item_size;
6088 		unsigned long ptr;
6089 
6090 		if (slot >= btrfs_header_nritems(leaf)) {
6091 			ret = btrfs_next_leaf(root, path);
6092 			if (ret < 0)
6093 				goto out;
6094 			else if (ret > 0)
6095 				break;
6096 			continue;
6097 		}
6098 
6099 		btrfs_item_key_to_cpu(leaf, &key, slot);
6100 		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
6101 		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
6102 			break;
6103 
6104 		item_size = btrfs_item_size(leaf, slot);
6105 		ptr = btrfs_item_ptr_offset(leaf, slot);
6106 		while (cur_offset < item_size) {
6107 			struct btrfs_key inode_key;
6108 			struct inode *dir_inode;
6109 
6110 			inode_key.type = BTRFS_INODE_ITEM_KEY;
6111 			inode_key.offset = 0;
6112 
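			/*
			 * For an extref the parent directory's objectid is
			 * stored in the item itself and several extrefs may be
			 * packed together, so walk them one by one. For a
			 * regular ref item the parent is the key's offset and
			 * all names in the item share that parent, so the
			 * whole item can be consumed at once.
			 */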
6113 			if (key.type == BTRFS_INODE_EXTREF_KEY) {
6114 				struct btrfs_inode_extref *extref;
6115 
6116 				extref = (struct btrfs_inode_extref *)
6117 					(ptr + cur_offset);
6118 				inode_key.objectid = btrfs_inode_extref_parent(
6119 					leaf, extref);
6120 				cur_offset += sizeof(*extref);
6121 				cur_offset += btrfs_inode_extref_name_len(leaf,
6122 					extref);
6123 			} else {
6124 				inode_key.objectid = key.offset;
6125 				cur_offset = item_size;
6126 			}
6127 
6128 			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
6129 					       root);
6130 			/*
6131 			 * If the parent inode was deleted, return an error to
6132 			 * fallback to a transaction commit. This is to prevent
6133 			 * getting an inode that was moved from one parent A to
6134 			 * a parent B, got its former parent A deleted and then
6135 			 * it got fsync'ed, from existing at both parents after
6136 			 * a log replay (and the old parent still existing).
6137 			 * Example:
6138 			 *
6139 			 * mkdir /mnt/A
6140 			 * mkdir /mnt/B
6141 			 * touch /mnt/B/bar
6142 			 * sync
6143 			 * mv /mnt/B/bar /mnt/A/bar
6144 			 * mv -T /mnt/A /mnt/B
6145 			 * fsync /mnt/B/bar
6146 			 * <power fail>
6147 			 *
6148 			 * If we ignore the old parent B which got deleted,
6149 			 * after a log replay we would have file bar linked
6150 			 * at both parents and the old parent B would still
6151 			 * exist.
6152 			 */
6153 			if (IS_ERR(dir_inode)) {
6154 				ret = PTR_ERR(dir_inode);
6155 				goto out;
6156 			}
6157 
6158 			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
6159 				btrfs_add_delayed_iput(dir_inode);
6160 				continue;
6161 			}
6162 
6163 			ctx->log_new_dentries = false;
6164 			ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
6165 					      LOG_INODE_ALL, ctx);
6166 			if (!ret && ctx->log_new_dentries)
6167 				ret = log_new_dir_dentries(trans, root,
6168 						   BTRFS_I(dir_inode), ctx);
6169 			btrfs_add_delayed_iput(dir_inode);
6170 			if (ret)
6171 				goto out;
6172 		}
6173 		path->slots[0]++;
6174 	}
6175 	ret = 0;
6176 out:
6177 	btrfs_free_path(path);
6178 	return ret;
6179 }
6180 
6181 static int log_new_ancestors(struct btrfs_trans_handle *trans,
6182 			     struct btrfs_root *root,
6183 			     struct btrfs_path *path,
6184 			     struct btrfs_log_ctx *ctx)
6185 {
6186 	struct btrfs_key found_key;
6187 
6188 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
6189 
6190 	while (true) {
6191 		struct btrfs_fs_info *fs_info = root->fs_info;
6192 		struct extent_buffer *leaf = path->nodes[0];
6193 		int slot = path->slots[0];
6194 		struct btrfs_key search_key;
6195 		struct inode *inode;
6196 		u64 ino;
6197 		int ret = 0;
6198 
6199 		btrfs_release_path(path);
6200 
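		/*
		 * found_key is an inode ref key whose offset is the objectid
		 * of the parent directory, so load that directory and log
		 * that it exists if it was created in the current transaction
		 * and still needs logging.
		 */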
6201 		ino = found_key.offset;
6202 
6203 		search_key.objectid = found_key.offset;
6204 		search_key.type = BTRFS_INODE_ITEM_KEY;
6205 		search_key.offset = 0;
6206 		inode = btrfs_iget(fs_info->sb, ino, root);
6207 		if (IS_ERR(inode))
6208 			return PTR_ERR(inode);
6209 
6210 		if (BTRFS_I(inode)->generation >= trans->transid &&
6211 		    need_log_inode(trans, BTRFS_I(inode)))
6212 			ret = btrfs_log_inode(trans, BTRFS_I(inode),
6213 					      LOG_INODE_EXISTS, ctx);
6214 		btrfs_add_delayed_iput(inode);
6215 		if (ret)
6216 			return ret;
6217 
6218 		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
6219 			break;
6220 
6221 		search_key.type = BTRFS_INODE_REF_KEY;
6222 		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6223 		if (ret < 0)
6224 			return ret;
6225 
6226 		leaf = path->nodes[0];
6227 		slot = path->slots[0];
6228 		if (slot >= btrfs_header_nritems(leaf)) {
6229 			ret = btrfs_next_leaf(root, path);
6230 			if (ret < 0)
6231 				return ret;
6232 			else if (ret > 0)
6233 				return -ENOENT;
6234 			leaf = path->nodes[0];
6235 			slot = path->slots[0];
6236 		}
6237 
6238 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6239 		if (found_key.objectid != search_key.objectid ||
6240 		    found_key.type != BTRFS_INODE_REF_KEY)
6241 			return -ENOENT;
6242 	}
6243 	return 0;
6244 }
6245 
6246 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6247 				  struct btrfs_inode *inode,
6248 				  struct dentry *parent,
6249 				  struct btrfs_log_ctx *ctx)
6250 {
6251 	struct btrfs_root *root = inode->root;
6252 	struct dentry *old_parent = NULL;
6253 	struct super_block *sb = inode->vfs_inode.i_sb;
6254 	int ret = 0;
6255 
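	/*
	 * Walk up the dentry chain, logging that each ancestor created in the
	 * current transaction exists, and stop when we cross into a different
	 * root or superblock or reach the root of the dcache.
	 */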
6256 	while (true) {
6257 		if (!parent || d_really_is_negative(parent) ||
6258 		    sb != parent->d_sb)
6259 			break;
6260 
6261 		inode = BTRFS_I(d_inode(parent));
6262 		if (root != inode->root)
6263 			break;
6264 
6265 		if (inode->generation >= trans->transid &&
6266 		    need_log_inode(trans, inode)) {
6267 			ret = btrfs_log_inode(trans, inode,
6268 					      LOG_INODE_EXISTS, ctx);
6269 			if (ret)
6270 				break;
6271 		}
6272 		if (IS_ROOT(parent))
6273 			break;
6274 
6275 		parent = dget_parent(parent);
6276 		dput(old_parent);
6277 		old_parent = parent;
6278 	}
6279 	dput(old_parent);
6280 
6281 	return ret;
6282 }
6283 
6284 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6285 				 struct btrfs_inode *inode,
6286 				 struct dentry *parent,
6287 				 struct btrfs_log_ctx *ctx)
6288 {
6289 	struct btrfs_root *root = inode->root;
6290 	const u64 ino = btrfs_ino(inode);
6291 	struct btrfs_path *path;
6292 	struct btrfs_key search_key;
6293 	int ret;
6294 
6295 	/*
6296 	 * For a single hard link case, go through a fast path that does not
6297 	 * need to iterate the fs/subvolume tree.
6298 	 */
6299 	if (inode->vfs_inode.i_nlink < 2)
6300 		return log_new_ancestors_fast(trans, inode, parent, ctx);
6301 
6302 	path = btrfs_alloc_path();
6303 	if (!path)
6304 		return -ENOMEM;
6305 
6306 	search_key.objectid = ino;
6307 	search_key.type = BTRFS_INODE_REF_KEY;
6308 	search_key.offset = 0;
6309 again:
6310 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6311 	if (ret < 0)
6312 		goto out;
6313 	if (ret == 0)
6314 		path->slots[0]++;
6315 
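	/*
	 * Iterate the inode's ref items, one per hard link. For each of them
	 * log_new_ancestors() walks towards the subvolume's root, logging any
	 * new ancestor directories; since that releases our path, resume the
	 * search from the last ref key we processed.
	 */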
6316 	while (true) {
6317 		struct extent_buffer *leaf = path->nodes[0];
6318 		int slot = path->slots[0];
6319 		struct btrfs_key found_key;
6320 
6321 		if (slot >= btrfs_header_nritems(leaf)) {
6322 			ret = btrfs_next_leaf(root, path);
6323 			if (ret < 0)
6324 				goto out;
6325 			else if (ret > 0)
6326 				break;
6327 			continue;
6328 		}
6329 
6330 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6331 		if (found_key.objectid != ino ||
6332 		    found_key.type > BTRFS_INODE_EXTREF_KEY)
6333 			break;
6334 
6335 		/*
6336 		 * Don't deal with extended references because they are rare
6337 		 * cases and too complex to deal with (we would need to keep
6338 		 * track of which subitem we are processing for each item in
6339 		 * this loop, etc). So just return some error to fallback to
6340 		 * a transaction commit.
6341 		 */
6342 		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6343 			ret = -EMLINK;
6344 			goto out;
6345 		}
6346 
6347 		/*
6348 		 * Logging ancestors needs to do more searches on the fs/subvol
6349 		 * tree, so it releases the path as needed to avoid deadlocks.
6350 		 * Keep track of the last inode ref key and resume from that key
6351 		 * after logging all new ancestors for the current hard link.
6352 		 */
6353 		memcpy(&search_key, &found_key, sizeof(search_key));
6354 
6355 		ret = log_new_ancestors(trans, root, path, ctx);
6356 		if (ret)
6357 			goto out;
6358 		btrfs_release_path(path);
6359 		goto again;
6360 	}
6361 	ret = 0;
6362 out:
6363 	btrfs_free_path(path);
6364 	return ret;
6365 }
6366 
6367 /*
6368  * helper function around btrfs_log_inode to make sure newly created
6369  * parent directories also end up in the log.  A minimal, inode and backref
6370  * only, logging is done for any parent directories that are older than
6371  * the last committed transaction.
6372  */
6373 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6374 				  struct btrfs_inode *inode,
6375 				  struct dentry *parent,
6376 				  int inode_only,
6377 				  struct btrfs_log_ctx *ctx)
6378 {
6379 	struct btrfs_root *root = inode->root;
6380 	struct btrfs_fs_info *fs_info = root->fs_info;
6381 	int ret = 0;
6382 	bool log_dentries = false;
6383 
6384 	if (btrfs_test_opt(fs_info, NOTREELOG)) {
6385 		ret = 1;
6386 		goto end_no_trans;
6387 	}
6388 
6389 	if (btrfs_root_refs(&root->root_item) == 0) {
6390 		ret = 1;
6391 		goto end_no_trans;
6392 	}
6393 
6394 	/*
6395 	 * Skip already logged inodes or inodes corresponding to tmpfiles
6396 	 * (since logging them is pointless, a link count of 0 means they
6397 	 * will never be accessible).
6398 	 */
6399 	if ((btrfs_inode_in_log(inode, trans->transid) &&
6400 	     list_empty(&ctx->ordered_extents)) ||
6401 	    inode->vfs_inode.i_nlink == 0) {
6402 		ret = BTRFS_NO_LOG_SYNC;
6403 		goto end_no_trans;
6404 	}
6405 
6406 	ret = start_log_trans(trans, root, ctx);
6407 	if (ret)
6408 		goto end_no_trans;
6409 
6410 	ret = btrfs_log_inode(trans, inode, inode_only, ctx);
6411 	if (ret)
6412 		goto end_trans;
6413 
6414 	/*
6415 	 * for regular files, if its inode is already on disk, we don't
6416 	 * have to worry about the parents at all.  This is because
6417 	 * we can use the last_unlink_trans field to record renames
6418 	 * and other fun in this file.
6419 	 */
6420 	if (S_ISREG(inode->vfs_inode.i_mode) &&
6421 	    inode->generation < trans->transid &&
6422 	    inode->last_unlink_trans < trans->transid) {
6423 		ret = 0;
6424 		goto end_trans;
6425 	}
6426 
6427 	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
6428 		log_dentries = true;
6429 
6430 	/*
6431 	 * On unlink we must make sure all our current and old parent directory
6432 	 * inodes are fully logged. This is to prevent leaving dangling
6433 	 * directory index entries in directories that were our parents but are
6434 	 * not anymore. Not doing this results in old parent directory being
6435 	 * impossible to delete after log replay (rmdir will always fail with
6436 	 * error -ENOTEMPTY).
6437 	 *
6438 	 * Example 1:
6439 	 *
6440 	 * mkdir testdir
6441 	 * touch testdir/foo
6442 	 * ln testdir/foo testdir/bar
6443 	 * sync
6444 	 * unlink testdir/bar
6445 	 * xfs_io -c fsync testdir/foo
6446 	 * <power failure>
6447 	 * mount fs, triggers log replay
6448 	 *
6449 	 * If we don't log the parent directory (testdir), after log replay the
6450 	 * directory still has an entry pointing to the file inode using the bar
6451 	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
6452 	 * the file inode has a link count of 1.
6453 	 *
6454 	 * Example 2:
6455 	 *
6456 	 * mkdir testdir
6457 	 * touch foo
6458 	 * ln foo testdir/foo2
6459 	 * ln foo testdir/foo3
6460 	 * sync
6461 	 * unlink testdir/foo3
6462 	 * xfs_io -c fsync foo
6463 	 * <power failure>
6464 	 * mount fs, triggers log replay
6465 	 *
6466 	 * Similar as the first example, after log replay the parent directory
6467 	 * testdir still has an entry pointing to the inode file with name foo3
6468 	 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
6469 	 * and has a link count of 2.
6470 	 */
6471 	if (inode->last_unlink_trans >= trans->transid) {
6472 		ret = btrfs_log_all_parents(trans, inode, ctx);
6473 		if (ret)
6474 			goto end_trans;
6475 	}
6476 
6477 	ret = log_all_new_ancestors(trans, inode, parent, ctx);
6478 	if (ret)
6479 		goto end_trans;
6480 
6481 	if (log_dentries)
6482 		ret = log_new_dir_dentries(trans, root, inode, ctx);
6483 	else
6484 		ret = 0;
6485 end_trans:
6486 	if (ret < 0) {
6487 		btrfs_set_log_full_commit(trans);
6488 		ret = 1;
6489 	}
6490 
6491 	if (ret)
6492 		btrfs_remove_log_ctx(root, ctx);
6493 	btrfs_end_log_trans(root);
6494 end_no_trans:
6495 	return ret;
6496 }
6497 
6498 /*
6499  * it is not safe to log a dentry if the chunk root has added new
6500  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
6501  * If this returns 1, you must commit the transaction to safely get your
6502  * data on disk.
6503  */
6504 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
6505 			  struct dentry *dentry,
6506 			  struct btrfs_log_ctx *ctx)
6507 {
6508 	struct dentry *parent = dget_parent(dentry);
6509 	int ret;
6510 
6511 	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
6512 				     LOG_INODE_ALL, ctx);
6513 	dput(parent);
6514 
6515 	return ret;
6516 }
6517 
6518 /*
6519  * Should be called during mount to recover and replay any log trees
6520  * from the FS.
6521  */
6522 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6523 {
6524 	int ret;
6525 	struct btrfs_path *path;
6526 	struct btrfs_trans_handle *trans;
6527 	struct btrfs_key key;
6528 	struct btrfs_key found_key;
6529 	struct btrfs_root *log;
6530 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6531 	struct walk_control wc = {
6532 		.process_func = process_one_buffer,
6533 		.stage = LOG_WALK_PIN_ONLY,
6534 	};
6535 
6536 	path = btrfs_alloc_path();
6537 	if (!path)
6538 		return -ENOMEM;
6539 
6540 	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6541 
6542 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
6543 	if (IS_ERR(trans)) {
6544 		ret = PTR_ERR(trans);
6545 		goto error;
6546 	}
6547 
6548 	wc.trans = trans;
6549 	wc.pin = 1;
6550 
6551 	ret = walk_log_tree(trans, log_root_tree, &wc);
6552 	if (ret) {
6553 		btrfs_abort_transaction(trans, ret);
6554 		goto error;
6555 	}
6556 
6557 again:
6558 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
6559 	key.offset = (u64)-1;
6560 	key.type = BTRFS_ROOT_ITEM_KEY;
6561 
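	/*
	 * Iterate all log tree root items, from the highest key offset down
	 * to 0. The offset of each root item is the objectid of the
	 * subvolume (fs root) the log tree belongs to.
	 */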
6562 	while (1) {
6563 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
6564 
6565 		if (ret < 0) {
6566 			btrfs_abort_transaction(trans, ret);
6567 			goto error;
6568 		}
6569 		if (ret > 0) {
6570 			if (path->slots[0] == 0)
6571 				break;
6572 			path->slots[0]--;
6573 		}
6574 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6575 				      path->slots[0]);
6576 		btrfs_release_path(path);
6577 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6578 			break;
6579 
6580 		log = btrfs_read_tree_root(log_root_tree, &found_key);
6581 		if (IS_ERR(log)) {
6582 			ret = PTR_ERR(log);
6583 			btrfs_abort_transaction(trans, ret);
6584 			goto error;
6585 		}
6586 
6587 		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
6588 						   true);
6589 		if (IS_ERR(wc.replay_dest)) {
6590 			ret = PTR_ERR(wc.replay_dest);
6591 
6592 			/*
6593 			 * We didn't find the subvol, likely because it was
6594 			 * deleted.  This is ok, simply skip this log and go to
6595 			 * the next one.
6596 			 *
6597 			 * We need to exclude the root because we can't have
6598 			 * other log replays overwriting this log as we'll read
6599 			 * it back in a few more times.  This will keep our
6600 			 * block from being modified, and we'll just bail for
6601 			 * each subsequent pass.
6602 			 */
6603 			if (ret == -ENOENT)
6604 				ret = btrfs_pin_extent_for_log_replay(trans,
6605 							log->node->start,
6606 							log->node->len);
6607 			btrfs_put_root(log);
6608 
6609 			if (!ret)
6610 				goto next;
6611 			btrfs_abort_transaction(trans, ret);
6612 			goto error;
6613 		}
6614 
6615 		wc.replay_dest->log_root = log;
6616 		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
6617 		if (ret)
6618 			/* The loop needs to continue due to the root refs */
6619 			btrfs_abort_transaction(trans, ret);
6620 		else
6621 			ret = walk_log_tree(trans, log, &wc);
6622 
6623 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6624 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
6625 						      path);
6626 			if (ret)
6627 				btrfs_abort_transaction(trans, ret);
6628 		}
6629 
6630 		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6631 			struct btrfs_root *root = wc.replay_dest;
6632 
6633 			btrfs_release_path(path);
6634 
6635 			/*
6636 			 * We have just replayed everything, and the highest
6637 			 * objectid of fs roots probably has changed in case
6638 			 * some inode_items got replayed.
6639 			 *
6640 			 * root->objectid_mutex is not acquired as log replay
6641 			 * could only happen during mount.
6642 			 */
6643 			ret = btrfs_init_root_free_objectid(root);
6644 			if (ret)
6645 				btrfs_abort_transaction(trans, ret);
6646 		}
6647 
6648 		wc.replay_dest->log_root = NULL;
6649 		btrfs_put_root(wc.replay_dest);
6650 		btrfs_put_root(log);
6651 
6652 		if (ret)
6653 			goto error;
6654 next:
6655 		if (found_key.offset == 0)
6656 			break;
6657 		key.offset = found_key.offset - 1;
6658 	}
6659 	btrfs_release_path(path);
6660 
6661 	/* step one is to pin it all, step two is to replay just inodes */
6662 	if (wc.pin) {
6663 		wc.pin = 0;
6664 		wc.process_func = replay_one_buffer;
6665 		wc.stage = LOG_WALK_REPLAY_INODES;
6666 		goto again;
6667 	}
6668 	/* step three is to replay everything */
6669 	if (wc.stage < LOG_WALK_REPLAY_ALL) {
6670 		wc.stage++;
6671 		goto again;
6672 	}
6673 
6674 	btrfs_free_path(path);
6675 
6676 	/* step 4: commit the transaction, which also unpins the blocks */
6677 	ret = btrfs_commit_transaction(trans);
6678 	if (ret)
6679 		return ret;
6680 
6681 	log_root_tree->log_root = NULL;
6682 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6683 	btrfs_put_root(log_root_tree);
6684 
6685 	return 0;
6686 error:
6687 	if (wc.trans)
6688 		btrfs_end_transaction(wc.trans);
6689 	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6690 	btrfs_free_path(path);
6691 	return ret;
6692 }
6693 
6694 /*
6695  * there are some corner cases where we want to force a full
6696  * commit instead of allowing a directory to be logged.
6697  *
6698  * They revolve around files that were unlinked from the directory, and
6699  * this function updates the parent directory so that a full commit is
6700  * properly done if it is fsync'd later after the unlinks are done.
6701  *
6702  * Must be called before the unlink operations (updates to the subvolume tree,
6703  * inodes, etc) are done.
6704  */
6705 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6706 			     struct btrfs_inode *dir, struct btrfs_inode *inode,
6707 			     int for_rename)
6708 {
6709 	/*
6710 	 * when we're logging a file, if it hasn't been renamed
6711 	 * or unlinked, and its inode is fully committed on disk,
6712 	 * we don't have to worry about walking up the directory chain
6713 	 * to log its parents.
6714 	 *
6715 	 * So, we use the last_unlink_trans field to put this transid
6716 	 * into the file.  When the file is logged we check it and
6717 	 * don't log the parents if the file is fully on disk.
6718 	 */
6719 	mutex_lock(&inode->log_mutex);
6720 	inode->last_unlink_trans = trans->transid;
6721 	mutex_unlock(&inode->log_mutex);
6722 
6723 	/*
6724 	 * if this directory was already logged any new
6725 	 * names for this file/dir will get recorded
6726 	 */
6727 	if (dir->logged_trans == trans->transid)
6728 		return;
6729 
6730 	/*
6731 	 * if the inode we're about to unlink was logged,
6732 	 * the log will be properly updated for any new names
6733 	 */
6734 	if (inode->logged_trans == trans->transid)
6735 		return;
6736 
6737 	/*
6738 	 * when renaming files across directories, if the directory
6739 	 * we're unlinking from gets fsync'd later on, there's
6740 	 * no way to find the destination directory later and fsync it
6741 	 * properly.  So, we have to be conservative and force commits
6742 	 * so the new name gets discovered.
6743 	 */
6744 	if (for_rename)
6745 		goto record;
6746 
6747 	/* we can safely do the unlink without any special recording */
6748 	return;
6749 
6750 record:
6751 	mutex_lock(&dir->log_mutex);
6752 	dir->last_unlink_trans = trans->transid;
6753 	mutex_unlock(&dir->log_mutex);
6754 }
6755 
6756 /*
6757  * Make sure that if someone attempts to fsync the parent directory of a deleted
6758  * snapshot, it ends up triggering a transaction commit. This is to guarantee
6759  * that after replaying the log tree of the parent directory's root we will not
6760  * see the snapshot anymore and at log replay time we will not see any log tree
6761  * corresponding to the deleted snapshot's root, which could lead to replaying
6762  * it after replaying the log tree of the parent directory (which would replay
6763  * the snapshot delete operation).
6764  *
6765  * Must be called before the actual snapshot destroy operation (updates to the
6766  * parent root and tree of tree roots trees, etc) are done.
6767  */
6768 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
6769 				   struct btrfs_inode *dir)
6770 {
6771 	mutex_lock(&dir->log_mutex);
6772 	dir->last_unlink_trans = trans->transid;
6773 	mutex_unlock(&dir->log_mutex);
6774 }
6775 
6776 /*
6777  * Call this after adding a new name for a file and it will properly
6778  * update the log to reflect the new name.
6779  */
6780 void btrfs_log_new_name(struct btrfs_trans_handle *trans,
6781 			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6782 			struct dentry *parent)
6783 {
6784 	struct btrfs_log_ctx ctx;
6785 
6786 	/*
6787 	 * this will force the logging code to walk the dentry chain
6788 	 * up for the file
6789 	 */
6790 	if (!S_ISDIR(inode->vfs_inode.i_mode))
6791 		inode->last_unlink_trans = trans->transid;
6792 
6793 	/*
6794 	 * if this inode hasn't been logged and the directory we're renaming
6795 	 * it from hasn't been logged, we don't need to log it
6796 	 */
6797 	if (!inode_logged(trans, inode) &&
6798 	    (!old_dir || !inode_logged(trans, old_dir)))
6799 		return;
6800 
6801 	/*
6802 	 * If we are doing a rename (old_dir is not NULL) from a directory that
6803 	 * was previously logged, make sure the next log attempt on the directory
6804 	 * is not skipped and logs the inode again. This is because the log may
6805 	 * not currently be authoritative for a range including the old
6806 	 * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we
6807 	 * do not end up with both the new and old dentries around (in case the
6808 	 * inode is a directory we would have a directory with two hard links and
6809 	 * 2 inode references for different parents). The next log attempt of
6810 	 * old_dir will happen at btrfs_log_all_parents(), called through
6811 	 * btrfs_log_inode_parent() below, because we have previously set
6812 	 * inode->last_unlink_trans to the current transaction ID, either here or
6813 	 * at btrfs_record_unlink_dir() in case the inode is a directory.
6814 	 */
6815 	if (old_dir)
6816 		old_dir->logged_trans = 0;
6817 
6818 	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
6819 	ctx.logging_new_name = true;
6820 	/*
6821 	 * We don't care about the return value. If we fail to log the new name
6822 	 * then we know the next attempt to sync the log will fallback to a full
6823 	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
6824 	 * we don't need to worry about getting a log committed that has an
6825 	 * inconsistent state after a rename operation.
6826 	 */
6827 	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
6828 }
6829 
6830