xref: /linux/fs/ext4/fast_commit.c (revision b1b7dce3f09b460da38946d1845f3076daa36abb)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
41  *				  during recovery. Note that iblocks field is
42  *				  not replayed and instead derived during
43  *				  replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g. extended
69  * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73  *   back to full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79  *   make one more fast commit to fall back to full commit after stop call so
80  *   that it is guaranteed that the fast commit ineligible operation contained
81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least 1 full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88  * tag contains CRC of the contents and TID of the transaction after which
89  * this fast commit should be applied. Recovery code replays fast commit
90  * logs only if there's at least 1 valid tail present. For every fast commit
91  * operation, there is 1 tail. This means, we may end up with multiple tails
92  * in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * Fast Commit Replay Idempotence
107  * ------------------------------
108  *
109  * Fast commit tags are idempotent in nature provided the recovery code follows
110  * certain rules. The guiding principle that the commit path follows while
111  * committing is that it stores the result of a particular operation instead of
112  * storing the procedure.
113  *
114  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
115  * was associated with inode 10. During fast commit, instead of storing this
116  * operation as a procedure "rename a to b", we store the resulting file system
117  * state as a "series" of outcomes:
118  *
119  * - Link dirent b to inode 10
120  * - Unlink dirent a
121  * - Inode <10> with valid refcount
122  *
123  * Now when recovery code runs, it needs to "enforce" this state on the file
124  * system. This is what guarantees idempotence of fast commit replay.
125  *
126  * Let's take an example of a procedure that is not idempotent and see how fast
127  * commits make it idempotent. Consider following sequence of operations:
128  *
129  *     rm A;    mv B A;    read A
130  *  (x)     (y)        (z)
131  *
132  * (x), (y) and (z) are the points at which we can crash. If we store this
133  * sequence of operations as is then the replay is not idempotent. Let's say
134  * while in replay, we crash at (z). During the second replay, file A (which was
135  * actually created as a result of "mv B A" operation) would get deleted. Thus,
136  * file named A would be absent when we try to read A. So, this sequence of
137  * operations is not idempotent. However, as mentioned above, instead of storing
138  * the procedure fast commits store the outcome of each procedure. Thus the fast
139  * commit log for above procedure would be as follows:
140  *
141  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
142  * inode 11 before the replay)
143  *
144  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
145  * (w)          (x)                    (y)          (z)
146  *
147  * If we crash at (z), we will have file A linked to inode 11. During the second
148  * replay, we will remove file A (inode 11). But we will create it back and make
149  * it point to inode 11. We won't find B, so we'll just skip that step. At this
150  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
151  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
152  * similarly. Thus, by converting a non-idempotent procedure into a series of
153  * idempotent outcomes, fast commits ensured idempotence during the replay.
154  *
155  * TODOs
156  * -----
157  *
158  * 0) Fast commit replay path hardening: Fast commit replay code should use
159  *    journal handles to make sure all the updates it does during the replay
160  *    path are atomic. With that if we crash during fast commit replay, after
161  *    trying to do recovery again, we will find a file system where fast commit
162  *    area is invalid (because new full commit would be found). In order to deal
163  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
164  *    superblock state is persisted before starting the replay, so that after
165  *    the crash, fast commit recovery code can look at that flag and perform
166  *    fast commit recovery even if that area is invalidated by later full
167  *    commits.
168  *
169  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
170  *    eligible update must be protected within ext4_fc_start_update() and
171  *    ext4_fc_stop_update(). These routines are called from much higher level
172  *    routines. This can be made more fine grained by combining with
173  *    ext4_journal_start().
174  *
175  * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
176  *
177  * 3) Handle more ineligible cases.
178  */
179 
180 #include <trace/events/ext4.h>
/*
 * Slab cache that __track_dentry_update() allocates its
 * struct ext4_fc_dentry_update nodes from (and frees them back to on error).
 */
181 static struct kmem_cache *ext4_fc_dentry_cachep;
182 
183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
184 {
185 	BUFFER_TRACE(bh, "");
186 	if (uptodate) {
187 		ext4_debug("%s: Block %lld up-to-date",
188 			   __func__, bh->b_blocknr);
189 		set_buffer_uptodate(bh);
190 	} else {
191 		ext4_debug("%s: Block %lld not up-to-date",
192 			   __func__, bh->b_blocknr);
193 		clear_buffer_uptodate(bh);
194 	}
195 
196 	unlock_buffer(bh);
197 }
198 
199 static inline void ext4_fc_reset_inode(struct inode *inode)
200 {
201 	struct ext4_inode_info *ei = EXT4_I(inode);
202 
203 	ei->i_fc_lblk_start = 0;
204 	ei->i_fc_lblk_len = 0;
205 }
206 
207 void ext4_fc_init_inode(struct inode *inode)
208 {
209 	struct ext4_inode_info *ei = EXT4_I(inode);
210 
211 	ext4_fc_reset_inode(inode);
212 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
213 	INIT_LIST_HEAD(&ei->i_fc_list);
214 	init_waitqueue_head(&ei->i_fc_wait);
215 	atomic_set(&ei->i_fc_updates, 0);
216 }
217 
218 /* This function must be called with sbi->s_fc_lock held. */
219 static void ext4_fc_wait_committing_inode(struct inode *inode)
220 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
221 {
222 	wait_queue_head_t *wq;
223 	struct ext4_inode_info *ei = EXT4_I(inode);
224 
225 #if (BITS_PER_LONG < 64)
226 	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
227 			EXT4_STATE_FC_COMMITTING);
228 	wq = bit_waitqueue(&ei->i_state_flags,
229 				EXT4_STATE_FC_COMMITTING);
230 #else
231 	DEFINE_WAIT_BIT(wait, &ei->i_flags,
232 			EXT4_STATE_FC_COMMITTING);
233 	wq = bit_waitqueue(&ei->i_flags,
234 				EXT4_STATE_FC_COMMITTING);
235 #endif
236 	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
237 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
238 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
239 	schedule();
240 	finish_wait(wq, &wait.wq_entry);
241 }
242 
243 /*
244  * Inform Ext4's fast about start of an inode update
245  *
246  * This function is called by the high level call VFS callbacks before
247  * performing any inode update. This function blocks if there's an ongoing
248  * fast commit on the inode in question.
249  */
250 void ext4_fc_start_update(struct inode *inode)
251 {
252 	struct ext4_inode_info *ei = EXT4_I(inode);
253 
254 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
255 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
256 		return;
257 
258 restart:
259 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
260 	if (list_empty(&ei->i_fc_list))
261 		goto out;
262 
263 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
264 		ext4_fc_wait_committing_inode(inode);
265 		goto restart;
266 	}
267 out:
268 	atomic_inc(&ei->i_fc_updates);
269 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
270 }
271 
272 /*
273  * Stop inode update and wake up waiting fast commits if any.
274  */
275 void ext4_fc_stop_update(struct inode *inode)
276 {
277 	struct ext4_inode_info *ei = EXT4_I(inode);
278 
279 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
280 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
281 		return;
282 
283 	if (atomic_dec_and_test(&ei->i_fc_updates))
284 		wake_up_all(&ei->i_fc_wait);
285 }
286 
287 /*
288  * Remove inode from fast commit list. If the inode is being committed
289  * we wait until inode commit is done.
290  */
291 void ext4_fc_del(struct inode *inode)
292 {
293 	struct ext4_inode_info *ei = EXT4_I(inode);
294 
295 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
296 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
297 		return;
298 
299 restart:
300 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
301 	if (list_empty(&ei->i_fc_list)) {
302 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
303 		return;
304 	}
305 
306 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
307 		ext4_fc_wait_committing_inode(inode);
308 		goto restart;
309 	}
310 	list_del_init(&ei->i_fc_list);
311 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
312 }
313 
314 /*
315  * Mark file system as fast commit ineligible. This means that next commit
316  * operation would result in a full jbd2 commit.
317  */
318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
319 {
320 	struct ext4_sb_info *sbi = EXT4_SB(sb);
321 
322 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
323 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
324 		return;
325 
326 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
327 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
328 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
329 }
330 
331 /*
332  * Start a fast commit ineligible update. Any commits that happen while
333  * such an operation is in progress fall back to full commits.
334  */
335 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
336 {
337 	struct ext4_sb_info *sbi = EXT4_SB(sb);
338 
339 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
340 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
341 		return;
342 
343 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
344 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
345 	atomic_inc(&sbi->s_fc_ineligible_updates);
346 }
347 
348 /*
349  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
350  * to ensure that after stopping the ineligible update, at least one full
351  * commit takes place.
352  */
353 void ext4_fc_stop_ineligible(struct super_block *sb)
354 {
355 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
356 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
357 		return;
358 
359 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
361 }
362 
363 static inline int ext4_fc_is_ineligible(struct super_block *sb)
364 {
365 	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
366 		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
367 }
368 
369 /*
370  * Generic fast commit tracking function. If this is the first time this we are
371  * called after a full commit, we initialize fast commit fields and then call
372  * __fc_track_fn() with update = 0. If we have already been called after a full
373  * commit, we pass update = 1. Based on that, the track function can determine
374  * if it needs to track a field for the first time or if it needs to just
375  * update the previously tracked value.
376  *
377  * If enqueue is set, this function enqueues the inode in fast commit list.
378  */
379 static int ext4_fc_track_template(
380 	handle_t *handle, struct inode *inode,
381 	int (*__fc_track_fn)(struct inode *, void *, bool),
382 	void *args, int enqueue)
383 {
384 	bool update = false;
385 	struct ext4_inode_info *ei = EXT4_I(inode);
386 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
387 	tid_t tid = 0;
388 	int ret;
389 
390 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
391 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
392 		return -EOPNOTSUPP;
393 
394 	if (ext4_fc_is_ineligible(inode->i_sb))
395 		return -EINVAL;
396 
397 	tid = handle->h_transaction->t_tid;
398 	mutex_lock(&ei->i_fc_lock);
399 	if (tid == ei->i_sync_tid) {
400 		update = true;
401 	} else {
402 		ext4_fc_reset_inode(inode);
403 		ei->i_sync_tid = tid;
404 	}
405 	ret = __fc_track_fn(inode, args, update);
406 	mutex_unlock(&ei->i_fc_lock);
407 
408 	if (!enqueue)
409 		return ret;
410 
411 	spin_lock(&sbi->s_fc_lock);
412 	if (list_empty(&EXT4_I(inode)->i_fc_list))
413 		list_add_tail(&EXT4_I(inode)->i_fc_list,
414 				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
415 				&sbi->s_fc_q[FC_Q_STAGING] :
416 				&sbi->s_fc_q[FC_Q_MAIN]);
417 	spin_unlock(&sbi->s_fc_lock);
418 
419 	return ret;
420 }
421 
/* Arguments handed to __track_dentry_update() through its void * slot. */
struct __track_dentry_update_args {
	/* Directory entry the operation applies to. */
	struct dentry *dentry;
	/* EXT4_FC_TAG_* value of the operation; stored as the node's fcd_op. */
	int op;
};
426 
427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
428 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
429 {
430 	struct ext4_fc_dentry_update *node;
431 	struct ext4_inode_info *ei = EXT4_I(inode);
432 	struct __track_dentry_update_args *dentry_update =
433 		(struct __track_dentry_update_args *)arg;
434 	struct dentry *dentry = dentry_update->dentry;
435 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
436 
437 	mutex_unlock(&ei->i_fc_lock);
438 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
439 	if (!node) {
440 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
441 		mutex_lock(&ei->i_fc_lock);
442 		return -ENOMEM;
443 	}
444 
445 	node->fcd_op = dentry_update->op;
446 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
447 	node->fcd_ino = inode->i_ino;
448 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
449 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
450 		if (!node->fcd_name.name) {
451 			kmem_cache_free(ext4_fc_dentry_cachep, node);
452 			ext4_fc_mark_ineligible(inode->i_sb,
453 				EXT4_FC_REASON_NOMEM);
454 			mutex_lock(&ei->i_fc_lock);
455 			return -ENOMEM;
456 		}
457 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
458 			dentry->d_name.len);
459 	} else {
460 		memcpy(node->fcd_iname, dentry->d_name.name,
461 			dentry->d_name.len);
462 		node->fcd_name.name = node->fcd_iname;
463 	}
464 	node->fcd_name.len = dentry->d_name.len;
465 
466 	spin_lock(&sbi->s_fc_lock);
467 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
468 		list_add_tail(&node->fcd_list,
469 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
470 	else
471 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
472 	spin_unlock(&sbi->s_fc_lock);
473 	mutex_lock(&ei->i_fc_lock);
474 
475 	return 0;
476 }
477 
478 void __ext4_fc_track_unlink(handle_t *handle,
479 		struct inode *inode, struct dentry *dentry)
480 {
481 	struct __track_dentry_update_args args;
482 	int ret;
483 
484 	args.dentry = dentry;
485 	args.op = EXT4_FC_TAG_UNLINK;
486 
487 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
488 					(void *)&args, 0);
489 	trace_ext4_fc_track_unlink(inode, dentry, ret);
490 }
491 
492 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
493 {
494 	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
495 }
496 
497 void __ext4_fc_track_link(handle_t *handle,
498 	struct inode *inode, struct dentry *dentry)
499 {
500 	struct __track_dentry_update_args args;
501 	int ret;
502 
503 	args.dentry = dentry;
504 	args.op = EXT4_FC_TAG_LINK;
505 
506 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
507 					(void *)&args, 0);
508 	trace_ext4_fc_track_link(inode, dentry, ret);
509 }
510 
511 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
512 {
513 	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
514 }
515 
516 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
517 {
518 	struct __track_dentry_update_args args;
519 	struct inode *inode = d_inode(dentry);
520 	int ret;
521 
522 	args.dentry = dentry;
523 	args.op = EXT4_FC_TAG_CREAT;
524 
525 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
526 					(void *)&args, 0);
527 	trace_ext4_fc_track_create(inode, dentry, ret);
528 }
529 
530 /* __track_fn for inode tracking */
531 static int __track_inode(struct inode *inode, void *arg, bool update)
532 {
533 	if (update)
534 		return -EEXIST;
535 
536 	EXT4_I(inode)->i_fc_lblk_len = 0;
537 
538 	return 0;
539 }
540 
541 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
542 {
543 	int ret;
544 
545 	if (S_ISDIR(inode->i_mode))
546 		return;
547 
548 	if (ext4_should_journal_data(inode)) {
549 		ext4_fc_mark_ineligible(inode->i_sb,
550 					EXT4_FC_REASON_INODE_JOURNAL_DATA);
551 		return;
552 	}
553 
554 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
555 	trace_ext4_fc_track_inode(inode, ret);
556 }
557 
558 struct __track_range_args {
559 	ext4_lblk_t start, end;
560 };
561 
562 /* __track_fn for tracking data updates */
563 static int __track_range(struct inode *inode, void *arg, bool update)
564 {
565 	struct ext4_inode_info *ei = EXT4_I(inode);
566 	ext4_lblk_t oldstart;
567 	struct __track_range_args *__arg =
568 		(struct __track_range_args *)arg;
569 
570 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
571 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
572 		return -ECANCELED;
573 	}
574 
575 	oldstart = ei->i_fc_lblk_start;
576 
577 	if (update && ei->i_fc_lblk_len > 0) {
578 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
579 		ei->i_fc_lblk_len =
580 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
581 				ei->i_fc_lblk_start + 1;
582 	} else {
583 		ei->i_fc_lblk_start = __arg->start;
584 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
585 	}
586 
587 	return 0;
588 }
589 
590 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
591 			 ext4_lblk_t end)
592 {
593 	struct __track_range_args args;
594 	int ret;
595 
596 	if (S_ISDIR(inode->i_mode))
597 		return;
598 
599 	args.start = start;
600 	args.end = end;
601 
602 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
603 
604 	trace_ext4_fc_track_range(inode, start, end, ret);
605 }
606 
607 static void ext4_fc_submit_bh(struct super_block *sb)
608 {
609 	int write_flags = REQ_SYNC;
610 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
611 
612 	/* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
613 	if (test_opt(sb, BARRIER))
614 		write_flags |= REQ_FUA | REQ_PREFLUSH;
615 	lock_buffer(bh);
616 	set_buffer_dirty(bh);
617 	set_buffer_uptodate(bh);
618 	bh->b_end_io = ext4_end_buffer_io_sync;
619 	submit_bh(REQ_OP_WRITE, write_flags, bh);
620 	EXT4_SB(sb)->s_fc_bh = NULL;
621 }
622 
623 /* Ext4 commit path routines */
624 
625 /* memzero and update CRC */
626 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
627 				u32 *crc)
628 {
629 	void *ret;
630 
631 	ret = memset(dst, 0, len);
632 	if (crc)
633 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
634 	return ret;
635 }
636 
637 /*
638  * Allocate len bytes on a fast commit buffer.
639  *
640  * During the commit time this function is used to manage fast commit
641  * block space. We don't split a fast commit log onto different
642  * blocks. So this function makes sure that if there's not enough space
643  * on the current block, the remaining space in the current block is
644  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
645  * new block is from jbd2 and CRC is updated to reflect the padding
646  * we added.
647  */
648 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
649 {
650 	struct ext4_fc_tl *tl;
651 	struct ext4_sb_info *sbi = EXT4_SB(sb);
652 	struct buffer_head *bh;
653 	int bsize = sbi->s_journal->j_blocksize;
654 	int ret, off = sbi->s_fc_bytes % bsize;
655 	int pad_len;
656 
657 	/*
658 	 * After allocating len, we should have space at least for a 0 byte
659 	 * padding.
660 	 */
661 	if (len + sizeof(struct ext4_fc_tl) > bsize)
662 		return NULL;
663 
664 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
665 		/*
666 		 * Only allocate from current buffer if we have enough space for
667 		 * this request AND we have space to add a zero byte padding.
668 		 */
669 		if (!sbi->s_fc_bh) {
670 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
671 			if (ret)
672 				return NULL;
673 			sbi->s_fc_bh = bh;
674 		}
675 		sbi->s_fc_bytes += len;
676 		return sbi->s_fc_bh->b_data + off;
677 	}
678 	/* Need to add PAD tag */
679 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
680 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
681 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
682 	tl->fc_len = cpu_to_le16(pad_len);
683 	if (crc)
684 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
685 	if (pad_len > 0)
686 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
687 	ext4_fc_submit_bh(sb);
688 
689 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
690 	if (ret)
691 		return NULL;
692 	sbi->s_fc_bh = bh;
693 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
694 	return sbi->s_fc_bh->b_data;
695 }
696 
697 /* memcpy to fc reserved space and update CRC */
698 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
699 				int len, u32 *crc)
700 {
701 	if (crc)
702 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
703 	return memcpy(dst, src, len);
704 }
705 
706 /*
707  * Complete a fast commit by writing tail tag.
708  *
709  * Writing tail tag marks the end of a fast commit. In order to guarantee
710  * atomicity, after writing tail tag, even if there's space remaining
711  * in the block, next commit shouldn't use it. That's why tail tag
712  * has the length as that of the remaining space on the block.
713  */
714 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
715 {
716 	struct ext4_sb_info *sbi = EXT4_SB(sb);
717 	struct ext4_fc_tl tl;
718 	struct ext4_fc_tail tail;
719 	int off, bsize = sbi->s_journal->j_blocksize;
720 	u8 *dst;
721 
722 	/*
723 	 * ext4_fc_reserve_space takes care of allocating an extra block if
724 	 * there's no enough space on this block for accommodating this tail.
725 	 */
726 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
727 	if (!dst)
728 		return -ENOSPC;
729 
730 	off = sbi->s_fc_bytes % bsize;
731 
732 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
733 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
734 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
735 
736 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
737 	dst += sizeof(tl);
738 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
739 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
740 	dst += sizeof(tail.fc_tid);
741 	tail.fc_crc = cpu_to_le32(crc);
742 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
743 
744 	ext4_fc_submit_bh(sb);
745 
746 	return 0;
747 }
748 
749 /*
750  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
751  * Returns false if there's not enough space.
752  */
753 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
754 			   u32 *crc)
755 {
756 	struct ext4_fc_tl tl;
757 	u8 *dst;
758 
759 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
760 	if (!dst)
761 		return false;
762 
763 	tl.fc_tag = cpu_to_le16(tag);
764 	tl.fc_len = cpu_to_le16(len);
765 
766 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
767 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
768 
769 	return true;
770 }
771 
772 /* Same as above, but adds dentry tlv. */
773 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
774 					int parent_ino, int ino, int dlen,
775 					const unsigned char *dname,
776 					u32 *crc)
777 {
778 	struct ext4_fc_dentry_info fcd;
779 	struct ext4_fc_tl tl;
780 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
781 					crc);
782 
783 	if (!dst)
784 		return false;
785 
786 	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
787 	fcd.fc_ino = cpu_to_le32(ino);
788 	tl.fc_tag = cpu_to_le16(tag);
789 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
790 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
791 	dst += sizeof(tl);
792 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
793 	dst += sizeof(fcd);
794 	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
795 	dst += dlen;
796 
797 	return true;
798 }
799 
800 /*
801  * Writes inode in the fast commit space under TLV with tag @tag.
802  * Returns 0 on success, error on failure.
803  */
804 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
805 {
806 	struct ext4_inode_info *ei = EXT4_I(inode);
807 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
808 	int ret;
809 	struct ext4_iloc iloc;
810 	struct ext4_fc_inode fc_inode;
811 	struct ext4_fc_tl tl;
812 	u8 *dst;
813 
814 	ret = ext4_get_inode_loc(inode, &iloc);
815 	if (ret)
816 		return ret;
817 
818 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
819 		inode_len += ei->i_extra_isize;
820 
821 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
822 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
823 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
824 
825 	dst = ext4_fc_reserve_space(inode->i_sb,
826 			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
827 	if (!dst)
828 		return -ECANCELED;
829 
830 	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
831 		return -ECANCELED;
832 	dst += sizeof(tl);
833 	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
834 		return -ECANCELED;
835 	dst += sizeof(fc_inode);
836 	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
837 					inode_len, crc))
838 		return -ECANCELED;
839 
840 	return 0;
841 }
842 
843 /*
844  * Writes updated data ranges for the inode in question. Updates CRC.
845  * Returns 0 on success, error otherwise.
846  */
847 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
848 {
849 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
850 	struct ext4_inode_info *ei = EXT4_I(inode);
851 	struct ext4_map_blocks map;
852 	struct ext4_fc_add_range fc_ext;
853 	struct ext4_fc_del_range lrange;
854 	struct ext4_extent *ex;
855 	int ret;
856 
857 	mutex_lock(&ei->i_fc_lock);
858 	if (ei->i_fc_lblk_len == 0) {
859 		mutex_unlock(&ei->i_fc_lock);
860 		return 0;
861 	}
862 	old_blk_size = ei->i_fc_lblk_start;
863 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
864 	ei->i_fc_lblk_len = 0;
865 	mutex_unlock(&ei->i_fc_lock);
866 
867 	cur_lblk_off = old_blk_size;
868 	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
869 		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
870 
871 	while (cur_lblk_off <= new_blk_size) {
872 		map.m_lblk = cur_lblk_off;
873 		map.m_len = new_blk_size - cur_lblk_off + 1;
874 		ret = ext4_map_blocks(NULL, inode, &map, 0);
875 		if (ret < 0)
876 			return -ECANCELED;
877 
878 		if (map.m_len == 0) {
879 			cur_lblk_off++;
880 			continue;
881 		}
882 
883 		if (ret == 0) {
884 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
885 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
886 			lrange.fc_len = cpu_to_le32(map.m_len);
887 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
888 					    sizeof(lrange), (u8 *)&lrange, crc))
889 				return -ENOSPC;
890 		} else {
891 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
892 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
893 			ex->ee_block = cpu_to_le32(map.m_lblk);
894 			ex->ee_len = cpu_to_le16(map.m_len);
895 			ext4_ext_store_pblock(ex, map.m_pblk);
896 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
897 				ext4_ext_mark_unwritten(ex);
898 			else
899 				ext4_ext_mark_initialized(ex);
900 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
901 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
902 				return -ENOSPC;
903 		}
904 
905 		cur_lblk_off += map.m_len;
906 	}
907 
908 	return 0;
909 }
910 
911 
912 /* Submit data for all the fast commit inodes */
913 static int ext4_fc_submit_inode_data_all(journal_t *journal)
914 {
915 	struct super_block *sb = (struct super_block *)(journal->j_private);
916 	struct ext4_sb_info *sbi = EXT4_SB(sb);
917 	struct ext4_inode_info *ei;
918 	struct list_head *pos;
919 	int ret = 0;
920 
921 	spin_lock(&sbi->s_fc_lock);
922 	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
923 	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
924 		ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
925 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
926 		while (atomic_read(&ei->i_fc_updates)) {
927 			DEFINE_WAIT(wait);
928 
929 			prepare_to_wait(&ei->i_fc_wait, &wait,
930 						TASK_UNINTERRUPTIBLE);
931 			if (atomic_read(&ei->i_fc_updates)) {
932 				spin_unlock(&sbi->s_fc_lock);
933 				schedule();
934 				spin_lock(&sbi->s_fc_lock);
935 			}
936 			finish_wait(&ei->i_fc_wait, &wait);
937 		}
938 		spin_unlock(&sbi->s_fc_lock);
939 		ret = jbd2_submit_inode_data(ei->jinode);
940 		if (ret)
941 			return ret;
942 		spin_lock(&sbi->s_fc_lock);
943 	}
944 	spin_unlock(&sbi->s_fc_lock);
945 
946 	return ret;
947 }
948 
949 /* Wait for completion of data for all the fast commit inodes */
950 static int ext4_fc_wait_inode_data_all(journal_t *journal)
951 {
952 	struct super_block *sb = (struct super_block *)(journal->j_private);
953 	struct ext4_sb_info *sbi = EXT4_SB(sb);
954 	struct ext4_inode_info *pos, *n;
955 	int ret = 0;
956 
957 	spin_lock(&sbi->s_fc_lock);
958 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
959 		if (!ext4_test_inode_state(&pos->vfs_inode,
960 					   EXT4_STATE_FC_COMMITTING))
961 			continue;
962 		spin_unlock(&sbi->s_fc_lock);
963 
964 		ret = jbd2_wait_inode_data(journal, pos->jinode);
965 		if (ret)
966 			return ret;
967 		spin_lock(&sbi->s_fc_lock);
968 	}
969 	spin_unlock(&sbi->s_fc_lock);
970 
971 	return 0;
972 }
973 
974 /* Commit all the directory entry updates */
975 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
976 __acquires(&sbi->s_fc_lock)
977 __releases(&sbi->s_fc_lock)
978 {
979 	struct super_block *sb = (struct super_block *)(journal->j_private);
980 	struct ext4_sb_info *sbi = EXT4_SB(sb);
981 	struct ext4_fc_dentry_update *fc_dentry;
982 	struct inode *inode;
983 	struct list_head *pos, *n, *fcd_pos, *fcd_n;
984 	struct ext4_inode_info *ei;
985 	int ret;
986 
987 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
988 		return 0;
989 	list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
990 		fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
991 					fcd_list);
992 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
993 			spin_unlock(&sbi->s_fc_lock);
994 			if (!ext4_fc_add_dentry_tlv(
995 				sb, fc_dentry->fcd_op,
996 				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
997 				fc_dentry->fcd_name.len,
998 				fc_dentry->fcd_name.name, crc)) {
999 				ret = -ENOSPC;
1000 				goto lock_and_exit;
1001 			}
1002 			spin_lock(&sbi->s_fc_lock);
1003 			continue;
1004 		}
1005 
1006 		inode = NULL;
1007 		list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1008 			ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
1009 			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1010 				inode = &ei->vfs_inode;
1011 				break;
1012 			}
1013 		}
1014 		/*
1015 		 * If we don't find inode in our list, then it was deleted,
1016 		 * in which case, we don't need to record it's create tag.
1017 		 */
1018 		if (!inode)
1019 			continue;
1020 		spin_unlock(&sbi->s_fc_lock);
1021 
1022 		/*
1023 		 * We first write the inode and then the create dirent. This
1024 		 * allows the recovery code to create an unnamed inode first
1025 		 * and then link it to a directory entry. This allows us
1026 		 * to use namei.c routines almost as is and simplifies
1027 		 * the recovery code.
1028 		 */
1029 		ret = ext4_fc_write_inode(inode, crc);
1030 		if (ret)
1031 			goto lock_and_exit;
1032 
1033 		ret = ext4_fc_write_inode_data(inode, crc);
1034 		if (ret)
1035 			goto lock_and_exit;
1036 
1037 		if (!ext4_fc_add_dentry_tlv(
1038 			sb, fc_dentry->fcd_op,
1039 			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1040 			fc_dentry->fcd_name.len,
1041 			fc_dentry->fcd_name.name, crc)) {
1042 			ret = -ENOSPC;
1043 			goto lock_and_exit;
1044 		}
1045 
1046 		spin_lock(&sbi->s_fc_lock);
1047 	}
1048 	return 0;
1049 lock_and_exit:
1050 	spin_lock(&sbi->s_fc_lock);
1051 	return ret;
1052 }
1053 
1054 static int ext4_fc_perform_commit(journal_t *journal)
1055 {
1056 	struct super_block *sb = (struct super_block *)(journal->j_private);
1057 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1058 	struct ext4_inode_info *iter;
1059 	struct ext4_fc_head head;
1060 	struct list_head *pos;
1061 	struct inode *inode;
1062 	struct blk_plug plug;
1063 	int ret = 0;
1064 	u32 crc = 0;
1065 
1066 	ret = ext4_fc_submit_inode_data_all(journal);
1067 	if (ret)
1068 		return ret;
1069 
1070 	ret = ext4_fc_wait_inode_data_all(journal);
1071 	if (ret)
1072 		return ret;
1073 
1074 	/*
1075 	 * If file system device is different from journal device, issue a cache
1076 	 * flush before we start writing fast commit blocks.
1077 	 */
1078 	if (journal->j_fs_dev != journal->j_dev)
1079 		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
1080 
1081 	blk_start_plug(&plug);
1082 	if (sbi->s_fc_bytes == 0) {
1083 		/*
1084 		 * Add a head tag only if this is the first fast commit
1085 		 * in this TID.
1086 		 */
1087 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1088 		head.fc_tid = cpu_to_le32(
1089 			sbi->s_journal->j_running_transaction->t_tid);
1090 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1091 			(u8 *)&head, &crc))
1092 			goto out;
1093 	}
1094 
1095 	spin_lock(&sbi->s_fc_lock);
1096 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1097 	if (ret) {
1098 		spin_unlock(&sbi->s_fc_lock);
1099 		goto out;
1100 	}
1101 
1102 	list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1103 		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1104 		inode = &iter->vfs_inode;
1105 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1106 			continue;
1107 
1108 		spin_unlock(&sbi->s_fc_lock);
1109 		ret = ext4_fc_write_inode_data(inode, &crc);
1110 		if (ret)
1111 			goto out;
1112 		ret = ext4_fc_write_inode(inode, &crc);
1113 		if (ret)
1114 			goto out;
1115 		spin_lock(&sbi->s_fc_lock);
1116 	}
1117 	spin_unlock(&sbi->s_fc_lock);
1118 
1119 	ret = ext4_fc_write_tail(sb, crc);
1120 
1121 out:
1122 	blk_finish_plug(&plug);
1123 	return ret;
1124 }
1125 
1126 /*
1127  * The main commit entry point. Performs a fast commit for transaction
1128  * commit_tid if needed. If it's not possible to perform a fast commit
1129  * due to various reasons, we fall back to full commit. Returns 0
1130  * on success, error otherwise.
1131  */
1132 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1133 {
1134 	struct super_block *sb = (struct super_block *)(journal->j_private);
1135 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1136 	int nblks = 0, ret, bsize = journal->j_blocksize;
1137 	int subtid = atomic_read(&sbi->s_fc_subtid);
1138 	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1139 	ktime_t start_time, commit_time;
1140 
1141 	trace_ext4_fc_commit_start(sb);
1142 
1143 	start_time = ktime_get();
1144 
1145 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1146 		(ext4_fc_is_ineligible(sb))) {
1147 		reason = EXT4_FC_REASON_INELIGIBLE;
1148 		goto out;
1149 	}
1150 
1151 restart_fc:
1152 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1153 	if (ret == -EALREADY) {
1154 		/* There was an ongoing commit, check if we need to restart */
1155 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1156 			commit_tid > journal->j_commit_sequence)
1157 			goto restart_fc;
1158 		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1159 		goto out;
1160 	} else if (ret) {
1161 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1162 		reason = EXT4_FC_REASON_FC_START_FAILED;
1163 		goto out;
1164 	}
1165 
1166 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1167 	ret = ext4_fc_perform_commit(journal);
1168 	if (ret < 0) {
1169 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1170 		reason = EXT4_FC_REASON_FC_FAILED;
1171 		goto out;
1172 	}
1173 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1174 	ret = jbd2_fc_wait_bufs(journal, nblks);
1175 	if (ret < 0) {
1176 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177 		reason = EXT4_FC_REASON_FC_FAILED;
1178 		goto out;
1179 	}
1180 	atomic_inc(&sbi->s_fc_subtid);
1181 	jbd2_fc_end_commit(journal);
1182 out:
1183 	/* Has any ineligible update happened since we started? */
1184 	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1185 		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1186 		reason = EXT4_FC_REASON_INELIGIBLE;
1187 	}
1188 
1189 	spin_lock(&sbi->s_fc_lock);
1190 	if (reason != EXT4_FC_REASON_OK &&
1191 		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1192 		sbi->s_fc_stats.fc_ineligible_commits++;
1193 	} else {
1194 		sbi->s_fc_stats.fc_num_commits++;
1195 		sbi->s_fc_stats.fc_numblks += nblks;
1196 	}
1197 	spin_unlock(&sbi->s_fc_lock);
1198 	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1199 	trace_ext4_fc_commit_stop(sb, nblks, reason);
1200 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1201 	/*
1202 	 * weight the commit time higher than the average time so we don't
1203 	 * react too strongly to vast changes in the commit time
1204 	 */
1205 	if (likely(sbi->s_fc_avg_commit_time))
1206 		sbi->s_fc_avg_commit_time = (commit_time +
1207 				sbi->s_fc_avg_commit_time * 3) / 4;
1208 	else
1209 		sbi->s_fc_avg_commit_time = commit_time;
1210 	jbd_debug(1,
1211 		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
1212 		nblks, reason, subtid);
1213 	if (reason == EXT4_FC_REASON_FC_FAILED)
1214 		return jbd2_fc_end_commit_fallback(journal);
1215 	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1216 		reason == EXT4_FC_REASON_INELIGIBLE)
1217 		return jbd2_complete_transaction(journal, commit_tid);
1218 	return 0;
1219 }
1220 
1221 /*
1222  * Fast commit cleanup routine. This is called after every fast commit and
1223  * full commit. full is true if we are called after a full commit.
1224  */
1225 static void ext4_fc_cleanup(journal_t *journal, int full)
1226 {
1227 	struct super_block *sb = journal->j_private;
1228 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1229 	struct ext4_inode_info *iter;
1230 	struct ext4_fc_dentry_update *fc_dentry;
1231 	struct list_head *pos, *n;
1232 
1233 	if (full && sbi->s_fc_bh)
1234 		sbi->s_fc_bh = NULL;
1235 
1236 	jbd2_fc_release_bufs(journal);
1237 
1238 	spin_lock(&sbi->s_fc_lock);
1239 	list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1240 		iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1241 		list_del_init(&iter->i_fc_list);
1242 		ext4_clear_inode_state(&iter->vfs_inode,
1243 				       EXT4_STATE_FC_COMMITTING);
1244 		ext4_fc_reset_inode(&iter->vfs_inode);
1245 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1246 		smp_mb();
1247 #if (BITS_PER_LONG < 64)
1248 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1249 #else
1250 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1251 #endif
1252 	}
1253 
1254 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1255 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1256 					     struct ext4_fc_dentry_update,
1257 					     fcd_list);
1258 		list_del_init(&fc_dentry->fcd_list);
1259 		spin_unlock(&sbi->s_fc_lock);
1260 
1261 		if (fc_dentry->fcd_name.name &&
1262 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1263 			kfree(fc_dentry->fcd_name.name);
1264 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1265 		spin_lock(&sbi->s_fc_lock);
1266 	}
1267 
1268 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1269 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1270 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1271 				&sbi->s_fc_q[FC_Q_STAGING]);
1272 
1273 	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1274 	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1275 
1276 	if (full)
1277 		sbi->s_fc_bytes = 0;
1278 	spin_unlock(&sbi->s_fc_lock);
1279 	trace_ext4_fc_stats(sb);
1280 }
1281 
1282 /* Ext4 Replay Path Routines */
1283 
1284 /* Get length of a particular tlv */
1285 static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
1286 {
1287 	return le16_to_cpu(tl->fc_len);
1288 }
1289 
1290 /* Get a pointer to "value" of a tlv */
1291 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
1292 {
1293 	return (u8 *)tl + sizeof(*tl);
1294 }
1295 
/* Decoded form of a dentry TLV, shared by the dentry replay routines */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of dname in bytes */
	int ino;		/* inode number the dentry refers to */
	int inode_len;
	char *dname;		/* name bytes, pointing into the TLV value */
};
1301 
1302 static inline void tl_to_darg(struct dentry_info_args *darg,
1303 				struct  ext4_fc_tl *tl)
1304 {
1305 	struct ext4_fc_dentry_info *fcd;
1306 
1307 	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
1308 
1309 	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
1310 	darg->ino = le32_to_cpu(fcd->fc_ino);
1311 	darg->dname = fcd->fc_dname;
1312 	darg->dname_len = ext4_fc_tag_len(tl) -
1313 			sizeof(struct ext4_fc_dentry_info);
1314 }
1315 
1316 /* Unlink replay function */
1317 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
1318 {
1319 	struct inode *inode, *old_parent;
1320 	struct qstr entry;
1321 	struct dentry_info_args darg;
1322 	int ret = 0;
1323 
1324 	tl_to_darg(&darg, tl);
1325 
1326 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1327 			darg.parent_ino, darg.dname_len);
1328 
1329 	entry.name = darg.dname;
1330 	entry.len = darg.dname_len;
1331 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1332 
1333 	if (IS_ERR_OR_NULL(inode)) {
1334 		jbd_debug(1, "Inode %d not found", darg.ino);
1335 		return 0;
1336 	}
1337 
1338 	old_parent = ext4_iget(sb, darg.parent_ino,
1339 				EXT4_IGET_NORMAL);
1340 	if (IS_ERR_OR_NULL(old_parent)) {
1341 		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1342 		iput(inode);
1343 		return 0;
1344 	}
1345 
1346 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1347 	/* -ENOENT ok coz it might not exist anymore. */
1348 	if (ret == -ENOENT)
1349 		ret = 0;
1350 	iput(old_parent);
1351 	iput(inode);
1352 	return ret;
1353 }
1354 
1355 static int ext4_fc_replay_link_internal(struct super_block *sb,
1356 				struct dentry_info_args *darg,
1357 				struct inode *inode)
1358 {
1359 	struct inode *dir = NULL;
1360 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1361 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1362 	int ret = 0;
1363 
1364 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1365 	if (IS_ERR(dir)) {
1366 		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1367 		dir = NULL;
1368 		goto out;
1369 	}
1370 
1371 	dentry_dir = d_obtain_alias(dir);
1372 	if (IS_ERR(dentry_dir)) {
1373 		jbd_debug(1, "Failed to obtain dentry");
1374 		dentry_dir = NULL;
1375 		goto out;
1376 	}
1377 
1378 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1379 	if (!dentry_inode) {
1380 		jbd_debug(1, "Inode dentry not created.");
1381 		ret = -ENOMEM;
1382 		goto out;
1383 	}
1384 
1385 	ret = __ext4_link(dir, inode, dentry_inode);
1386 	/*
1387 	 * It's possible that link already existed since data blocks
1388 	 * for the dir in question got persisted before we crashed OR
1389 	 * we replayed this tag and crashed before the entire replay
1390 	 * could complete.
1391 	 */
1392 	if (ret && ret != -EEXIST) {
1393 		jbd_debug(1, "Failed to link\n");
1394 		goto out;
1395 	}
1396 
1397 	ret = 0;
1398 out:
1399 	if (dentry_dir) {
1400 		d_drop(dentry_dir);
1401 		dput(dentry_dir);
1402 	} else if (dir) {
1403 		iput(dir);
1404 	}
1405 	if (dentry_inode) {
1406 		d_drop(dentry_inode);
1407 		dput(dentry_inode);
1408 	}
1409 
1410 	return ret;
1411 }
1412 
1413 /* Link replay function */
1414 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
1415 {
1416 	struct inode *inode;
1417 	struct dentry_info_args darg;
1418 	int ret = 0;
1419 
1420 	tl_to_darg(&darg, tl);
1421 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1422 			darg.parent_ino, darg.dname_len);
1423 
1424 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1425 	if (IS_ERR_OR_NULL(inode)) {
1426 		jbd_debug(1, "Inode not found.");
1427 		return 0;
1428 	}
1429 
1430 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1431 	iput(inode);
1432 	return ret;
1433 }
1434 
1435 /*
1436  * Record all the modified inodes during replay. We use this later to setup
1437  * block bitmaps correctly.
1438  */
1439 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1440 {
1441 	struct ext4_fc_replay_state *state;
1442 	int i;
1443 
1444 	state = &EXT4_SB(sb)->s_fc_replay_state;
1445 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1446 		if (state->fc_modified_inodes[i] == ino)
1447 			return 0;
1448 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1449 		state->fc_modified_inodes_size +=
1450 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1451 		state->fc_modified_inodes = krealloc(
1452 					state->fc_modified_inodes, sizeof(int) *
1453 					state->fc_modified_inodes_size,
1454 					GFP_KERNEL);
1455 		if (!state->fc_modified_inodes)
1456 			return -ENOMEM;
1457 	}
1458 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1459 	return 0;
1460 }
1461 
1462 /*
1463  * Inode replay function
1464  */
1465 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
1466 {
1467 	struct ext4_fc_inode *fc_inode;
1468 	struct ext4_inode *raw_inode;
1469 	struct ext4_inode *raw_fc_inode;
1470 	struct inode *inode = NULL;
1471 	struct ext4_iloc iloc;
1472 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1473 	struct ext4_extent_header *eh;
1474 
1475 	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
1476 
1477 	ino = le32_to_cpu(fc_inode->fc_ino);
1478 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1479 
1480 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1481 	if (!IS_ERR_OR_NULL(inode)) {
1482 		ext4_ext_clear_bb(inode);
1483 		iput(inode);
1484 	}
1485 
1486 	ext4_fc_record_modified_inode(sb, ino);
1487 
1488 	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
1489 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1490 	if (ret)
1491 		goto out;
1492 
1493 	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
1494 	raw_inode = ext4_raw_inode(&iloc);
1495 
1496 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1497 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1498 		inode_len - offsetof(struct ext4_inode, i_generation));
1499 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1500 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1501 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1502 			memset(eh, 0, sizeof(*eh));
1503 			eh->eh_magic = EXT4_EXT_MAGIC;
1504 			eh->eh_max = cpu_to_le16(
1505 				(sizeof(raw_inode->i_block) -
1506 				 sizeof(struct ext4_extent_header))
1507 				 / sizeof(struct ext4_extent));
1508 		}
1509 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1510 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1511 			sizeof(raw_inode->i_block));
1512 	}
1513 
1514 	/* Immediately update the inode on disk. */
1515 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1516 	if (ret)
1517 		goto out;
1518 	ret = sync_dirty_buffer(iloc.bh);
1519 	if (ret)
1520 		goto out;
1521 	ret = ext4_mark_inode_used(sb, ino);
1522 	if (ret)
1523 		goto out;
1524 
1525 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1526 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1527 	if (IS_ERR_OR_NULL(inode)) {
1528 		jbd_debug(1, "Inode not found.");
1529 		return -EFSCORRUPTED;
1530 	}
1531 
1532 	/*
1533 	 * Our allocator could have made different decisions than before
1534 	 * crashing. This should be fixed but until then, we calculate
1535 	 * the number of blocks the inode.
1536 	 */
1537 	ext4_ext_replay_set_iblocks(inode);
1538 
1539 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1540 	ext4_reset_inode_seed(inode);
1541 
1542 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1543 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1544 	sync_dirty_buffer(iloc.bh);
1545 	brelse(iloc.bh);
1546 out:
1547 	iput(inode);
1548 	if (!ret)
1549 		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1550 
1551 	return 0;
1552 }
1553 
1554 /*
1555  * Dentry create replay function.
1556  *
1557  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1558  * inode for which we are trying to create a dentry here, should already have
1559  * been replayed before we start here.
1560  */
1561 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
1562 {
1563 	int ret = 0;
1564 	struct inode *inode = NULL;
1565 	struct inode *dir = NULL;
1566 	struct dentry_info_args darg;
1567 
1568 	tl_to_darg(&darg, tl);
1569 
1570 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1571 			darg.parent_ino, darg.dname_len);
1572 
1573 	/* This takes care of update group descriptor and other metadata */
1574 	ret = ext4_mark_inode_used(sb, darg.ino);
1575 	if (ret)
1576 		goto out;
1577 
1578 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1579 	if (IS_ERR_OR_NULL(inode)) {
1580 		jbd_debug(1, "inode %d not found.", darg.ino);
1581 		inode = NULL;
1582 		ret = -EINVAL;
1583 		goto out;
1584 	}
1585 
1586 	if (S_ISDIR(inode->i_mode)) {
1587 		/*
1588 		 * If we are creating a directory, we need to make sure that the
1589 		 * dot and dot dot dirents are setup properly.
1590 		 */
1591 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1592 		if (IS_ERR_OR_NULL(dir)) {
1593 			jbd_debug(1, "Dir %d not found.", darg.ino);
1594 			goto out;
1595 		}
1596 		ret = ext4_init_new_dir(NULL, dir, inode);
1597 		iput(dir);
1598 		if (ret) {
1599 			ret = 0;
1600 			goto out;
1601 		}
1602 	}
1603 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1604 	if (ret)
1605 		goto out;
1606 	set_nlink(inode, 1);
1607 	ext4_mark_inode_dirty(NULL, inode);
1608 out:
1609 	if (inode)
1610 		iput(inode);
1611 	return ret;
1612 }
1613 
1614 /*
1615  * Record physical disk regions which are in use as per fast commit area. Our
1616  * simple replay phase allocator excludes these regions from allocation.
1617  */
1618 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1619 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1620 {
1621 	struct ext4_fc_replay_state *state;
1622 	struct ext4_fc_alloc_region *region;
1623 
1624 	state = &EXT4_SB(sb)->s_fc_replay_state;
1625 	if (state->fc_regions_used == state->fc_regions_size) {
1626 		state->fc_regions_size +=
1627 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1628 		state->fc_regions = krealloc(
1629 					state->fc_regions,
1630 					state->fc_regions_size *
1631 					sizeof(struct ext4_fc_alloc_region),
1632 					GFP_KERNEL);
1633 		if (!state->fc_regions)
1634 			return -ENOMEM;
1635 	}
1636 	region = &state->fc_regions[state->fc_regions_used++];
1637 	region->ino = ino;
1638 	region->lblk = lblk;
1639 	region->pblk = pblk;
1640 	region->len = len;
1641 
1642 	return 0;
1643 }
1644 
1645 /* Replay add range tag */
1646 static int ext4_fc_replay_add_range(struct super_block *sb,
1647 				struct ext4_fc_tl *tl)
1648 {
1649 	struct ext4_fc_add_range *fc_add_ex;
1650 	struct ext4_extent newex, *ex;
1651 	struct inode *inode;
1652 	ext4_lblk_t start, cur;
1653 	int remaining, len;
1654 	ext4_fsblk_t start_pblk;
1655 	struct ext4_map_blocks map;
1656 	struct ext4_ext_path *path = NULL;
1657 	int ret;
1658 
1659 	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
1660 	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
1661 
1662 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1663 		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
1664 		ext4_ext_get_actual_len(ex));
1665 
1666 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
1667 				EXT4_IGET_NORMAL);
1668 	if (IS_ERR_OR_NULL(inode)) {
1669 		jbd_debug(1, "Inode not found.");
1670 		return 0;
1671 	}
1672 
1673 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1674 
1675 	start = le32_to_cpu(ex->ee_block);
1676 	start_pblk = ext4_ext_pblock(ex);
1677 	len = ext4_ext_get_actual_len(ex);
1678 
1679 	cur = start;
1680 	remaining = len;
1681 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1682 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1683 		  inode->i_ino);
1684 
1685 	while (remaining > 0) {
1686 		map.m_lblk = cur;
1687 		map.m_len = remaining;
1688 		map.m_pblk = 0;
1689 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1690 
1691 		if (ret < 0) {
1692 			iput(inode);
1693 			return 0;
1694 		}
1695 
1696 		if (ret == 0) {
1697 			/* Range is not mapped */
1698 			path = ext4_find_extent(inode, cur, NULL, 0);
1699 			if (IS_ERR(path)) {
1700 				iput(inode);
1701 				return 0;
1702 			}
1703 			memset(&newex, 0, sizeof(newex));
1704 			newex.ee_block = cpu_to_le32(cur);
1705 			ext4_ext_store_pblock(
1706 				&newex, start_pblk + cur - start);
1707 			newex.ee_len = cpu_to_le16(map.m_len);
1708 			if (ext4_ext_is_unwritten(ex))
1709 				ext4_ext_mark_unwritten(&newex);
1710 			down_write(&EXT4_I(inode)->i_data_sem);
1711 			ret = ext4_ext_insert_extent(
1712 				NULL, inode, &path, &newex, 0);
1713 			up_write((&EXT4_I(inode)->i_data_sem));
1714 			ext4_ext_drop_refs(path);
1715 			kfree(path);
1716 			if (ret) {
1717 				iput(inode);
1718 				return 0;
1719 			}
1720 			goto next;
1721 		}
1722 
1723 		if (start_pblk + cur - start != map.m_pblk) {
1724 			/*
1725 			 * Logical to physical mapping changed. This can happen
1726 			 * if this range was removed and then reallocated to
1727 			 * map to new physical blocks during a fast commit.
1728 			 */
1729 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1730 					ext4_ext_is_unwritten(ex),
1731 					start_pblk + cur - start);
1732 			if (ret) {
1733 				iput(inode);
1734 				return 0;
1735 			}
1736 			/*
1737 			 * Mark the old blocks as free since they aren't used
1738 			 * anymore. We maintain an array of all the modified
1739 			 * inodes. In case these blocks are still used at either
1740 			 * a different logical range in the same inode or in
1741 			 * some different inode, we will mark them as allocated
1742 			 * at the end of the FC replay using our array of
1743 			 * modified inodes.
1744 			 */
1745 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1746 			goto next;
1747 		}
1748 
1749 		/* Range is mapped and needs a state change */
1750 		jbd_debug(1, "Converting from %d to %d %lld",
1751 				map.m_flags & EXT4_MAP_UNWRITTEN,
1752 			ext4_ext_is_unwritten(ex), map.m_pblk);
1753 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1754 					ext4_ext_is_unwritten(ex), map.m_pblk);
1755 		if (ret) {
1756 			iput(inode);
1757 			return 0;
1758 		}
1759 		/*
1760 		 * We may have split the extent tree while toggling the state.
1761 		 * Try to shrink the extent tree now.
1762 		 */
1763 		ext4_ext_replay_shrink_inode(inode, start + len);
1764 next:
1765 		cur += map.m_len;
1766 		remaining -= map.m_len;
1767 	}
1768 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1769 					sb->s_blocksize_bits);
1770 	iput(inode);
1771 	return 0;
1772 }
1773 
1774 /* Replay DEL_RANGE tag */
1775 static int
1776 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
1777 {
1778 	struct inode *inode;
1779 	struct ext4_fc_del_range *lrange;
1780 	struct ext4_map_blocks map;
1781 	ext4_lblk_t cur, remaining;
1782 	int ret;
1783 
1784 	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
1785 	cur = le32_to_cpu(lrange->fc_lblk);
1786 	remaining = le32_to_cpu(lrange->fc_len);
1787 
1788 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1789 		le32_to_cpu(lrange->fc_ino), cur, remaining);
1790 
1791 	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
1792 	if (IS_ERR_OR_NULL(inode)) {
1793 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
1794 		return 0;
1795 	}
1796 
1797 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1798 
1799 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1800 			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
1801 			le32_to_cpu(lrange->fc_len));
1802 	while (remaining > 0) {
1803 		map.m_lblk = cur;
1804 		map.m_len = remaining;
1805 
1806 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1807 		if (ret < 0) {
1808 			iput(inode);
1809 			return 0;
1810 		}
1811 		if (ret > 0) {
1812 			remaining -= ret;
1813 			cur += ret;
1814 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1815 		} else {
1816 			remaining -= map.m_len;
1817 			cur += map.m_len;
1818 		}
1819 	}
1820 
1821 	ret = ext4_punch_hole(inode,
1822 		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
1823 		le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
1824 	if (ret)
1825 		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1826 	ext4_ext_replay_shrink_inode(inode,
1827 		i_size_read(inode) >> sb->s_blocksize_bits);
1828 	ext4_mark_inode_dirty(NULL, inode);
1829 	iput(inode);
1830 
1831 	return 0;
1832 }
1833 
1834 static inline const char *tag2str(u16 tag)
1835 {
1836 	switch (tag) {
1837 	case EXT4_FC_TAG_LINK:
1838 		return "TAG_ADD_ENTRY";
1839 	case EXT4_FC_TAG_UNLINK:
1840 		return "TAG_DEL_ENTRY";
1841 	case EXT4_FC_TAG_ADD_RANGE:
1842 		return "TAG_ADD_RANGE";
1843 	case EXT4_FC_TAG_CREAT:
1844 		return "TAG_CREAT_DENTRY";
1845 	case EXT4_FC_TAG_DEL_RANGE:
1846 		return "TAG_DEL_RANGE";
1847 	case EXT4_FC_TAG_INODE:
1848 		return "TAG_INODE";
1849 	case EXT4_FC_TAG_PAD:
1850 		return "TAG_PAD";
1851 	case EXT4_FC_TAG_TAIL:
1852 		return "TAG_TAIL";
1853 	case EXT4_FC_TAG_HEAD:
1854 		return "TAG_HEAD";
1855 	default:
1856 		return "TAG_ERROR";
1857 	}
1858 }
1859 
1860 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1861 {
1862 	struct ext4_fc_replay_state *state;
1863 	struct inode *inode;
1864 	struct ext4_ext_path *path = NULL;
1865 	struct ext4_map_blocks map;
1866 	int i, ret, j;
1867 	ext4_lblk_t cur, end;
1868 
1869 	state = &EXT4_SB(sb)->s_fc_replay_state;
1870 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1871 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1872 			EXT4_IGET_NORMAL);
1873 		if (IS_ERR_OR_NULL(inode)) {
1874 			jbd_debug(1, "Inode %d not found.",
1875 				state->fc_modified_inodes[i]);
1876 			continue;
1877 		}
1878 		cur = 0;
1879 		end = EXT_MAX_BLOCKS;
1880 		while (cur < end) {
1881 			map.m_lblk = cur;
1882 			map.m_len = end - cur;
1883 
1884 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1885 			if (ret < 0)
1886 				break;
1887 
1888 			if (ret > 0) {
1889 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1890 				if (!IS_ERR_OR_NULL(path)) {
1891 					for (j = 0; j < path->p_depth; j++)
1892 						ext4_mb_mark_bb(inode->i_sb,
1893 							path[j].p_block, 1, 1);
1894 					ext4_ext_drop_refs(path);
1895 					kfree(path);
1896 				}
1897 				cur += ret;
1898 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1899 							map.m_len, 1);
1900 			} else {
1901 				cur = cur + (map.m_len ? map.m_len : 1);
1902 			}
1903 		}
1904 		iput(inode);
1905 	}
1906 }
1907 
1908 /*
1909  * Check if block is in excluded regions for block allocation. The simple
1910  * allocator that runs during replay phase is calls this function to see
1911  * if it is okay to use a block.
1912  */
1913 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1914 {
1915 	int i;
1916 	struct ext4_fc_replay_state *state;
1917 
1918 	state = &EXT4_SB(sb)->s_fc_replay_state;
1919 	for (i = 0; i < state->fc_regions_valid; i++) {
1920 		if (state->fc_regions[i].ino == 0 ||
1921 			state->fc_regions[i].len == 0)
1922 			continue;
1923 		if (blk >= state->fc_regions[i].pblk &&
1924 		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1925 			return true;
1926 	}
1927 	return false;
1928 }
1929 
1930 /* Cleanup function called after replay */
1931 void ext4_fc_replay_cleanup(struct super_block *sb)
1932 {
1933 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1934 
1935 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1936 	kfree(sbi->s_fc_replay_state.fc_regions);
1937 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1938 }
1939 
1940 /*
1941  * Recovery Scan phase handler
1942  *
1943  * This function is called during the scan phase and is responsible
1944  * for doing following things:
1945  * - Make sure the fast commit area has valid tags for replay
1946  * - Count number of tags that need to be replayed by the replay handler
1947  * - Verify CRC
1948  * - Create a list of excluded blocks for allocation during replay phase
1949  *
1950  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1951  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1952  * to indicate that scan has finished and JBD2 can now start replay phase.
1953  * It returns a negative error to indicate that there was an error. At the end
1954  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1955  * to indicate the number of tags that need to replayed during the replay phase.
1956  */
1957 static int ext4_fc_replay_scan(journal_t *journal,
1958 				struct buffer_head *bh, int off,
1959 				tid_t expected_tid)
1960 {
1961 	struct super_block *sb = journal->j_private;
1962 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1963 	struct ext4_fc_replay_state *state;
1964 	int ret = JBD2_FC_REPLAY_CONTINUE;
1965 	struct ext4_fc_add_range *ext;
1966 	struct ext4_fc_tl *tl;
1967 	struct ext4_fc_tail *tail;
1968 	__u8 *start, *end;
1969 	struct ext4_fc_head *head;
1970 	struct ext4_extent *ex;
1971 
1972 	state = &sbi->s_fc_replay_state;
1973 
1974 	start = (u8 *)bh->b_data;
1975 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1976 
1977 	if (state->fc_replay_expected_off == 0) {
1978 		state->fc_cur_tag = 0;
1979 		state->fc_replay_num_tags = 0;
1980 		state->fc_crc = 0;
1981 		state->fc_regions = NULL;
1982 		state->fc_regions_valid = state->fc_regions_used =
1983 			state->fc_regions_size = 0;
1984 		/* Check if we can stop early */
1985 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1986 			!= EXT4_FC_TAG_HEAD)
1987 			return 0;
1988 	}
1989 
1990 	if (off != state->fc_replay_expected_off) {
1991 		ret = -EFSCORRUPTED;
1992 		goto out_err;
1993 	}
1994 
1995 	state->fc_replay_expected_off++;
1996 	fc_for_each_tl(start, end, tl) {
1997 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1998 			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
1999 		switch (le16_to_cpu(tl->fc_tag)) {
2000 		case EXT4_FC_TAG_ADD_RANGE:
2001 			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
2002 			ex = (struct ext4_extent *)&ext->fc_ex;
2003 			ret = ext4_fc_record_regions(sb,
2004 				le32_to_cpu(ext->fc_ino),
2005 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2006 				ext4_ext_get_actual_len(ex));
2007 			if (ret < 0)
2008 				break;
2009 			ret = JBD2_FC_REPLAY_CONTINUE;
2010 			fallthrough;
2011 		case EXT4_FC_TAG_DEL_RANGE:
2012 		case EXT4_FC_TAG_LINK:
2013 		case EXT4_FC_TAG_UNLINK:
2014 		case EXT4_FC_TAG_CREAT:
2015 		case EXT4_FC_TAG_INODE:
2016 		case EXT4_FC_TAG_PAD:
2017 			state->fc_cur_tag++;
2018 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
2019 					sizeof(*tl) + ext4_fc_tag_len(tl));
2020 			break;
2021 		case EXT4_FC_TAG_TAIL:
2022 			state->fc_cur_tag++;
2023 			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2024 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
2025 						sizeof(*tl) +
2026 						offsetof(struct ext4_fc_tail,
2027 						fc_crc));
2028 			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
2029 				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
2030 				state->fc_replay_num_tags = state->fc_cur_tag;
2031 				state->fc_regions_valid =
2032 					state->fc_regions_used;
2033 			} else {
2034 				ret = state->fc_replay_num_tags ?
2035 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2036 			}
2037 			state->fc_crc = 0;
2038 			break;
2039 		case EXT4_FC_TAG_HEAD:
2040 			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
2041 			if (le32_to_cpu(head->fc_features) &
2042 				~EXT4_FC_SUPPORTED_FEATURES) {
2043 				ret = -EOPNOTSUPP;
2044 				break;
2045 			}
2046 			if (le32_to_cpu(head->fc_tid) != expected_tid) {
2047 				ret = JBD2_FC_REPLAY_STOP;
2048 				break;
2049 			}
2050 			state->fc_cur_tag++;
2051 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
2052 					sizeof(*tl) + ext4_fc_tag_len(tl));
2053 			break;
2054 		default:
2055 			ret = state->fc_replay_num_tags ?
2056 				JBD2_FC_REPLAY_STOP : -ECANCELED;
2057 		}
2058 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2059 			break;
2060 	}
2061 
2062 out_err:
2063 	trace_ext4_fc_replay_scan(sb, ret, off);
2064 	return ret;
2065 }
2066 
2067 /*
2068  * Main recovery path entry point.
2069  * The meaning of return codes is similar as above.
2070  */
2071 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2072 				enum passtype pass, int off, tid_t expected_tid)
2073 {
2074 	struct super_block *sb = journal->j_private;
2075 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2076 	struct ext4_fc_tl *tl;
2077 	__u8 *start, *end;
2078 	int ret = JBD2_FC_REPLAY_CONTINUE;
2079 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2080 	struct ext4_fc_tail *tail;
2081 
2082 	if (pass == PASS_SCAN) {
2083 		state->fc_current_pass = PASS_SCAN;
2084 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2085 	}
2086 
2087 	if (state->fc_current_pass != pass) {
2088 		state->fc_current_pass = pass;
2089 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2090 	}
2091 	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2092 		jbd_debug(1, "Replay stops\n");
2093 		ext4_fc_set_bitmaps_and_counters(sb);
2094 		return 0;
2095 	}
2096 
2097 #ifdef CONFIG_EXT4_DEBUG
2098 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2099 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2100 		return JBD2_FC_REPLAY_STOP;
2101 	}
2102 #endif
2103 
2104 	start = (u8 *)bh->b_data;
2105 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2106 
2107 	fc_for_each_tl(start, end, tl) {
2108 		if (state->fc_replay_num_tags == 0) {
2109 			ret = JBD2_FC_REPLAY_STOP;
2110 			ext4_fc_set_bitmaps_and_counters(sb);
2111 			break;
2112 		}
2113 		jbd_debug(3, "Replay phase, tag:%s\n",
2114 				tag2str(le16_to_cpu(tl->fc_tag)));
2115 		state->fc_replay_num_tags--;
2116 		switch (le16_to_cpu(tl->fc_tag)) {
2117 		case EXT4_FC_TAG_LINK:
2118 			ret = ext4_fc_replay_link(sb, tl);
2119 			break;
2120 		case EXT4_FC_TAG_UNLINK:
2121 			ret = ext4_fc_replay_unlink(sb, tl);
2122 			break;
2123 		case EXT4_FC_TAG_ADD_RANGE:
2124 			ret = ext4_fc_replay_add_range(sb, tl);
2125 			break;
2126 		case EXT4_FC_TAG_CREAT:
2127 			ret = ext4_fc_replay_create(sb, tl);
2128 			break;
2129 		case EXT4_FC_TAG_DEL_RANGE:
2130 			ret = ext4_fc_replay_del_range(sb, tl);
2131 			break;
2132 		case EXT4_FC_TAG_INODE:
2133 			ret = ext4_fc_replay_inode(sb, tl);
2134 			break;
2135 		case EXT4_FC_TAG_PAD:
2136 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2137 				ext4_fc_tag_len(tl), 0);
2138 			break;
2139 		case EXT4_FC_TAG_TAIL:
2140 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2141 				ext4_fc_tag_len(tl), 0);
2142 			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
2143 			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
2144 			break;
2145 		case EXT4_FC_TAG_HEAD:
2146 			break;
2147 		default:
2148 			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
2149 				ext4_fc_tag_len(tl), 0);
2150 			ret = -ECANCELED;
2151 			break;
2152 		}
2153 		if (ret < 0)
2154 			break;
2155 		ret = JBD2_FC_REPLAY_CONTINUE;
2156 	}
2157 	return ret;
2158 }
2159 
2160 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2161 {
2162 	/*
2163 	 * We set replay callback even if fast commit disabled because we may
2164 	 * could still have fast commit blocks that need to be replayed even if
2165 	 * fast commit has now been turned off.
2166 	 */
2167 	journal->j_fc_replay_callback = ext4_fc_replay;
2168 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2169 		return;
2170 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2171 }
2172 
/*
 * Human readable names for the reasons a fast commit was ineligible.
 *
 * ext4_fc_info_show() indexes this table with every value in
 * [0, EXT4_FC_REASON_MAX), so the entries must stay in the same order as the
 * reason codes and there must be one entry per code.
 *
 * Both the strings and the pointer slots are const so that the whole table
 * is read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2185 
2186 int ext4_fc_info_show(struct seq_file *seq, void *v)
2187 {
2188 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2189 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2190 	int i;
2191 
2192 	if (v != SEQ_START_TOKEN)
2193 		return 0;
2194 
2195 	seq_printf(seq,
2196 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2197 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2198 		   stats->fc_numblks,
2199 		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2200 	seq_puts(seq, "Ineligible reasons:\n");
2201 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2202 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2203 			stats->fc_ineligible_reason_count[i]);
2204 
2205 	return 0;
2206 }
2207 
2208 int __init ext4_fc_init_dentry_cache(void)
2209 {
2210 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2211 					   SLAB_RECLAIM_ACCOUNT);
2212 
2213 	if (ext4_fc_dentry_cachep == NULL)
2214 		return -ENOMEM;
2215 
2216 	return 0;
2217 }
2218