xref: /linux/fs/ext4/fast_commit.c (revision a4eb44a6435d6d8f9e642407a4a06f65eb90ca04)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
 * Ext4 fast commit routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
20  *
 * Fast commits are organized as a log of tag-length-value (TLV) structs (see
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
30  * - EXT4_FC_TAG_LINK		- records directory entry link
31  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
37  *
 * (C) Inode metadata (mtime / ctime etc.):
39  *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed and is instead derived during
 *				  replay.
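 *
 * On disk each of the records above is a TLV: a 16-bit tag and a 16-bit
 * length followed by fc_len bytes of value (see struct ext4_fc_tl), laid out
 * as:
 *
 *      [fc_tag: 2 bytes][fc_len: 2 bytes][value: fc_len bytes]
 *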
44  * Commit Operation
45  * ----------------
 * With fast commits, we maintain all the directory entry operations, in the
 * order in which they are issued, in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a second in-memory
 * queue of inodes that need to be committed during a fast commit. During the
 * commit operation, we commit in the following order:
51  *
 * [1] Lock inodes against further data updates by setting the COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; please read the following
 *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
 * All inode updates must call ext4_fc_start_update() before starting an
 * update. If an update is already ongoing, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
73  *
74  * Atomicity of commits
75  * --------------------
76  * In order to guarantee atomicity during the commit operation, fast commit
77  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
78  * tag contains CRC of the contents and TID of the transaction after which
79  * this fast commit should be applied. Recovery code replays fast commit
80  * logs only if there's at least 1 valid tail present. For every fast commit
81  * operation, there is 1 tail. This means, we may end up with multiple tails
82  * in the fast commit space. Here's an example:
83  *
84  * - Create a new file A and remove existing file B
85  * - fsync()
86  * - Append contents to file A
87  * - Truncate file A
88  * - fsync()
89  *
90  * The fast commit space at the end of above operations would look like this:
91  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
92  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
93  *
94  * Replay code should thus check for all the valid tails in the FC area.
95  *
96  * Fast Commit Replay Idempotence
97  * ------------------------------
98  *
 * Fast commit tags are idempotent in nature provided the recovery code follows
100  * certain rules. The guiding principle that the commit path follows while
101  * committing is that it stores the result of a particular operation instead of
102  * storing the procedure.
103  *
104  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
105  * was associated with inode 10. During fast commit, instead of storing this
106  * operation as a procedure "rename a to b", we store the resulting file system
107  * state as a "series" of outcomes:
108  *
109  * - Link dirent b to inode 10
110  * - Unlink dirent a
111  * - Inode <10> with valid refcount
112  *
 * Now when the recovery code runs, it needs to "enforce" this state on the
 * file system. This is what guarantees idempotence of fast commit replay.
115  *
116  * Let's take an example of a procedure that is not idempotent and see how fast
117  * commits make it idempotent. Consider following sequence of operations:
118  *
119  *     rm A;    mv B A;    read A
120  *  (x)     (y)        (z)
121  *
122  * (x), (y) and (z) are the points at which we can crash. If we store this
123  * sequence of operations as is then the replay is not idempotent. Let's say
124  * while in replay, we crash at (z). During the second replay, file A (which was
125  * actually created as a result of "mv B A" operation) would get deleted. Thus,
126  * file named A would be absent when we try to read A. So, this sequence of
 * operations is not idempotent. However, as mentioned above, instead of storing
 * the procedure, fast commits store the outcome of each procedure. Thus the fast
129  * commit log for above procedure would be as follows:
130  *
131  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
132  * inode 11 before the replay)
133  *
134  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
135  * (w)          (x)                    (y)          (z)
136  *
137  * If we crash at (z), we will have file A linked to inode 11. During the second
138  * replay, we will remove file A (inode 11). But we will create it back and make
139  * it point to inode 11. We won't find B, so we'll just skip that step. At this
140  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
141  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 * similarly. Thus, by converting a non-idempotent procedure into a series of
 * idempotent outcomes, fast commits ensure idempotence during replay.
144  *
145  * TODOs
146  * -----
147  *
148  * 0) Fast commit replay path hardening: Fast commit replay code should use
149  *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay and
 *    then retry recovery, we will find a file system where the fast commit
 *    area is invalid (because a new full commit would be found). In order to
 *    deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag
 *    and perform fast commit recovery even if that area is invalidated by
 *    later full commits.
158  *
159  * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has a significant performance penalty. Instead of that, we
161  *    should use ext4_fc_start/stop_update functions to start inode level
162  *    updates from ext4_journal_start/stop. Once we do that we can drop file
163  *    system locking during commit path.
164  *
165  * 2) Handle more ineligible cases.
166  */
167 
168 #include <trace/events/ext4.h>
169 static struct kmem_cache *ext4_fc_dentry_cachep;
170 
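/*
 * Illustrative sketch only (not used by the code below): how a scanner would
 * step through the TLV log described in the header comment above. 'cur'
 * points at a TLV header inside a fast commit block and 'end' bounds the
 * block; the memcpy mirrors the unaligned header reads the replay path has
 * to perform.
 */
static inline u8 *ext4_fc_example_next_tlv(u8 *cur, u8 *end)
{
	struct ext4_fc_tl tl;

	if (cur + sizeof(tl) > end)
		return NULL;
	memcpy(&tl, cur, sizeof(tl));	/* 16-bit tag + 16-bit length */
	if (cur + sizeof(tl) + le16_to_cpu(tl.fc_len) > end)
		return NULL;		/* value would overrun the block */
	/* tl.fc_tag selects a replay handler; skip over the value */
	return cur + sizeof(tl) + le16_to_cpu(tl.fc_len);
}
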
171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
172 {
173 	BUFFER_TRACE(bh, "");
174 	if (uptodate) {
175 		ext4_debug("%s: Block %lld up-to-date",
176 			   __func__, bh->b_blocknr);
177 		set_buffer_uptodate(bh);
178 	} else {
179 		ext4_debug("%s: Block %lld not up-to-date",
180 			   __func__, bh->b_blocknr);
181 		clear_buffer_uptodate(bh);
182 	}
183 
184 	unlock_buffer(bh);
185 }
186 
187 static inline void ext4_fc_reset_inode(struct inode *inode)
188 {
189 	struct ext4_inode_info *ei = EXT4_I(inode);
190 
191 	ei->i_fc_lblk_start = 0;
192 	ei->i_fc_lblk_len = 0;
193 }
194 
195 void ext4_fc_init_inode(struct inode *inode)
196 {
197 	struct ext4_inode_info *ei = EXT4_I(inode);
198 
199 	ext4_fc_reset_inode(inode);
200 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
201 	INIT_LIST_HEAD(&ei->i_fc_list);
202 	init_waitqueue_head(&ei->i_fc_wait);
203 	atomic_set(&ei->i_fc_updates, 0);
204 }
205 
206 /* This function must be called with sbi->s_fc_lock held. */
207 static void ext4_fc_wait_committing_inode(struct inode *inode)
208 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
209 {
210 	wait_queue_head_t *wq;
211 	struct ext4_inode_info *ei = EXT4_I(inode);
212 
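	/*
	 * The EXT4_STATE_* bits live in the separate i_state_flags word on
	 * 32-bit systems and are folded into the upper half of i_flags on
	 * 64-bit ones, so wait on whichever word actually holds the bit.
	 */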
213 #if (BITS_PER_LONG < 64)
214 	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
215 			EXT4_STATE_FC_COMMITTING);
216 	wq = bit_waitqueue(&ei->i_state_flags,
217 				EXT4_STATE_FC_COMMITTING);
218 #else
219 	DEFINE_WAIT_BIT(wait, &ei->i_flags,
220 			EXT4_STATE_FC_COMMITTING);
221 	wq = bit_waitqueue(&ei->i_flags,
222 				EXT4_STATE_FC_COMMITTING);
223 #endif
224 	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
225 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
226 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
227 	schedule();
228 	finish_wait(wq, &wait.wq_entry);
229 }
230 
231 /*
 * Inform Ext4's fast commit machinery about the start of an inode update
 *
 * This function is called by high-level VFS callbacks before
235  * performing any inode update. This function blocks if there's an ongoing
236  * fast commit on the inode in question.
237  */
238 void ext4_fc_start_update(struct inode *inode)
239 {
240 	struct ext4_inode_info *ei = EXT4_I(inode);
241 
242 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
243 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
244 		return;
245 
246 restart:
247 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
248 	if (list_empty(&ei->i_fc_list))
249 		goto out;
250 
251 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
252 		ext4_fc_wait_committing_inode(inode);
253 		goto restart;
254 	}
255 out:
256 	atomic_inc(&ei->i_fc_updates);
257 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
258 }
259 
260 /*
261  * Stop inode update and wake up waiting fast commits if any.
262  */
263 void ext4_fc_stop_update(struct inode *inode)
264 {
265 	struct ext4_inode_info *ei = EXT4_I(inode);
266 
267 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
268 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
269 		return;
270 
271 	if (atomic_dec_and_test(&ei->i_fc_updates))
272 		wake_up_all(&ei->i_fc_wait);
273 }
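
/*
 * Illustrative sketch only (hypothetical caller, not an ext4 entry point):
 * an inode update is bracketed by the two helpers above so that a running
 * fast commit sees either the old or the new inode state, never a partial
 * update.
 */
static inline void ext4_fc_example_update(struct inode *inode)
{
	ext4_fc_start_update(inode);	/* waits out a commit in progress */
	/* ... perform and track the inode modification here ... */
	ext4_fc_stop_update(inode);	/* wakes up a waiting fast commit */
}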
274 
275 /*
 * Remove the inode from the fast commit list. If the inode is being
 * committed, we wait until the inode commit is done.
278  */
279 void ext4_fc_del(struct inode *inode)
280 {
281 	struct ext4_inode_info *ei = EXT4_I(inode);
282 
283 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
284 	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
285 		return;
286 
287 restart:
288 	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
289 	if (list_empty(&ei->i_fc_list)) {
290 		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
291 		return;
292 	}
293 
294 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
295 		ext4_fc_wait_committing_inode(inode);
296 		goto restart;
297 	}
298 	list_del_init(&ei->i_fc_list);
299 	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
300 }
301 
302 /*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that, up to and including the
 * recorded transaction, a commit operation will result in a full jbd2 commit.
306  */
307 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
308 {
309 	struct ext4_sb_info *sbi = EXT4_SB(sb);
310 	tid_t tid;
311 
312 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
313 	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
314 		return;
315 
316 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
317 	if (handle && !IS_ERR(handle))
318 		tid = handle->h_transaction->t_tid;
319 	else {
320 		read_lock(&sbi->s_journal->j_state_lock);
321 		tid = sbi->s_journal->j_running_transaction ?
322 				sbi->s_journal->j_running_transaction->t_tid : 0;
323 		read_unlock(&sbi->s_journal->j_state_lock);
324 	}
325 	spin_lock(&sbi->s_fc_lock);
326 	if (sbi->s_fc_ineligible_tid < tid)
327 		sbi->s_fc_ineligible_tid = tid;
328 	spin_unlock(&sbi->s_fc_lock);
329 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
330 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
331 }
332 
333 /*
 * Generic fast commit tracking function. If this is the first time we are
335  * called after a full commit, we initialize fast commit fields and then call
336  * __fc_track_fn() with update = 0. If we have already been called after a full
337  * commit, we pass update = 1. Based on that, the track function can determine
338  * if it needs to track a field for the first time or if it needs to just
339  * update the previously tracked value.
340  *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
342  */
343 static int ext4_fc_track_template(
344 	handle_t *handle, struct inode *inode,
345 	int (*__fc_track_fn)(struct inode *, void *, bool),
346 	void *args, int enqueue)
347 {
348 	bool update = false;
349 	struct ext4_inode_info *ei = EXT4_I(inode);
350 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
351 	tid_t tid = 0;
352 	int ret;
353 
354 	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
355 	    (sbi->s_mount_state & EXT4_FC_REPLAY))
356 		return -EOPNOTSUPP;
357 
358 	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
359 		return -EINVAL;
360 
361 	tid = handle->h_transaction->t_tid;
362 	mutex_lock(&ei->i_fc_lock);
363 	if (tid == ei->i_sync_tid) {
364 		update = true;
365 	} else {
366 		ext4_fc_reset_inode(inode);
367 		ei->i_sync_tid = tid;
368 	}
369 	ret = __fc_track_fn(inode, args, update);
370 	mutex_unlock(&ei->i_fc_lock);
371 
372 	if (!enqueue)
373 		return ret;
374 
375 	spin_lock(&sbi->s_fc_lock);
376 	if (list_empty(&EXT4_I(inode)->i_fc_list))
377 		list_add_tail(&EXT4_I(inode)->i_fc_list,
378 				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
379 				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
380 				&sbi->s_fc_q[FC_Q_STAGING] :
381 				&sbi->s_fc_q[FC_Q_MAIN]);
382 	spin_unlock(&sbi->s_fc_lock);
383 
384 	return ret;
385 }
386 
387 struct __track_dentry_update_args {
388 	struct dentry *dentry;
389 	int op;
390 };
391 
/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
393 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
394 {
395 	struct ext4_fc_dentry_update *node;
396 	struct ext4_inode_info *ei = EXT4_I(inode);
397 	struct __track_dentry_update_args *dentry_update =
398 		(struct __track_dentry_update_args *)arg;
399 	struct dentry *dentry = dentry_update->dentry;
400 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
401 
402 	mutex_unlock(&ei->i_fc_lock);
403 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
404 	if (!node) {
405 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
406 		mutex_lock(&ei->i_fc_lock);
407 		return -ENOMEM;
408 	}
409 
410 	node->fcd_op = dentry_update->op;
411 	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
412 	node->fcd_ino = inode->i_ino;
413 	if (dentry->d_name.len > DNAME_INLINE_LEN) {
414 		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
415 		if (!node->fcd_name.name) {
416 			kmem_cache_free(ext4_fc_dentry_cachep, node);
417 			ext4_fc_mark_ineligible(inode->i_sb,
418 				EXT4_FC_REASON_NOMEM, NULL);
419 			mutex_lock(&ei->i_fc_lock);
420 			return -ENOMEM;
421 		}
422 		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
423 			dentry->d_name.len);
424 	} else {
425 		memcpy(node->fcd_iname, dentry->d_name.name,
426 			dentry->d_name.len);
427 		node->fcd_name.name = node->fcd_iname;
428 	}
429 	node->fcd_name.len = dentry->d_name.len;
430 
431 	spin_lock(&sbi->s_fc_lock);
432 	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
433 		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
434 		list_add_tail(&node->fcd_list,
435 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
436 	else
437 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
438 	spin_unlock(&sbi->s_fc_lock);
439 	mutex_lock(&ei->i_fc_lock);
440 
441 	return 0;
442 }
443 
444 void __ext4_fc_track_unlink(handle_t *handle,
445 		struct inode *inode, struct dentry *dentry)
446 {
447 	struct __track_dentry_update_args args;
448 	int ret;
449 
450 	args.dentry = dentry;
451 	args.op = EXT4_FC_TAG_UNLINK;
452 
453 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
454 					(void *)&args, 0);
455 	trace_ext4_fc_track_unlink(inode, dentry, ret);
456 }
457 
458 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
459 {
460 	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
461 }
462 
463 void __ext4_fc_track_link(handle_t *handle,
464 	struct inode *inode, struct dentry *dentry)
465 {
466 	struct __track_dentry_update_args args;
467 	int ret;
468 
469 	args.dentry = dentry;
470 	args.op = EXT4_FC_TAG_LINK;
471 
472 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
473 					(void *)&args, 0);
474 	trace_ext4_fc_track_link(inode, dentry, ret);
475 }
476 
477 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
478 {
479 	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
480 }
481 
482 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
483 			  struct dentry *dentry)
484 {
485 	struct __track_dentry_update_args args;
486 	int ret;
487 
488 	args.dentry = dentry;
489 	args.op = EXT4_FC_TAG_CREAT;
490 
491 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
492 					(void *)&args, 0);
493 	trace_ext4_fc_track_create(inode, dentry, ret);
494 }
495 
496 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
497 {
498 	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
499 }
500 
501 /* __track_fn for inode tracking */
502 static int __track_inode(struct inode *inode, void *arg, bool update)
503 {
504 	if (update)
505 		return -EEXIST;
506 
507 	EXT4_I(inode)->i_fc_lblk_len = 0;
508 
509 	return 0;
510 }
511 
512 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
513 {
514 	int ret;
515 
516 	if (S_ISDIR(inode->i_mode))
517 		return;
518 
519 	if (ext4_should_journal_data(inode)) {
520 		ext4_fc_mark_ineligible(inode->i_sb,
521 					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
522 		return;
523 	}
524 
525 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
526 	trace_ext4_fc_track_inode(inode, ret);
527 }
528 
529 struct __track_range_args {
530 	ext4_lblk_t start, end;
531 };
532 
533 /* __track_fn for tracking data updates */
534 static int __track_range(struct inode *inode, void *arg, bool update)
535 {
536 	struct ext4_inode_info *ei = EXT4_I(inode);
537 	ext4_lblk_t oldstart;
538 	struct __track_range_args *__arg =
539 		(struct __track_range_args *)arg;
540 
541 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
542 		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
543 		return -ECANCELED;
544 	}
545 
546 	oldstart = ei->i_fc_lblk_start;
547 
548 	if (update && ei->i_fc_lblk_len > 0) {
549 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
550 		ei->i_fc_lblk_len =
551 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
552 				ei->i_fc_lblk_start + 1;
553 	} else {
554 		ei->i_fc_lblk_start = __arg->start;
555 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
556 	}
557 
558 	return 0;
559 }
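
/*
 * Worked example of the merge logic above: a tracked range of lblks [10, 20]
 * updated with a new range [5, 12] widens to [5, 20], i.e.
 * i_fc_lblk_start = 5 and i_fc_lblk_len = 16.
 */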
560 
561 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
562 			 ext4_lblk_t end)
563 {
564 	struct __track_range_args args;
565 	int ret;
566 
567 	if (S_ISDIR(inode->i_mode))
568 		return;
569 
570 	args.start = start;
571 	args.end = end;
572 
573 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
574 
575 	trace_ext4_fc_track_range(inode, start, end, ret);
576 }
577 
578 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
579 {
580 	int write_flags = REQ_SYNC;
581 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
582 
	/*
	 * Add REQ_FUA | REQ_PREFLUSH only for the tail block: PREFLUSH makes
	 * the earlier fast commit blocks durable before the tail lands, and
	 * FUA makes the tail itself durable, which is what turns the tail
	 * tag into the commit point.
	 */
584 	if (test_opt(sb, BARRIER) && is_tail)
585 		write_flags |= REQ_FUA | REQ_PREFLUSH;
586 	lock_buffer(bh);
587 	set_buffer_dirty(bh);
588 	set_buffer_uptodate(bh);
589 	bh->b_end_io = ext4_end_buffer_io_sync;
590 	submit_bh(REQ_OP_WRITE, write_flags, bh);
591 	EXT4_SB(sb)->s_fc_bh = NULL;
592 }
593 
594 /* Ext4 commit path routines */
595 
596 /* memzero and update CRC */
597 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
598 				u32 *crc)
599 {
600 	void *ret;
601 
602 	ret = memset(dst, 0, len);
603 	if (crc)
604 		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
605 	return ret;
606 }
607 
608 /*
609  * Allocate len bytes on a fast commit buffer.
610  *
 * During commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log record across
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is obtained from jbd2 and the CRC is updated to reflect
 * the padding we added.
618  */
619 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
620 {
621 	struct ext4_fc_tl *tl;
622 	struct ext4_sb_info *sbi = EXT4_SB(sb);
623 	struct buffer_head *bh;
624 	int bsize = sbi->s_journal->j_blocksize;
625 	int ret, off = sbi->s_fc_bytes % bsize;
626 	int pad_len;
627 
628 	/*
629 	 * After allocating len, we should have space at least for a 0 byte
630 	 * padding.
631 	 */
632 	if (len + sizeof(struct ext4_fc_tl) > bsize)
633 		return NULL;
634 
635 	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
636 		/*
637 		 * Only allocate from current buffer if we have enough space for
638 		 * this request AND we have space to add a zero byte padding.
639 		 */
640 		if (!sbi->s_fc_bh) {
641 			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
642 			if (ret)
643 				return NULL;
644 			sbi->s_fc_bh = bh;
645 		}
646 		sbi->s_fc_bytes += len;
647 		return sbi->s_fc_bh->b_data + off;
648 	}
649 	/* Need to add PAD tag */
650 	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
651 	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
652 	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
653 	tl->fc_len = cpu_to_le16(pad_len);
654 	if (crc)
655 		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
656 	if (pad_len > 0)
657 		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
658 	ext4_fc_submit_bh(sb, false);
659 
660 	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
661 	if (ret)
662 		return NULL;
663 	sbi->s_fc_bh = bh;
664 	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
665 	return sbi->s_fc_bh->b_data;
666 }
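
/*
 * Worked example of the padding above, assuming a 4096-byte block and
 * sizeof(struct ext4_fc_tl) == 4: with off == 4080 and len == 24, only
 * 4096 - 4080 - 1 == 15 bytes are usable, so a PAD TLV with fc_len == 11
 * fills bytes 4080..4094 and the request is served from the start of the
 * next jbd2 fast commit buffer.
 */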
667 
668 /* memcpy to fc reserved space and update CRC */
669 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
670 				int len, u32 *crc)
671 {
672 	if (crc)
673 		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
674 	return memcpy(dst, src, len);
675 }
676 
677 /*
678  * Complete a fast commit by writing tail tag.
679  *
680  * Writing tail tag marks the end of a fast commit. In order to guarantee
681  * atomicity, after writing tail tag, even if there's space remaining
682  * in the block, next commit shouldn't use it. That's why tail tag
683  * has the length as that of the remaining space on the block.
684  */
685 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
686 {
687 	struct ext4_sb_info *sbi = EXT4_SB(sb);
688 	struct ext4_fc_tl tl;
689 	struct ext4_fc_tail tail;
690 	int off, bsize = sbi->s_journal->j_blocksize;
691 	u8 *dst;
692 
693 	/*
694 	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
696 	 */
697 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
698 	if (!dst)
699 		return -ENOSPC;
700 
701 	off = sbi->s_fc_bytes % bsize;
702 
703 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
704 	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
705 	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
706 
707 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
708 	dst += sizeof(tl);
709 	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
710 	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
711 	dst += sizeof(tail.fc_tid);
712 	tail.fc_crc = cpu_to_le32(crc);
713 	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
714 
715 	ext4_fc_submit_bh(sb, true);
716 
717 	return 0;
718 }
719 
720 /*
721  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
722  * Returns false if there's not enough space.
723  */
724 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
725 			   u32 *crc)
726 {
727 	struct ext4_fc_tl tl;
728 	u8 *dst;
729 
730 	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
731 	if (!dst)
732 		return false;
733 
734 	tl.fc_tag = cpu_to_le16(tag);
735 	tl.fc_len = cpu_to_le16(len);
736 
737 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
738 	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
739 
740 	return true;
741 }
742 
/* Same as above, but adds a dentry TLV. */
744 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
745 				   struct ext4_fc_dentry_update *fc_dentry)
746 {
747 	struct ext4_fc_dentry_info fcd;
748 	struct ext4_fc_tl tl;
749 	int dlen = fc_dentry->fcd_name.len;
750 	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
751 					crc);
752 
753 	if (!dst)
754 		return false;
755 
756 	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
757 	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
758 	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
759 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
760 	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
761 	dst += sizeof(tl);
762 	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
763 	dst += sizeof(fcd);
764 	ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
765 
766 	return true;
767 }
768 
769 /*
 * Writes the inode in the fast commit space as an EXT4_FC_TAG_INODE TLV.
771  * Returns 0 on success, error on failure.
772  */
773 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
774 {
775 	struct ext4_inode_info *ei = EXT4_I(inode);
776 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
777 	int ret;
778 	struct ext4_iloc iloc;
779 	struct ext4_fc_inode fc_inode;
780 	struct ext4_fc_tl tl;
781 	u8 *dst;
782 
783 	ret = ext4_get_inode_loc(inode, &iloc);
784 	if (ret)
785 		return ret;
786 
787 	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
788 		inode_len = EXT4_INODE_SIZE(inode->i_sb);
789 	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
790 		inode_len += ei->i_extra_isize;
791 
792 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
793 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
794 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
795 
	ret = -ECANCELED;
	dst = ext4_fc_reserve_space(inode->i_sb,
			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		goto err;

	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
		goto err;
	dst += sizeof(tl);
	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
		goto err;
	dst += sizeof(fc_inode);
	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
					inode_len, crc))
		goto err;

	ret = 0;
err:
	/* Drop the buffer head reference taken by ext4_get_inode_loc(). */
	brelse(iloc.bh);
	return ret;
812 }
813 
814 /*
815  * Writes updated data ranges for the inode in question. Updates CRC.
816  * Returns 0 on success, error otherwise.
817  */
818 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
819 {
820 	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
821 	struct ext4_inode_info *ei = EXT4_I(inode);
822 	struct ext4_map_blocks map;
823 	struct ext4_fc_add_range fc_ext;
824 	struct ext4_fc_del_range lrange;
825 	struct ext4_extent *ex;
826 	int ret;
827 
828 	mutex_lock(&ei->i_fc_lock);
829 	if (ei->i_fc_lblk_len == 0) {
830 		mutex_unlock(&ei->i_fc_lock);
831 		return 0;
832 	}
833 	old_blk_size = ei->i_fc_lblk_start;
834 	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
835 	ei->i_fc_lblk_len = 0;
836 	mutex_unlock(&ei->i_fc_lock);
837 
838 	cur_lblk_off = old_blk_size;
839 	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
840 		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
841 
842 	while (cur_lblk_off <= new_blk_size) {
843 		map.m_lblk = cur_lblk_off;
844 		map.m_len = new_blk_size - cur_lblk_off + 1;
845 		ret = ext4_map_blocks(NULL, inode, &map, 0);
846 		if (ret < 0)
847 			return -ECANCELED;
848 
849 		if (map.m_len == 0) {
850 			cur_lblk_off++;
851 			continue;
852 		}
853 
854 		if (ret == 0) {
855 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
856 			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
857 			lrange.fc_len = cpu_to_le32(map.m_len);
858 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
859 					    sizeof(lrange), (u8 *)&lrange, crc))
860 				return -ENOSPC;
861 		} else {
862 			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
863 				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
864 
865 			/* Limit the number of blocks in one extent */
866 			map.m_len = min(max, map.m_len);
867 
868 			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
869 			ex = (struct ext4_extent *)&fc_ext.fc_ex;
870 			ex->ee_block = cpu_to_le32(map.m_lblk);
871 			ex->ee_len = cpu_to_le16(map.m_len);
872 			ext4_ext_store_pblock(ex, map.m_pblk);
873 			if (map.m_flags & EXT4_MAP_UNWRITTEN)
874 				ext4_ext_mark_unwritten(ex);
875 			else
876 				ext4_ext_mark_initialized(ex);
877 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
878 					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
879 				return -ENOSPC;
880 		}
881 
882 		cur_lblk_off += map.m_len;
883 	}
884 
885 	return 0;
886 }
887 
888 
889 /* Submit data for all the fast commit inodes */
890 static int ext4_fc_submit_inode_data_all(journal_t *journal)
891 {
892 	struct super_block *sb = (struct super_block *)(journal->j_private);
893 	struct ext4_sb_info *sbi = EXT4_SB(sb);
894 	struct ext4_inode_info *ei;
895 	int ret = 0;
896 
897 	spin_lock(&sbi->s_fc_lock);
898 	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
899 		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
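		/*
		 * Wait for in-flight ext4_fc_start_update() sections to
		 * drain; new ones will block on the COMMITTING bit that was
		 * just set.
		 */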
900 		while (atomic_read(&ei->i_fc_updates)) {
901 			DEFINE_WAIT(wait);
902 
903 			prepare_to_wait(&ei->i_fc_wait, &wait,
904 						TASK_UNINTERRUPTIBLE);
905 			if (atomic_read(&ei->i_fc_updates)) {
906 				spin_unlock(&sbi->s_fc_lock);
907 				schedule();
908 				spin_lock(&sbi->s_fc_lock);
909 			}
910 			finish_wait(&ei->i_fc_wait, &wait);
911 		}
912 		spin_unlock(&sbi->s_fc_lock);
913 		ret = jbd2_submit_inode_data(ei->jinode);
914 		if (ret)
915 			return ret;
916 		spin_lock(&sbi->s_fc_lock);
917 	}
918 	spin_unlock(&sbi->s_fc_lock);
919 
920 	return ret;
921 }
922 
923 /* Wait for completion of data for all the fast commit inodes */
924 static int ext4_fc_wait_inode_data_all(journal_t *journal)
925 {
926 	struct super_block *sb = (struct super_block *)(journal->j_private);
927 	struct ext4_sb_info *sbi = EXT4_SB(sb);
928 	struct ext4_inode_info *pos, *n;
929 	int ret = 0;
930 
931 	spin_lock(&sbi->s_fc_lock);
932 	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
933 		if (!ext4_test_inode_state(&pos->vfs_inode,
934 					   EXT4_STATE_FC_COMMITTING))
935 			continue;
936 		spin_unlock(&sbi->s_fc_lock);
937 
938 		ret = jbd2_wait_inode_data(journal, pos->jinode);
939 		if (ret)
940 			return ret;
941 		spin_lock(&sbi->s_fc_lock);
942 	}
943 	spin_unlock(&sbi->s_fc_lock);
944 
945 	return 0;
946 }
947 
948 /* Commit all the directory entry updates */
949 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
950 __acquires(&sbi->s_fc_lock)
951 __releases(&sbi->s_fc_lock)
952 {
953 	struct super_block *sb = (struct super_block *)(journal->j_private);
954 	struct ext4_sb_info *sbi = EXT4_SB(sb);
955 	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
956 	struct inode *inode;
957 	struct ext4_inode_info *ei, *ei_n;
958 	int ret;
959 
960 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
961 		return 0;
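	/*
	 * s_fc_lock is dropped around the TLV writes below; updates that
	 * race with the commit land on the STAGING queues, so the MAIN
	 * lists stay stable across the unlock/relock windows.
	 */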
962 	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
963 				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
964 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
965 			spin_unlock(&sbi->s_fc_lock);
966 			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
967 				ret = -ENOSPC;
968 				goto lock_and_exit;
969 			}
970 			spin_lock(&sbi->s_fc_lock);
971 			continue;
972 		}
973 
974 		inode = NULL;
975 		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
976 					 i_fc_list) {
977 			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
978 				inode = &ei->vfs_inode;
979 				break;
980 			}
981 		}
982 		/*
		 * If we don't find the inode in our list, then it was deleted,
		 * in which case, we don't need to record its create tag.
985 		 */
986 		if (!inode)
987 			continue;
988 		spin_unlock(&sbi->s_fc_lock);
989 
990 		/*
991 		 * We first write the inode and then the create dirent. This
992 		 * allows the recovery code to create an unnamed inode first
993 		 * and then link it to a directory entry. This allows us
994 		 * to use namei.c routines almost as is and simplifies
995 		 * the recovery code.
996 		 */
997 		ret = ext4_fc_write_inode(inode, crc);
998 		if (ret)
999 			goto lock_and_exit;
1000 
1001 		ret = ext4_fc_write_inode_data(inode, crc);
1002 		if (ret)
1003 			goto lock_and_exit;
1004 
1005 		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1006 			ret = -ENOSPC;
1007 			goto lock_and_exit;
1008 		}
1009 
1010 		spin_lock(&sbi->s_fc_lock);
1011 	}
1012 	return 0;
1013 lock_and_exit:
1014 	spin_lock(&sbi->s_fc_lock);
1015 	return ret;
1016 }
1017 
1018 static int ext4_fc_perform_commit(journal_t *journal)
1019 {
1020 	struct super_block *sb = (struct super_block *)(journal->j_private);
1021 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1022 	struct ext4_inode_info *iter;
1023 	struct ext4_fc_head head;
1024 	struct inode *inode;
1025 	struct blk_plug plug;
1026 	int ret = 0;
1027 	u32 crc = 0;
1028 
1029 	ret = ext4_fc_submit_inode_data_all(journal);
1030 	if (ret)
1031 		return ret;
1032 
1033 	ret = ext4_fc_wait_inode_data_all(journal);
1034 	if (ret)
1035 		return ret;
1036 
1037 	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
1040 	 */
1041 	if (journal->j_fs_dev != journal->j_dev)
1042 		blkdev_issue_flush(journal->j_fs_dev);
1043 
1044 	blk_start_plug(&plug);
1045 	if (sbi->s_fc_bytes == 0) {
1046 		/*
1047 		 * Add a head tag only if this is the first fast commit
1048 		 * in this TID.
1049 		 */
1050 		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1051 		head.fc_tid = cpu_to_le32(
1052 			sbi->s_journal->j_running_transaction->t_tid);
1053 		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1054 			(u8 *)&head, &crc)) {
1055 			ret = -ENOSPC;
1056 			goto out;
1057 		}
1058 	}
1059 
1060 	spin_lock(&sbi->s_fc_lock);
1061 	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1062 	if (ret) {
1063 		spin_unlock(&sbi->s_fc_lock);
1064 		goto out;
1065 	}
1066 
1067 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1068 		inode = &iter->vfs_inode;
1069 		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1070 			continue;
1071 
1072 		spin_unlock(&sbi->s_fc_lock);
1073 		ret = ext4_fc_write_inode_data(inode, &crc);
1074 		if (ret)
1075 			goto out;
1076 		ret = ext4_fc_write_inode(inode, &crc);
1077 		if (ret)
1078 			goto out;
1079 		spin_lock(&sbi->s_fc_lock);
1080 	}
1081 	spin_unlock(&sbi->s_fc_lock);
1082 
1083 	ret = ext4_fc_write_tail(sb, crc);
1084 
1085 out:
1086 	blk_finish_plug(&plug);
1087 	return ret;
1088 }
1089 
1090 static void ext4_fc_update_stats(struct super_block *sb, int status,
1091 				 u64 commit_time, int nblks)
1092 {
1093 	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1094 
1095 	jbd_debug(1, "Fast commit ended with status = %d", status);
1096 	if (status == EXT4_FC_STATUS_OK) {
1097 		stats->fc_num_commits++;
1098 		stats->fc_numblks += nblks;
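		/* 3/4 old + 1/4 new: a simple exponentially-weighted average */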
1099 		if (likely(stats->s_fc_avg_commit_time))
1100 			stats->s_fc_avg_commit_time =
1101 				(commit_time +
1102 				 stats->s_fc_avg_commit_time * 3) / 4;
1103 		else
1104 			stats->s_fc_avg_commit_time = commit_time;
1105 	} else if (status == EXT4_FC_STATUS_FAILED ||
1106 		   status == EXT4_FC_STATUS_INELIGIBLE) {
1107 		if (status == EXT4_FC_STATUS_FAILED)
1108 			stats->fc_failed_commits++;
1109 		stats->fc_ineligible_commits++;
1110 	} else {
1111 		stats->fc_skipped_commits++;
1112 	}
1113 	trace_ext4_fc_commit_stop(sb, nblks, status);
1114 }
1115 
1116 /*
1117  * The main commit entry point. Performs a fast commit for transaction
1118  * commit_tid if needed. If it's not possible to perform a fast commit
1119  * due to various reasons, we fall back to full commit. Returns 0
1120  * on success, error otherwise.
1121  */
1122 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1123 {
1124 	struct super_block *sb = (struct super_block *)(journal->j_private);
1125 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1126 	int nblks = 0, ret, bsize = journal->j_blocksize;
1127 	int subtid = atomic_read(&sbi->s_fc_subtid);
1128 	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1129 	ktime_t start_time, commit_time;
1130 
1131 	trace_ext4_fc_commit_start(sb);
1132 
1133 	start_time = ktime_get();
1134 
1135 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1136 		return jbd2_complete_transaction(journal, commit_tid);
1137 
1138 restart_fc:
1139 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1140 	if (ret == -EALREADY) {
1141 		/* There was an ongoing commit, check if we need to restart */
1142 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1143 			commit_tid > journal->j_commit_sequence)
1144 			goto restart_fc;
1145 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0);
1146 		return 0;
1147 	} else if (ret) {
1148 		/*
1149 		 * Commit couldn't start. Just update stats and perform a
1150 		 * full commit.
1151 		 */
1152 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0);
1153 		return jbd2_complete_transaction(journal, commit_tid);
1154 	}
1155 
1156 	/*
1157 	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
1158 	 * if we are fast commit ineligible.
1159 	 */
1160 	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1161 		status = EXT4_FC_STATUS_INELIGIBLE;
1162 		goto fallback;
1163 	}
1164 
1165 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1166 	ret = ext4_fc_perform_commit(journal);
1167 	if (ret < 0) {
1168 		status = EXT4_FC_STATUS_FAILED;
1169 		goto fallback;
1170 	}
1171 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1172 	ret = jbd2_fc_wait_bufs(journal, nblks);
1173 	if (ret < 0) {
1174 		status = EXT4_FC_STATUS_FAILED;
1175 		goto fallback;
1176 	}
1177 	atomic_inc(&sbi->s_fc_subtid);
1178 	ret = jbd2_fc_end_commit(journal);
1179 	/*
	 * the commit time is folded into a weighted running average in
	 * ext4_fc_update_stats() so that we don't react too strongly to
	 * vast changes in the commit time
1182 	 */
1183 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1184 	ext4_fc_update_stats(sb, status, commit_time, nblks);
1185 	return ret;
1186 
1187 fallback:
1188 	ret = jbd2_fc_end_commit_fallback(journal);
1189 	ext4_fc_update_stats(sb, status, 0, 0);
1190 	return ret;
1191 }
1192 
1193 /*
1194  * Fast commit cleanup routine. This is called after every fast commit and
1195  * full commit. full is true if we are called after a full commit.
1196  */
1197 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1198 {
1199 	struct super_block *sb = journal->j_private;
1200 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1201 	struct ext4_inode_info *iter, *iter_n;
1202 	struct ext4_fc_dentry_update *fc_dentry;
1203 
1204 	if (full && sbi->s_fc_bh)
1205 		sbi->s_fc_bh = NULL;
1206 
1207 	jbd2_fc_release_bufs(journal);
1208 
1209 	spin_lock(&sbi->s_fc_lock);
1210 	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1211 				 i_fc_list) {
1212 		list_del_init(&iter->i_fc_list);
1213 		ext4_clear_inode_state(&iter->vfs_inode,
1214 				       EXT4_STATE_FC_COMMITTING);
1215 		if (iter->i_sync_tid <= tid)
1216 			ext4_fc_reset_inode(&iter->vfs_inode);
1217 		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1218 		smp_mb();
1219 #if (BITS_PER_LONG < 64)
1220 		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1221 #else
1222 		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1223 #endif
1224 	}
1225 
1226 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1227 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1228 					     struct ext4_fc_dentry_update,
1229 					     fcd_list);
1230 		list_del_init(&fc_dentry->fcd_list);
1231 		spin_unlock(&sbi->s_fc_lock);
1232 
1233 		if (fc_dentry->fcd_name.name &&
1234 			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1235 			kfree(fc_dentry->fcd_name.name);
1236 		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1237 		spin_lock(&sbi->s_fc_lock);
1238 	}
1239 
1240 	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1241 				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1242 	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1243 				&sbi->s_fc_q[FC_Q_MAIN]);
1244 
1245 	if (tid >= sbi->s_fc_ineligible_tid) {
1246 		sbi->s_fc_ineligible_tid = 0;
1247 		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1248 	}
1249 
1250 	if (full)
1251 		sbi->s_fc_bytes = 0;
1252 	spin_unlock(&sbi->s_fc_lock);
1253 	trace_ext4_fc_stats(sb);
1254 }
1255 
1256 /* Ext4 Replay Path Routines */
1257 
1258 /* Helper struct for dentry replay routines */
1259 struct dentry_info_args {
1260 	int parent_ino, dname_len, ino, inode_len;
1261 	char *dname;
1262 };
1263 
1264 static inline void tl_to_darg(struct dentry_info_args *darg,
1265 			      struct  ext4_fc_tl *tl, u8 *val)
1266 {
1267 	struct ext4_fc_dentry_info fcd;
1268 
1269 	memcpy(&fcd, val, sizeof(fcd));
1270 
1271 	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1272 	darg->ino = le32_to_cpu(fcd.fc_ino);
1273 	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
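	/* dname points into the log buffer and is not NUL-terminated */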
1274 	darg->dname_len = le16_to_cpu(tl->fc_len) -
1275 		sizeof(struct ext4_fc_dentry_info);
1276 }
1277 
1278 /* Unlink replay function */
1279 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1280 				 u8 *val)
1281 {
1282 	struct inode *inode, *old_parent;
1283 	struct qstr entry;
1284 	struct dentry_info_args darg;
1285 	int ret = 0;
1286 
1287 	tl_to_darg(&darg, tl, val);
1288 
1289 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1290 			darg.parent_ino, darg.dname_len);
1291 
1292 	entry.name = darg.dname;
1293 	entry.len = darg.dname_len;
1294 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1295 
1296 	if (IS_ERR(inode)) {
1297 		jbd_debug(1, "Inode %d not found", darg.ino);
1298 		return 0;
1299 	}
1300 
1301 	old_parent = ext4_iget(sb, darg.parent_ino,
1302 				EXT4_IGET_NORMAL);
1303 	if (IS_ERR(old_parent)) {
1304 		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1305 		iput(inode);
1306 		return 0;
1307 	}
1308 
1309 	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is ok because the entry might not exist anymore. */
1311 	if (ret == -ENOENT)
1312 		ret = 0;
1313 	iput(old_parent);
1314 	iput(inode);
1315 	return ret;
1316 }
1317 
1318 static int ext4_fc_replay_link_internal(struct super_block *sb,
1319 				struct dentry_info_args *darg,
1320 				struct inode *inode)
1321 {
1322 	struct inode *dir = NULL;
1323 	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1324 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1325 	int ret = 0;
1326 
1327 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1328 	if (IS_ERR(dir)) {
1329 		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1330 		dir = NULL;
1331 		goto out;
1332 	}
1333 
1334 	dentry_dir = d_obtain_alias(dir);
1335 	if (IS_ERR(dentry_dir)) {
1336 		jbd_debug(1, "Failed to obtain dentry");
1337 		dentry_dir = NULL;
1338 		goto out;
1339 	}
1340 
1341 	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1342 	if (!dentry_inode) {
1343 		jbd_debug(1, "Inode dentry not created.");
1344 		ret = -ENOMEM;
1345 		goto out;
1346 	}
1347 
1348 	ret = __ext4_link(dir, inode, dentry_inode);
1349 	/*
1350 	 * It's possible that link already existed since data blocks
1351 	 * for the dir in question got persisted before we crashed OR
1352 	 * we replayed this tag and crashed before the entire replay
1353 	 * could complete.
1354 	 */
1355 	if (ret && ret != -EEXIST) {
1356 		jbd_debug(1, "Failed to link\n");
1357 		goto out;
1358 	}
1359 
1360 	ret = 0;
1361 out:
1362 	if (dentry_dir) {
1363 		d_drop(dentry_dir);
1364 		dput(dentry_dir);
1365 	} else if (dir) {
1366 		iput(dir);
1367 	}
1368 	if (dentry_inode) {
1369 		d_drop(dentry_inode);
1370 		dput(dentry_inode);
1371 	}
1372 
1373 	return ret;
1374 }
1375 
1376 /* Link replay function */
1377 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1378 			       u8 *val)
1379 {
1380 	struct inode *inode;
1381 	struct dentry_info_args darg;
1382 	int ret = 0;
1383 
1384 	tl_to_darg(&darg, tl, val);
1385 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1386 			darg.parent_ino, darg.dname_len);
1387 
1388 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1389 	if (IS_ERR(inode)) {
1390 		jbd_debug(1, "Inode not found.");
1391 		return 0;
1392 	}
1393 
1394 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1395 	iput(inode);
1396 	return ret;
1397 }
1398 
1399 /*
 * Record all the modified inodes during replay. We use this later to set up
1401  * block bitmaps correctly.
1402  */
1403 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1404 {
1405 	struct ext4_fc_replay_state *state;
1406 	int i;
1407 
1408 	state = &EXT4_SB(sb)->s_fc_replay_state;
1409 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1410 		if (state->fc_modified_inodes[i] == ino)
1411 			return 0;
1412 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		/* Use a temporary so the old array isn't leaked on failure. */
		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
1420 		state->fc_modified_inodes_size +=
1421 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1422 	}
1423 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1424 	return 0;
1425 }
1426 
1427 /*
1428  * Inode replay function
1429  */
1430 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1431 				u8 *val)
1432 {
1433 	struct ext4_fc_inode fc_inode;
1434 	struct ext4_inode *raw_inode;
1435 	struct ext4_inode *raw_fc_inode;
1436 	struct inode *inode = NULL;
1437 	struct ext4_iloc iloc;
1438 	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1439 	struct ext4_extent_header *eh;
1440 
1441 	memcpy(&fc_inode, val, sizeof(fc_inode));
1442 
1443 	ino = le32_to_cpu(fc_inode.fc_ino);
1444 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1445 
1446 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1447 	if (!IS_ERR(inode)) {
1448 		ext4_ext_clear_bb(inode);
1449 		iput(inode);
1450 	}
1451 	inode = NULL;
1452 
1453 	ret = ext4_fc_record_modified_inode(sb, ino);
1454 	if (ret)
1455 		goto out;
1456 
1457 	raw_fc_inode = (struct ext4_inode *)
1458 		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1459 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1460 	if (ret)
1461 		goto out;
1462 
1463 	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1464 	raw_inode = ext4_raw_inode(&iloc);
1465 
1466 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1467 	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1468 		inode_len - offsetof(struct ext4_inode, i_generation));
1469 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1470 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1471 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1472 			memset(eh, 0, sizeof(*eh));
1473 			eh->eh_magic = EXT4_EXT_MAGIC;
1474 			eh->eh_max = cpu_to_le16(
1475 				(sizeof(raw_inode->i_block) -
1476 				 sizeof(struct ext4_extent_header))
1477 				 / sizeof(struct ext4_extent));
1478 		}
1479 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1480 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1481 			sizeof(raw_inode->i_block));
1482 	}
1483 
1484 	/* Immediately update the inode on disk. */
1485 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1486 	if (ret)
1487 		goto out;
1488 	ret = sync_dirty_buffer(iloc.bh);
1489 	if (ret)
1490 		goto out;
1491 	ret = ext4_mark_inode_used(sb, ino);
1492 	if (ret)
1493 		goto out;
1494 
1495 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1496 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1497 	if (IS_ERR(inode)) {
1498 		jbd_debug(1, "Inode not found.");
1499 		return -EFSCORRUPTED;
1500 	}
1501 
1502 	/*
1503 	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks the inode occupies here.
1506 	 */
1507 	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1508 		ext4_ext_replay_set_iblocks(inode);
1509 
1510 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1511 	ext4_reset_inode_seed(inode);
1512 
1513 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1514 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1515 	sync_dirty_buffer(iloc.bh);
1516 	brelse(iloc.bh);
1517 out:
1518 	iput(inode);
1519 	if (!ret)
1520 		blkdev_issue_flush(sb->s_bdev);
1521 
1522 	return 0;
1523 }
1524 
1525 /*
1526  * Dentry create replay function.
1527  *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the
 * inode for which we are trying to create a dentry here should already
 * have been replayed before we get here.
1531  */
1532 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1533 				 u8 *val)
1534 {
1535 	int ret = 0;
1536 	struct inode *inode = NULL;
1537 	struct inode *dir = NULL;
1538 	struct dentry_info_args darg;
1539 
1540 	tl_to_darg(&darg, tl, val);
1541 
1542 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1543 			darg.parent_ino, darg.dname_len);
1544 
	/* This takes care of updating the group descriptor and other metadata */
1546 	ret = ext4_mark_inode_used(sb, darg.ino);
1547 	if (ret)
1548 		goto out;
1549 
1550 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1551 	if (IS_ERR(inode)) {
1552 		jbd_debug(1, "inode %d not found.", darg.ino);
1553 		inode = NULL;
1554 		ret = -EINVAL;
1555 		goto out;
1556 	}
1557 
1558 	if (S_ISDIR(inode->i_mode)) {
1559 		/*
1560 		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
1562 		 */
1563 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1564 		if (IS_ERR(dir)) {
1565 			jbd_debug(1, "Dir %d not found.", darg.ino);
1566 			goto out;
1567 		}
1568 		ret = ext4_init_new_dir(NULL, dir, inode);
1569 		iput(dir);
1570 		if (ret) {
1571 			ret = 0;
1572 			goto out;
1573 		}
1574 	}
1575 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1576 	if (ret)
1577 		goto out;
1578 	set_nlink(inode, 1);
1579 	ext4_mark_inode_dirty(NULL, inode);
1580 out:
1581 	if (inode)
1582 		iput(inode);
1583 	return ret;
1584 }
1585 
1586 /*
 * Record physical disk regions which are in use, as per the fast commit area,
 * and which are used by inodes during the replay phase. Our simple replay phase
1589  * allocator excludes these regions from allocation.
1590  */
1591 int ext4_fc_record_regions(struct super_block *sb, int ino,
1592 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1593 {
1594 	struct ext4_fc_replay_state *state;
1595 	struct ext4_fc_alloc_region *region;
1596 
1597 	state = &EXT4_SB(sb)->s_fc_replay_state;
1598 	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; bring them in sync before recording new additions.
1601 	 */
1602 	if (replay && state->fc_regions_used != state->fc_regions_valid)
1603 		state->fc_regions_used = state->fc_regions_valid;
1604 	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		/* Use a temporary so the old array isn't leaked on failure. */
		fc_regions = krealloc(state->fc_regions,
				      sizeof(struct ext4_fc_alloc_region) *
				      (state->fc_regions_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				      GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = fc_regions;
1614 	}
1615 	region = &state->fc_regions[state->fc_regions_used++];
1616 	region->ino = ino;
1617 	region->lblk = lblk;
1618 	region->pblk = pblk;
1619 	region->len = len;
1620 
1621 	if (replay)
1622 		state->fc_regions_valid++;
1623 
1624 	return 0;
1625 }
1626 
1627 /* Replay add range tag */
1628 static int ext4_fc_replay_add_range(struct super_block *sb,
1629 				    struct ext4_fc_tl *tl, u8 *val)
1630 {
1631 	struct ext4_fc_add_range fc_add_ex;
1632 	struct ext4_extent newex, *ex;
1633 	struct inode *inode;
1634 	ext4_lblk_t start, cur;
1635 	int remaining, len;
1636 	ext4_fsblk_t start_pblk;
1637 	struct ext4_map_blocks map;
1638 	struct ext4_ext_path *path = NULL;
1639 	int ret;
1640 
1641 	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1642 	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1643 
1644 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1645 		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1646 		ext4_ext_get_actual_len(ex));
1647 
1648 	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1649 	if (IS_ERR(inode)) {
1650 		jbd_debug(1, "Inode not found.");
1651 		return 0;
1652 	}
1653 
1654 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1655 	if (ret)
1656 		goto out;
1657 
1658 	start = le32_to_cpu(ex->ee_block);
1659 	start_pblk = ext4_ext_pblock(ex);
1660 	len = ext4_ext_get_actual_len(ex);
1661 
1662 	cur = start;
1663 	remaining = len;
1664 	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1665 		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1666 		  inode->i_ino);
1667 
1668 	while (remaining > 0) {
1669 		map.m_lblk = cur;
1670 		map.m_len = remaining;
1671 		map.m_pblk = 0;
1672 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1673 
1674 		if (ret < 0)
1675 			goto out;
1676 
1677 		if (ret == 0) {
1678 			/* Range is not mapped */
1679 			path = ext4_find_extent(inode, cur, NULL, 0);
1680 			if (IS_ERR(path))
1681 				goto out;
1682 			memset(&newex, 0, sizeof(newex));
1683 			newex.ee_block = cpu_to_le32(cur);
1684 			ext4_ext_store_pblock(
1685 				&newex, start_pblk + cur - start);
1686 			newex.ee_len = cpu_to_le16(map.m_len);
1687 			if (ext4_ext_is_unwritten(ex))
1688 				ext4_ext_mark_unwritten(&newex);
1689 			down_write(&EXT4_I(inode)->i_data_sem);
1690 			ret = ext4_ext_insert_extent(
1691 				NULL, inode, &path, &newex, 0);
1692 			up_write((&EXT4_I(inode)->i_data_sem));
1693 			ext4_ext_drop_refs(path);
1694 			kfree(path);
1695 			if (ret)
1696 				goto out;
1697 			goto next;
1698 		}
1699 
1700 		if (start_pblk + cur - start != map.m_pblk) {
1701 			/*
1702 			 * Logical to physical mapping changed. This can happen
1703 			 * if this range was removed and then reallocated to
1704 			 * map to new physical blocks during a fast commit.
1705 			 */
1706 			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1707 					ext4_ext_is_unwritten(ex),
1708 					start_pblk + cur - start);
1709 			if (ret)
1710 				goto out;
1711 			/*
1712 			 * Mark the old blocks as free since they aren't used
1713 			 * anymore. We maintain an array of all the modified
1714 			 * inodes. In case these blocks are still used at either
1715 			 * a different logical range in the same inode or in
1716 			 * some different inode, we will mark them as allocated
1717 			 * at the end of the FC replay using our array of
1718 			 * modified inodes.
1719 			 */
1720 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1721 			goto next;
1722 		}
1723 
1724 		/* Range is mapped and needs a state change */
1725 		jbd_debug(1, "Converting from %u to %d %llu\n",
1726 			  map.m_flags & EXT4_MAP_UNWRITTEN,
1727 			  ext4_ext_is_unwritten(ex), map.m_pblk);
1728 		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1729 					ext4_ext_is_unwritten(ex), map.m_pblk);
1730 		if (ret)
1731 			goto out;
1732 		/*
1733 		 * We may have split the extent tree while toggling the state.
1734 		 * Try to shrink the extent tree now.
1735 		 */
1736 		ext4_ext_replay_shrink_inode(inode, start + len);
1737 next:
1738 		cur += map.m_len;
1739 		remaining -= map.m_len;
1740 	}
1741 	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1742 					sb->s_blocksize_bits);
1743 out:
1744 	iput(inode);
1745 	return 0;
1746 }
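
/*
 * Editor's note: the replay helpers here walk a logical block range in
 * chunks, letting ext4_map_blocks() report how much of the range is
 * contiguously mapped (or how large a hole is). A condensed sketch of the
 * pattern, using the same names as the code above:
 *
 *	cur = start;
 *	remaining = len;
 *	while (remaining > 0) {
 *		map.m_lblk = cur;
 *		map.m_len = remaining;
 *		ret = ext4_map_blocks(NULL, inode, &map, 0);
 *		if (ret < 0)
 *			break;		// lookup error
 *		// ret > 0: [cur, cur + map.m_len) is mapped at map.m_pblk
 *		// ret == 0: [cur, cur + map.m_len) is a hole
 *		cur += map.m_len;	// either way, m_len blocks are done
 *		remaining -= map.m_len;
 *	}
 */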
1747 
1748 /* Replay DEL_RANGE tag */
1749 static int
1750 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1751 			 u8 *val)
1752 {
1753 	struct inode *inode;
1754 	struct ext4_fc_del_range lrange;
1755 	struct ext4_map_blocks map;
1756 	ext4_lblk_t cur, remaining;
1757 	int ret;
1758 
1759 	memcpy(&lrange, val, sizeof(lrange));
1760 	cur = le32_to_cpu(lrange.fc_lblk);
1761 	remaining = le32_to_cpu(lrange.fc_len);
1762 
1763 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1764 		le32_to_cpu(lrange.fc_ino), cur, remaining);
1765 
1766 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1767 	if (IS_ERR(inode)) {
1768 		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1769 		return 0;
1770 	}
1771 
1772 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1773 	if (ret)
1774 		goto out;
1775 
1776 	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1777 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1778 			le32_to_cpu(lrange.fc_len));
1779 	while (remaining > 0) {
1780 		map.m_lblk = cur;
1781 		map.m_len = remaining;
1782 
1783 		ret = ext4_map_blocks(NULL, inode, &map, 0);
1784 		if (ret < 0)
1785 			goto out;
1786 		if (ret > 0) {
1787 			remaining -= ret;
1788 			cur += ret;
1789 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1790 		} else {
1791 			remaining -= map.m_len;
1792 			cur += map.m_len;
1793 		}
1794 	}
1795 
1796 	down_write(&EXT4_I(inode)->i_data_sem);
1797 	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1798 				le32_to_cpu(lrange.fc_lblk) +
1799 				le32_to_cpu(lrange.fc_len) - 1);
1800 	up_write(&EXT4_I(inode)->i_data_sem);
1801 	if (ret)
1802 		goto out;
1803 	ext4_ext_replay_shrink_inode(inode,
1804 		i_size_read(inode) >> sb->s_blocksize_bits);
1805 	ext4_mark_inode_dirty(NULL, inode);
1806 out:
1807 	iput(inode);
1808 	return 0;
1809 }
1810 
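/*
 * After all tags have been replayed, walk every inode recorded in the
 * modified-inodes array and mark the blocks it references as in use: both
 * the extent tree index/leaf blocks along each lookup path and the mapped
 * data blocks themselves are set in the block bitmaps via
 * ext4_mb_mark_bb(). Inodes with inline data have no blocks to mark and
 * are skipped.
 */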
1811 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1812 {
1813 	struct ext4_fc_replay_state *state;
1814 	struct inode *inode;
1815 	struct ext4_ext_path *path = NULL;
1816 	struct ext4_map_blocks map;
1817 	int i, ret, j;
1818 	ext4_lblk_t cur, end;
1819 
1820 	state = &EXT4_SB(sb)->s_fc_replay_state;
1821 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1822 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1823 			EXT4_IGET_NORMAL);
1824 		if (IS_ERR(inode)) {
1825 			jbd_debug(1, "Inode %d not found.",
1826 				state->fc_modified_inodes[i]);
1827 			continue;
1828 		}
1829 		cur = 0;
1830 		end = EXT_MAX_BLOCKS;
1831 		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1832 			iput(inode);
1833 			continue;
1834 		}
1835 		while (cur < end) {
1836 			map.m_lblk = cur;
1837 			map.m_len = end - cur;
1838 
1839 			ret = ext4_map_blocks(NULL, inode, &map, 0);
1840 			if (ret < 0)
1841 				break;
1842 
1843 			if (ret > 0) {
1844 				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1845 				if (!IS_ERR(path)) {
1846 					for (j = 0; j < path->p_depth; j++)
1847 						ext4_mb_mark_bb(inode->i_sb,
1848 							path[j].p_block, 1, 1);
1849 					ext4_ext_drop_refs(path);
1850 					kfree(path);
1851 				}
1852 				cur += ret;
1853 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1854 							map.m_len, 1);
1855 			} else {
1856 				cur = cur + (map.m_len ? map.m_len : 1);
1857 			}
1858 		}
1859 		iput(inode);
1860 	}
1861 }
1862 
1863 /*
1864  * Check if a block is in the excluded regions for block allocation. The
1865  * simple allocator that runs during the replay phase calls this function
1866  * to check whether it is okay to use a block.
1867  */
1868 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1869 {
1870 	int i;
1871 	struct ext4_fc_replay_state *state;
1872 
1873 	state = &EXT4_SB(sb)->s_fc_replay_state;
1874 	for (i = 0; i < state->fc_regions_valid; i++) {
1875 		if (state->fc_regions[i].ino == 0 ||
1876 			state->fc_regions[i].len == 0)
1877 			continue;
1878 		if (blk >= state->fc_regions[i].pblk &&
1879 		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1880 			return true;
1881 	}
1882 	return false;
1883 }
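
/*
 * Editor's sketch (not part of the original file): the replay-phase
 * allocator consults the check above before handing out a block. Roughly,
 * with hypothetical first_blk/last_blk bounds and a hypothetical
 * block_is_free() helper (see ext4_mb_new_blocks_simple() in mballoc.c for
 * the real caller):
 *
 *	ext4_fsblk_t blk;
 *
 *	for (blk = first_blk; blk <= last_blk; blk++) {
 *		if (ext4_fc_replay_check_excluded(sb, blk))
 *			continue;	// reserved by a recorded FC region
 *		if (block_is_free(sb, blk))
 *			return blk;	// safe to hand out
 *	}
 */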
1884 
1885 /* Cleanup function called after replay */
1886 void ext4_fc_replay_cleanup(struct super_block *sb)
1887 {
1888 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1889 
1890 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1891 	kfree(sbi->s_fc_replay_state.fc_regions);
1892 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1893 }
1894 
1895 /*
1896  * Recovery Scan phase handler
1897  *
1898  * This function is called during the scan phase and is responsible
1899  * for doing the following things:
1900  * - Make sure the fast commit area has valid tags for replay
1901  * - Count number of tags that need to be replayed by the replay handler
1902  * - Verify CRC
1903  * - Create a list of excluded blocks for allocation during replay phase
1904  *
1905  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1906  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1907  * to indicate that scan has finished and JBD2 can now start replay phase.
1908  * It returns a negative error code to indicate that an error occurred. At
1909  * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
1910  * is set to the number of tags that need to be replayed during the replay phase.
1911  */
1912 static int ext4_fc_replay_scan(journal_t *journal,
1913 				struct buffer_head *bh, int off,
1914 				tid_t expected_tid)
1915 {
1916 	struct super_block *sb = journal->j_private;
1917 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1918 	struct ext4_fc_replay_state *state;
1919 	int ret = JBD2_FC_REPLAY_CONTINUE;
1920 	struct ext4_fc_add_range ext;
1921 	struct ext4_fc_tl tl;
1922 	struct ext4_fc_tail tail;
1923 	__u8 *start, *end, *cur, *val;
1924 	struct ext4_fc_head head;
1925 	struct ext4_extent *ex;
1926 
1927 	state = &sbi->s_fc_replay_state;
1928 
1929 	start = (u8 *)bh->b_data;
1930 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1931 
1932 	if (state->fc_replay_expected_off == 0) {
1933 		state->fc_cur_tag = 0;
1934 		state->fc_replay_num_tags = 0;
1935 		state->fc_crc = 0;
1936 		state->fc_regions = NULL;
1937 		state->fc_regions_valid = state->fc_regions_used =
1938 			state->fc_regions_size = 0;
1939 		/* Check if we can stop early */
1940 		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1941 			!= EXT4_FC_TAG_HEAD)
1942 			return 0;
1943 	}
1944 
1945 	if (off != state->fc_replay_expected_off) {
1946 		ret = -EFSCORRUPTED;
1947 		goto out_err;
1948 	}
1949 
1950 	state->fc_replay_expected_off++;
1951 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1952 		memcpy(&tl, cur, sizeof(tl));
1953 		val = cur + sizeof(tl);
1954 		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1955 			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1956 		switch (le16_to_cpu(tl.fc_tag)) {
1957 		case EXT4_FC_TAG_ADD_RANGE:
1958 			memcpy(&ext, val, sizeof(ext));
1959 			ex = (struct ext4_extent *)&ext.fc_ex;
1960 			ret = ext4_fc_record_regions(sb,
1961 				le32_to_cpu(ext.fc_ino),
1962 				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1963 				ext4_ext_get_actual_len(ex), 0);
1964 			if (ret < 0)
1965 				break;
1966 			ret = JBD2_FC_REPLAY_CONTINUE;
1967 			fallthrough;
1968 		case EXT4_FC_TAG_DEL_RANGE:
1969 		case EXT4_FC_TAG_LINK:
1970 		case EXT4_FC_TAG_UNLINK:
1971 		case EXT4_FC_TAG_CREAT:
1972 		case EXT4_FC_TAG_INODE:
1973 		case EXT4_FC_TAG_PAD:
1974 			state->fc_cur_tag++;
1975 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1976 					sizeof(tl) + le16_to_cpu(tl.fc_len));
1977 			break;
1978 		case EXT4_FC_TAG_TAIL:
1979 			state->fc_cur_tag++;
1980 			memcpy(&tail, val, sizeof(tail));
1981 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1982 						sizeof(tl) +
1983 						offsetof(struct ext4_fc_tail,
1984 						fc_crc));
1985 			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1986 				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
1987 				state->fc_replay_num_tags = state->fc_cur_tag;
1988 				state->fc_regions_valid =
1989 					state->fc_regions_used;
1990 			} else {
1991 				ret = state->fc_replay_num_tags ?
1992 					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
1993 			}
1994 			state->fc_crc = 0;
1995 			break;
1996 		case EXT4_FC_TAG_HEAD:
1997 			memcpy(&head, val, sizeof(head));
1998 			if (le32_to_cpu(head.fc_features) &
1999 				~EXT4_FC_SUPPORTED_FEATURES) {
2000 				ret = -EOPNOTSUPP;
2001 				break;
2002 			}
2003 			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2004 				ret = JBD2_FC_REPLAY_STOP;
2005 				break;
2006 			}
2007 			state->fc_cur_tag++;
2008 			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2009 					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2010 			break;
2011 		default:
2012 			ret = state->fc_replay_num_tags ?
2013 				JBD2_FC_REPLAY_STOP : -ECANCELED;
2014 		}
2015 		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2016 			break;
2017 	}
2018 
2019 out_err:
2020 	trace_ext4_fc_replay_scan(sb, ret, off);
2021 	return ret;
2022 }
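
/*
 * Editor's note: the scan loop above and the replay loop below share the
 * same tag-length-value walk; struct ext4_fc_tl carries a __le16 tag and a
 * __le16 payload length. A condensed sketch:
 *
 *	struct ext4_fc_tl tl;
 *	u8 *cur, *val;
 *
 *	for (cur = start; cur < end;
 *	     cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
 *		memcpy(&tl, cur, sizeof(tl));	// unaligned-safe header read
 *		val = cur + sizeof(tl);		// fc_len bytes of payload
 *		// dispatch on le16_to_cpu(tl.fc_tag), parsing from val
 *	}
 *
 * Note that fc_len is read from disk and is not validated against `end`
 * before the pointer advance, so these loops trust the on-disk fast commit
 * area to be well formed.
 */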
2023 
2024 /*
2025  * Main recovery path entry point.
2026  * The meaning of the return codes is the same as for ext4_fc_replay_scan().
2027  */
2028 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2029 				enum passtype pass, int off, tid_t expected_tid)
2030 {
2031 	struct super_block *sb = journal->j_private;
2032 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2033 	struct ext4_fc_tl tl;
2034 	__u8 *start, *end, *cur, *val;
2035 	int ret = JBD2_FC_REPLAY_CONTINUE;
2036 	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2037 	struct ext4_fc_tail tail;
2038 
2039 	if (pass == PASS_SCAN) {
2040 		state->fc_current_pass = PASS_SCAN;
2041 		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2042 	}
2043 
2044 	if (state->fc_current_pass != pass) {
2045 		state->fc_current_pass = pass;
2046 		sbi->s_mount_state |= EXT4_FC_REPLAY;
2047 	}
2048 	if (!state->fc_replay_num_tags) {
2049 		jbd_debug(1, "Replay stops\n");
2050 		ext4_fc_set_bitmaps_and_counters(sb);
2051 		return 0;
2052 	}
2053 
2054 #ifdef CONFIG_EXT4_DEBUG
2055 	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2056 		pr_warn("Dropping fc block %d because max_replay set\n", off);
2057 		return JBD2_FC_REPLAY_STOP;
2058 	}
2059 #endif
2060 
2061 	start = (u8 *)bh->b_data;
2062 	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2063 
2064 	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2065 		memcpy(&tl, cur, sizeof(tl));
2066 		val = cur + sizeof(tl);
2067 
2068 		if (state->fc_replay_num_tags == 0) {
2069 			ret = JBD2_FC_REPLAY_STOP;
2070 			ext4_fc_set_bitmaps_and_counters(sb);
2071 			break;
2072 		}
2073 		jbd_debug(3, "Replay phase, tag:%s\n",
2074 				tag2str(le16_to_cpu(tl.fc_tag)));
2075 		state->fc_replay_num_tags--;
2076 		switch (le16_to_cpu(tl.fc_tag)) {
2077 		case EXT4_FC_TAG_LINK:
2078 			ret = ext4_fc_replay_link(sb, &tl, val);
2079 			break;
2080 		case EXT4_FC_TAG_UNLINK:
2081 			ret = ext4_fc_replay_unlink(sb, &tl, val);
2082 			break;
2083 		case EXT4_FC_TAG_ADD_RANGE:
2084 			ret = ext4_fc_replay_add_range(sb, &tl, val);
2085 			break;
2086 		case EXT4_FC_TAG_CREAT:
2087 			ret = ext4_fc_replay_create(sb, &tl, val);
2088 			break;
2089 		case EXT4_FC_TAG_DEL_RANGE:
2090 			ret = ext4_fc_replay_del_range(sb, &tl, val);
2091 			break;
2092 		case EXT4_FC_TAG_INODE:
2093 			ret = ext4_fc_replay_inode(sb, &tl, val);
2094 			break;
2095 		case EXT4_FC_TAG_PAD:
2096 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2097 					     le16_to_cpu(tl.fc_len), 0);
2098 			break;
2099 		case EXT4_FC_TAG_TAIL:
2100 			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2101 					     le16_to_cpu(tl.fc_len), 0);
2102 			memcpy(&tail, val, sizeof(tail));
2103 			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2104 			break;
2105 		case EXT4_FC_TAG_HEAD:
2106 			break;
2107 		default:
2108 			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2109 					     le16_to_cpu(tl.fc_len), 0);
2110 			ret = -ECANCELED;
2111 			break;
2112 		}
2113 		if (ret < 0)
2114 			break;
2115 		ret = JBD2_FC_REPLAY_CONTINUE;
2116 	}
2117 	return ret;
2118 }
2119 
2120 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2121 {
2122 	/*
2123 	 * We set the replay callback even if fast commit is disabled because we
2124 	 * may still have fast commit blocks that need to be replayed even if
2125 	 * fast commit has now been turned off.
2126 	 */
2127 	journal->j_fc_replay_callback = ext4_fc_replay;
2128 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2129 		return;
2130 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2131 }
2132 
2133 static const char *fc_ineligible_reasons[] = {
2134 	"Extended attributes changed",
2135 	"Cross rename",
2136 	"Journal flag changed",
2137 	"Insufficient memory",
2138 	"Swap boot",
2139 	"Resize",
2140 	"Dir renamed",
2141 	"Falloc range op",
2142 	"Data journalling",
2143 	"FC Commit Failed"
2144 };
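
/*
 * The order of the strings above must match the EXT4_FC_REASON_* enum
 * values in ext4.h; ext4_fc_info_show() below indexes this array directly
 * by reason code.
 */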
2145 
2146 int ext4_fc_info_show(struct seq_file *seq, void *v)
2147 {
2148 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2149 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2150 	int i;
2151 
2152 	if (v != SEQ_START_TOKEN)
2153 		return 0;
2154 
2155 	seq_printf(seq,
2156 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2157 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2158 		   stats->fc_numblks,
2159 		   div_u64(stats->s_fc_avg_commit_time, 1000));
2160 	seq_puts(seq, "Ineligible reasons:\n");
2161 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2162 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2163 			stats->fc_ineligible_reason_count[i]);
2164 
2165 	return 0;
2166 }
2167 
2168 int __init ext4_fc_init_dentry_cache(void)
2169 {
2170 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2171 					   SLAB_RECLAIM_ACCOUNT);
2172 
2173 	if (ext4_fc_dentry_cachep == NULL)
2174 		return -ENOMEM;
2175 
2176 	return 0;
2177 }
2178 
2179 void ext4_fc_destroy_dentry_cache(void)
2180 {
2181 	kmem_cache_destroy(ext4_fc_dentry_cachep);
2182 }
2183