xref: /linux/fs/ext4/fast_commit.c (revision 83f1454877cc292b88baf13c829c16ce6937d120)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14 
15 #include <linux/lockdep.h>
16 #include <linux/wait_bit.h>
17 /*
18  * Ext4 Fast Commits
19  * -----------------
20  *
21  * Ext4 fast commits implement fine grained journalling for Ext4.
22  *
23  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
24  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
25  * TLV during the recovery phase. For the scenarios for which we currently
26  * don't have replay code, fast commit falls back to full commits.
27  * Fast commits record delta in one of the following three categories.
28  *
29  * (A) Directory entry updates:
30  *
31  * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
32  * - EXT4_FC_TAG_LINK		- records directory entry link
33  * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
34  *
35  * (B) File specific data range updates:
36  *
37  * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
38  * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
39  *
40  * (C) Inode metadata (mtime / ctime etc):
41  *
42  * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
43  *				  during recovery. Note that iblocks field is
44  *				  not replayed and instead derived during
45  *				  replay.
46  * Commit Operation
47  * ----------------
48  * With fast commits, we maintain all the directory entry operations in the
49  * order in which they are issued in an in-memory queue. This queue is flushed
50  * to disk during the commit operation. We also maintain a list of inodes
51  * that need to be committed during a fast commit in another in memory queue of
52  * inodes. During the commit operation, we commit in the following order:
53  *
54  * [1] Prepare all the inodes to write out their data by setting
55  *     "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
56  *     deleted while it is being flushed.
57  * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
58  *     state.
59  * [3] Lock the journal by calling jbd2_journal_lock_updates(). This ensures
60  *     that all the existing handles finish and no new handles can start.
61  * [4] Mark all the fast commit eligible inodes as undergoing fast commit by
62  *     setting "EXT4_STATE_FC_COMMITTING" state, and snapshot the inode state
63  *     needed for log writing.
64  * [5] Unlock the journal by calling jbd2_journal_unlock_updates(). This allows
65  *     starting of new handles. Updates to inodes being fast committed are
66  *     tracked for requeue rather than blocking.
67  * [6] Commit all the directory entry updates in the fast commit space.
68  * [7] Commit all the changed inodes in the fast commit space.
69  * [8] Write tail tag (this tag ensures the atomicity, please read the following
70  *     section for more details).
71  * [9] Clear "EXT4_STATE_FC_COMMITTING" and wake up waiters in
72  *     ext4_fc_cleanup().
73  *
74  * All the inode updates must be enclosed within jbd2_journal_start()
75  * and jbd2_journal_stop() similar to JBD2 journaling.
76  *
77  * Fast Commit Ineligibility
78  * -------------------------
79  *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(). This makes the next fast commit operation fall
 * back to a full commit.
84  *
85  * Atomicity of commits
86  * --------------------
87  * In order to guarantee atomicity during the commit operation, fast commit
88  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
89  * tag contains CRC of the contents and TID of the transaction after which
90  * this fast commit should be applied. Recovery code replays fast commit
91  * logs only if there's at least 1 valid tail present. For every fast commit
92  * operation, there is 1 tail. This means, we may end up with multiple tails
93  * in the fast commit space. Here's an example:
94  *
95  * - Create a new file A and remove existing file B
96  * - fsync()
97  * - Append contents to file A
98  * - Truncate file A
99  * - fsync()
100  *
101  * The fast commit space at the end of above operations would look like this:
102  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
103  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
104  *
105  * Replay code should thus check for all the valid tails in the FC area.
106  *
107  * Fast Commit Replay Idempotence
108  * ------------------------------
109  *
 * Fast commit tags are idempotent in nature provided the recovery code follows
111  * certain rules. The guiding principle that the commit path follows while
112  * committing is that it stores the result of a particular operation instead of
113  * storing the procedure.
114  *
115  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
116  * was associated with inode 10. During fast commit, instead of storing this
117  * operation as a procedure "rename a to b", we store the resulting file system
118  * state as a "series" of outcomes:
119  *
120  * - Link dirent b to inode 10
121  * - Unlink dirent a
122  * - Inode <10> with valid refcount
123  *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
126  *
127  * Let's take an example of a procedure that is not idempotent and see how fast
128  * commits make it idempotent. Consider following sequence of operations:
129  *
130  *     rm A;    mv B A;    read A
131  *  (x)     (y)        (z)
132  *
133  * (x), (y) and (z) are the points at which we can crash. If we store this
134  * sequence of operations as is then the replay is not idempotent. Let's say
135  * while in replay, we crash at (z). During the second replay, file A (which was
136  * actually created as a result of "mv B A" operation) would get deleted. Thus,
137  * file named A would be absent when we try to read A. So, this sequence of
138  * operations is not idempotent. However, as mentioned above, instead of storing
139  * the procedure fast commits store the outcome of each procedure. Thus the fast
140  * commit log for above procedure would be as follows:
141  *
142  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
143  * inode 11 before the replay)
144  *
145  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
146  * (w)          (x)                    (y)          (z)
147  *
148  * If we crash at (z), we will have file A linked to inode 11. During the second
149  * replay, we will remove file A (inode 11). But we will create it back and make
150  * it point to inode 11. We won't find B, so we'll just skip that step. At this
151  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
152  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
153  * similarly. Thus, by converting a non-idempotent procedure into a series of
154  * idempotent outcomes, fast commits ensured idempotence during the replay.
155  *
156  * Locking
157  * -------
158  * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
159  * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
160  * inode. Most of the code avoids acquiring both the locks, but if one must do
161  * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
162  *
163  * TODOs
164  * -----
165  *
166  * 0) Fast commit replay path hardening: Fast commit replay code should use
167  *    journal handles to make sure all the updates it does during the replay
168  *    path are atomic. With that if we crash during fast commit replay, after
169  *    trying to do recovery again, we will find a file system where fast commit
170  *    area is invalid (because new full commit would be found). In order to deal
171  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
172  *    superblock state is persisted before starting the replay, so that after
173  *    the crash, fast commit recovery code can look at that flag and perform
174  *    fast commit recovery even if that area is invalidated by later full
175  *    commits.
176  *
177  * 1) Handle more ineligible cases.
178  *
179  * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
180  *    status tree. This would get rid of the need to call ext4_fc_track_inode()
181  *    before acquiring i_data_sem. To do that we would need to ensure that
182  *    modified extents from the extent status tree are not evicted from memory.
183  */
184 
185 #include <trace/events/ext4.h>
/*
 * Slab cache from which __track_dentry_update() allocates its
 * struct ext4_fc_dentry_update nodes; ext4_fc_del() frees them back to it.
 */
static struct kmem_cache *ext4_fc_dentry_cachep;
/*
 * NOTE(review): presumably the slab cache for struct ext4_fc_range entries;
 * no user is visible in this part of the file -- confirm.
 */
static struct kmem_cache *ext4_fc_range_cachep;
188 
/*
 * Avoid spending unbounded time/memory snapshotting highly fragmented files
 * under jbd2_journal_lock_updates(). If we exceed this limit, fall back to
 * full commit.
 *
 * NOTE(review): going by the names, the first limit counts inodes and the
 * second counts data ranges; the code enforcing them is not visible in this
 * part of the file, so confirm the exact scope (per commit vs. per inode).
 */
#define EXT4_FC_SNAPSHOT_MAX_INODES	1024
#define EXT4_FC_SNAPSHOT_MAX_RANGES	2048
196 
197 static inline void ext4_fc_set_snap_err(int *snap_err, int err)
198 {
199 	if (snap_err && *snap_err == EXT4_FC_SNAP_ERR_NONE)
200 		*snap_err = err;
201 }
202 
203 static void ext4_end_buffer_io_sync(struct bio *bio)
204 {
205 	struct buffer_head *bh;
206 	bool uptodate = bio_endio_bh(bio, &bh);
207 
208 	BUFFER_TRACE(bh, "");
209 	if (uptodate) {
210 		ext4_debug("%s: Block %lld up-to-date",
211 			   __func__, bh->b_blocknr);
212 		set_buffer_uptodate(bh);
213 	} else {
214 		ext4_debug("%s: Block %lld not up-to-date",
215 			   __func__, bh->b_blocknr);
216 		clear_buffer_uptodate(bh);
217 	}
218 
219 	unlock_buffer(bh);
220 }
221 
/*
 * Forward declaration: ext4_fc_del() below calls this before its definition
 * appears in the file.
 */
static void ext4_fc_free_inode_snap(struct inode *inode);
223 
224 static inline void ext4_fc_reset_inode(struct inode *inode)
225 {
226 	struct ext4_inode_info *ei = EXT4_I(inode);
227 
228 	ei->i_fc_lblk_start = 0;
229 	ei->i_fc_lblk_len = 0;
230 }
231 
232 void ext4_fc_init_inode(struct inode *inode)
233 {
234 	struct ext4_inode_info *ei = EXT4_I(inode);
235 
236 	ext4_fc_reset_inode(inode);
237 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
238 	ext4_clear_inode_state(inode, EXT4_STATE_FC_REQUEUE);
239 	INIT_LIST_HEAD(&ei->i_fc_list);
240 	INIT_LIST_HEAD(&ei->i_fc_dilist);
241 	ei->i_fc_snap = NULL;
242 }
243 
244 static bool ext4_fc_disabled(struct super_block *sb)
245 {
246 	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
247 		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
248 }
249 
250 static bool ext4_fc_eligible(struct super_block *sb)
251 {
252 	return !ext4_fc_disabled(sb) &&
253 		!(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE));
254 }
255 
256 /*
257  * Wait for an inode fast-commit state bit to clear while dropping the
258  * fast-commit lock around schedule().
259  */
260 static void ext4_fc_wait_inode_state(struct inode *inode, int bit,
261 				     int *alloc_ctx)
262 {
263 	wait_queue_head_t *wq;
264 	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
265 	int wait_bit = ext4_inode_state_wait_bit(bit);
266 
267 	while (ext4_test_inode_state(inode, bit)) {
268 		DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
269 
270 		wq = bit_waitqueue(wait_word, wait_bit);
271 		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
272 		if (ext4_test_inode_state(inode, bit)) {
273 			ext4_fc_unlock(inode->i_sb, *alloc_ctx);
274 			schedule();
275 			*alloc_ctx = ext4_fc_lock(inode->i_sb);
276 		}
277 		finish_wait(wq, &wait.wq_entry);
278 	}
279 }
280 
/*
 * Wake tasks sleeping in ext4_fc_wait_inode_state() on state bit @bit of
 * @inode. The word/bit pair is derived the same way as on the wait side.
 */
static inline void ext4_fc_wake_inode_state(struct inode *inode, int bit)
{
	unsigned long *word = ext4_inode_state_wait_word(inode);
	int word_bit = ext4_inode_state_wait_bit(bit);

	wake_up_bit(word, word_bit);
}
286 
287 static void ext4_fc_snap_stats_update_max(atomic64_t *stat, u64 value)
288 {
289 	u64 old = atomic64_read(stat);
290 
291 	while (value > old) {
292 		u64 prev = atomic64_cmpxchg(stat, old, value);
293 
294 		if (prev == old)
295 			break;
296 		old = prev;
297 	}
298 }
299 
/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 *
 * Besides unlinking the inode from the fast commit inode queue, this frees
 * the inode's fast commit snapshot and drops the EXT4_FC_TAG_CREAT dentry
 * update (if any) that is linked to the inode through i_fc_dilist.
 * Does nothing when fast commit is disabled for the file system.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_fc_dentry_update *fc_dentry;
	int alloc_ctx;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	alloc_ctx = ext4_fc_lock(inode->i_sb);
	/* Inode is on neither list: only the snapshot needs to be released. */
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		ext4_fc_free_inode_snap(inode);
		ext4_fc_unlock(inode->i_sb, alloc_ctx);
		return;
	}

	/*
	 * Wait for ongoing fast commit to finish. We cannot remove the inode
	 * from fast commit lists while it is being committed. If we wake from
	 * FC_FLUSHING_DATA, re-check FC_COMMITTING before deleting because the
	 * commit thread sets FC_COMMITTING only after clearing FLUSHING_DATA.
	 *
	 * ext4_fc_wait_inode_state() may drop and re-take the fast commit
	 * lock, updating alloc_ctx; the lock is held whenever it returns.
	 */
	for (;;) {
		ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_COMMITTING,
					 &alloc_ctx);

		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA))
			break;

		ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA,
					 &alloc_ctx);
	}

	ext4_fc_free_inode_snap(inode);
	list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC dentry
	 * create references, since it is not needed to log it anyways.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		ext4_fc_unlock(inode->i_sb, alloc_ctx);
		return;
	}

	/* At most one entry (a CREAT) is expected on i_fc_dilist. */
	fc_dentry = list_first_entry(&ei->i_fc_dilist,
				     struct ext4_fc_dentry_update,
				     fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	ext4_fc_unlock(inode->i_sb, alloc_ctx);

	/* The node is unlinked from both lists; free it outside the lock. */
	release_dentry_name_snapshot(&fc_dentry->fcd_name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
}
362 
363 /*
364  * Mark file system as fast commit ineligible, and record latest
365  * ineligible transaction tid. This means until the recorded
366  * transaction, commit operation would result in a full jbd2 commit.
367  */
368 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
369 {
370 	struct ext4_sb_info *sbi = EXT4_SB(sb);
371 	tid_t tid;
372 	bool has_transaction = true;
373 	bool is_ineligible;
374 	int alloc_ctx;
375 
376 	if (ext4_fc_disabled(sb))
377 		return;
378 
379 	if (!IS_ERR_OR_NULL(handle))
380 		tid = handle->h_transaction->t_tid;
381 	else {
382 		read_lock(&sbi->s_journal->j_state_lock);
383 		if (sbi->s_journal->j_running_transaction)
384 			tid = sbi->s_journal->j_running_transaction->t_tid;
385 		else
386 			has_transaction = false;
387 		read_unlock(&sbi->s_journal->j_state_lock);
388 	}
389 	alloc_ctx = ext4_fc_lock(sb);
390 	is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
391 	if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
392 		sbi->s_fc_ineligible_tid = tid;
393 	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
394 	ext4_fc_unlock(sb, alloc_ctx);
395 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
396 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
397 }
398 
399 /*
400  * Generic fast commit tracking function. If this is the first time this we are
401  * called after a full commit, we initialize fast commit fields and then call
402  * __fc_track_fn() with update = 0. If we have already been called after a full
403  * commit, we pass update = 1. Based on that, the track function can determine
404  * if it needs to track a field for the first time or if it needs to just
405  * update the previously tracked value.
406  *
407  * If enqueue is set, this function enqueues the inode in fast commit list.
408  */
409 static int ext4_fc_track_template(
410 	handle_t *handle, struct inode *inode,
411 	int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
412 	void *args, int enqueue)
413 {
414 	bool update = false;
415 	struct ext4_inode_info *ei = EXT4_I(inode);
416 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
417 	tid_t tid = 0;
418 	int alloc_ctx;
419 	int ret;
420 
421 	tid = handle->h_transaction->t_tid;
422 	spin_lock(&ei->i_fc_lock);
423 	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
424 		ext4_set_inode_state(inode, EXT4_STATE_FC_REQUEUE);
425 	if (tid == ei->i_sync_tid) {
426 		update = true;
427 	} else {
428 		ext4_fc_reset_inode(inode);
429 		ei->i_sync_tid = tid;
430 	}
431 	ret = __fc_track_fn(handle, inode, args, update);
432 	spin_unlock(&ei->i_fc_lock);
433 	if (!enqueue)
434 		return ret;
435 
436 	alloc_ctx = ext4_fc_lock(inode->i_sb);
437 	if (list_empty(&EXT4_I(inode)->i_fc_list))
438 		list_add_tail(&EXT4_I(inode)->i_fc_list,
439 				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
440 				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
441 				&sbi->s_fc_q[FC_Q_STAGING] :
442 				&sbi->s_fc_q[FC_Q_MAIN]);
443 	ext4_fc_unlock(inode->i_sb, alloc_ctx);
444 
445 	return ret;
446 }
447 
/* Argument bundle passed (as void *) to __track_dentry_update(). */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry the operation applies to */
	int op;			/* EXT4_FC_TAG_UNLINK, _LINK or _CREAT */
};
452 
453 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
454 static int __track_dentry_update(handle_t *handle, struct inode *inode,
455 				 void *arg, bool update)
456 {
457 	struct ext4_fc_dentry_update *node;
458 	struct ext4_inode_info *ei = EXT4_I(inode);
459 	struct __track_dentry_update_args *dentry_update =
460 		(struct __track_dentry_update_args *)arg;
461 	struct dentry *dentry = dentry_update->dentry;
462 	struct inode *dir = dentry->d_parent->d_inode;
463 	struct super_block *sb = inode->i_sb;
464 	struct ext4_sb_info *sbi = EXT4_SB(sb);
465 	int alloc_ctx;
466 
467 	spin_unlock(&ei->i_fc_lock);
468 
469 	if (IS_ENCRYPTED(dir)) {
470 		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
471 					handle);
472 		spin_lock(&ei->i_fc_lock);
473 		return -EOPNOTSUPP;
474 	}
475 
476 	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
477 	if (!node) {
478 		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
479 		spin_lock(&ei->i_fc_lock);
480 		return -ENOMEM;
481 	}
482 
483 	node->fcd_op = dentry_update->op;
484 	node->fcd_parent = dir->i_ino;
485 	node->fcd_ino = inode->i_ino;
486 	take_dentry_name_snapshot(&node->fcd_name, dentry);
487 	INIT_LIST_HEAD(&node->fcd_dilist);
488 	INIT_LIST_HEAD(&node->fcd_list);
489 	alloc_ctx = ext4_fc_lock(sb);
490 	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
491 		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
492 		list_add_tail(&node->fcd_list,
493 				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
494 	else
495 		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
496 
497 	/*
498 	 * This helps us keep a track of all fc_dentry updates which is part of
499 	 * this ext4 inode. So in case the inode is getting unlinked, before
500 	 * even we get a chance to fsync, we could remove all fc_dentry
501 	 * references while evicting the inode in ext4_fc_del().
502 	 * Also with this, we don't need to loop over all the inodes in
503 	 * sbi->s_fc_q to get the corresponding inode in
504 	 * ext4_fc_commit_dentry_updates().
505 	 */
506 	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
507 		WARN_ON(!list_empty(&ei->i_fc_dilist));
508 		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
509 	}
510 	ext4_fc_unlock(sb, alloc_ctx);
511 	spin_lock(&ei->i_fc_lock);
512 
513 	return 0;
514 }
515 
516 void __ext4_fc_track_unlink(handle_t *handle,
517 		struct inode *inode, struct dentry *dentry)
518 {
519 	struct __track_dentry_update_args args;
520 	int ret;
521 
522 	args.dentry = dentry;
523 	args.op = EXT4_FC_TAG_UNLINK;
524 
525 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
526 					(void *)&args, 0);
527 	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
528 }
529 
530 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
531 {
532 	struct inode *inode = d_inode(dentry);
533 
534 	if (ext4_fc_eligible(inode->i_sb))
535 		__ext4_fc_track_unlink(handle, inode, dentry);
536 }
537 
538 void __ext4_fc_track_link(handle_t *handle,
539 	struct inode *inode, struct dentry *dentry)
540 {
541 	struct __track_dentry_update_args args;
542 	int ret;
543 
544 	args.dentry = dentry;
545 	args.op = EXT4_FC_TAG_LINK;
546 
547 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
548 					(void *)&args, 0);
549 	trace_ext4_fc_track_link(handle, inode, dentry, ret);
550 }
551 
552 void ext4_fc_track_link(handle_t *handle, struct inode *inode,
553 			struct dentry *dentry)
554 {
555 	if (ext4_fc_eligible(inode->i_sb))
556 		__ext4_fc_track_link(handle, inode, dentry);
557 }
558 
559 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
560 			  struct dentry *dentry)
561 {
562 	struct __track_dentry_update_args args;
563 	int ret;
564 
565 	args.dentry = dentry;
566 	args.op = EXT4_FC_TAG_CREAT;
567 
568 	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
569 					(void *)&args, 0);
570 	trace_ext4_fc_track_create(handle, inode, dentry, ret);
571 }
572 
573 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
574 {
575 	struct inode *inode = d_inode(dentry);
576 
577 	if (ext4_fc_eligible(inode->i_sb))
578 		__ext4_fc_track_create(handle, inode, dentry);
579 }
580 
581 /* __track_fn for inode tracking */
582 static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
583 			 bool update)
584 {
585 	if (update)
586 		return -EEXIST;
587 
588 	EXT4_I(inode)->i_fc_lblk_len = 0;
589 
590 	return 0;
591 }
592 
593 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
594 {
595 	int ret;
596 
597 	if (S_ISDIR(inode->i_mode))
598 		return;
599 
600 	if (ext4_should_journal_data(inode)) {
601 		ext4_fc_mark_ineligible(inode->i_sb,
602 					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
603 		return;
604 	}
605 
606 	if (!ext4_fc_eligible(inode->i_sb))
607 		return;
608 
609 	/*
610 	 * Fast commit snapshots inode state at commit time, so there's no need
611 	 * to wait for EXT4_STATE_FC_COMMITTING here. If the inode is already
612 	 * on the commit queue, ext4_fc_cleanup() will requeue it for the new
613 	 * transaction once the current commit finishes.
614 	 */
615 
616 	/*
617 	 * From this point on, this inode will not be committed either
618 	 * by fast or full commit as long as the handle is open.
619 	 */
620 	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
621 	trace_ext4_fc_track_inode(handle, inode, ret);
622 }
623 
/*
 * Argument bundle passed (as void *) to __track_range(): the logical block
 * range [start, end], both ends inclusive (length is end - start + 1).
 */
struct __track_range_args {
	ext4_lblk_t start, end;
};
627 
628 /* __track_fn for tracking data updates */
629 static int __track_range(handle_t *handle, struct inode *inode, void *arg,
630 			 bool update)
631 {
632 	struct ext4_inode_info *ei = EXT4_I(inode);
633 	ext4_lblk_t oldstart;
634 	struct __track_range_args *__arg =
635 		(struct __track_range_args *)arg;
636 
637 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
638 		ext4_debug("Special inode %llu being modified\n", inode->i_ino);
639 		return -ECANCELED;
640 	}
641 
642 	oldstart = ei->i_fc_lblk_start;
643 
644 	if (update && ei->i_fc_lblk_len > 0) {
645 		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
646 		ei->i_fc_lblk_len =
647 			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
648 				ei->i_fc_lblk_start + 1;
649 	} else {
650 		ei->i_fc_lblk_start = __arg->start;
651 		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
652 	}
653 
654 	return 0;
655 }
656 
657 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
658 			 ext4_lblk_t end)
659 {
660 	struct __track_range_args args;
661 	int ret;
662 
663 	if (S_ISDIR(inode->i_mode))
664 		return;
665 
666 	if (!ext4_fc_eligible(inode->i_sb))
667 		return;
668 
669 	if (ext4_has_inline_data(inode)) {
670 		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
671 					handle);
672 		return;
673 	}
674 
675 	args.start = start;
676 	args.end = end;
677 
678 	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
679 
680 	trace_ext4_fc_track_range(handle, inode, start, end, ret);
681 }
682 
683 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
684 {
685 	blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS;
686 	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
687 
688 	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
689 	if (test_opt(sb, BARRIER) && is_tail)
690 		write_flags |= REQ_FUA | REQ_PREFLUSH;
691 	lock_buffer(bh);
692 	set_buffer_dirty(bh);
693 	set_buffer_uptodate(bh);
694 	bh_submit(bh, REQ_OP_WRITE | write_flags, ext4_end_buffer_io_sync);
695 	EXT4_SB(sb)->s_fc_bh = NULL;
696 }
697 
698 /* Ext4 commit path routines */
699 
/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, the
 * padded block is submitted, a new block is obtained from jbd2 and the
 * CRC is updated to reflect the padding we added.
 *
 * @sb:  super block whose fast commit area is being written.
 * @len: number of bytes to reserve (tag header included).
 * @crc: running checksum; only updated when the current block is padded
 *       out, in which case it is extended over that whole block.
 *
 * Returns a pointer to the reserved bytes inside the current fast commit
 * buffer, or NULL if @len cannot fit in one block next to a PAD tag or if
 * no buffer could be obtained from jbd2.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	/* off: write position within the current block */
	int ret, off = sbi->s_fc_bytes % bsize;
	int remaining;
	u8 *dst;

	/*
	 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
	 * cannot fulfill the request.
	 */
	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
		return NULL;

	/* No current buffer (first use, or the last one was submitted). */
	if (!sbi->s_fc_bh) {
		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
		if (ret)
			return NULL;
		sbi->s_fc_bh = bh;
	}
	dst = sbi->s_fc_bh->b_data + off;

	/*
	 * Allocate the bytes in the current block if we can do so while still
	 * leaving enough space for a PAD tlv.
	 */
	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
	if (len <= remaining) {
		sbi->s_fc_bytes += len;
		return dst;
	}

	/*
	 * Else, terminate the current block with a PAD tlv, then allocate a new
	 * block and allocate the bytes at the start of that new block.
	 */

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	tl.fc_len = cpu_to_le16(remaining);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	/* PAD header plus 'remaining' zero bytes reach exactly the block end. */
	memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
	*crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize);

	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	/* Account for the rest of the old block plus 'len' in the new one. */
	sbi->s_fc_bytes += bsize - off + len;
	return sbi->s_fc_bh->b_data;
}
766 
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 *
 * @sb:  super block whose fast commit area is being written.
 * @crc: checksum accumulated over the fast commit so far.
 *
 * Returns 0 on success, or -ENOSPC if space for the tail could not be
 * reserved.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block for accommodating this tail.
	 */
	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	/* Offset just past the reserved tail within the current block. */
	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
	/* The rest of this block is consumed; next commit starts on a new one. */
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
	dst += sizeof(tail.fc_tid);
	/* Checksum this block up to, but not including, the CRC field itself. */
	crc = ext4_chksum(crc, sbi->s_fc_bh->b_data,
			  dst - (u8 *)sbi->s_fc_bh->b_data);
	tail.fc_crc = cpu_to_le32(crc);
	memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
	dst += sizeof(tail.fc_crc);
	memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */

	ext4_fc_submit_bh(sb, true);

	return 0;
}
813 
814 /*
815  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
816  * Returns false if there's not enough space.
817  */
818 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
819 			   u32 *crc)
820 {
821 	struct ext4_fc_tl tl;
822 	u8 *dst;
823 
824 	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
825 	if (!dst)
826 		return false;
827 
828 	tl.fc_tag = cpu_to_le16(tag);
829 	tl.fc_len = cpu_to_le16(len);
830 
831 	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
832 	memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
833 
834 	return true;
835 }
836 
837 /* Same as above, but adds dentry tlv. */
838 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
839 				   struct ext4_fc_dentry_update *fc_dentry)
840 {
841 	struct ext4_fc_dentry_info fcd;
842 	struct ext4_fc_tl tl;
843 	int dlen = fc_dentry->fcd_name.name.len;
844 	u8 *dst = ext4_fc_reserve_space(sb,
845 			EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
846 
847 	if (!dst)
848 		return false;
849 
850 	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
851 	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
852 	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
853 	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
854 	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
855 	dst += EXT4_FC_TAG_BASE_LEN;
856 	memcpy(dst, &fcd, sizeof(fcd));
857 	dst += sizeof(fcd);
858 	memcpy(dst, fc_dentry->fcd_name.name.name, dlen);
859 
860 	return true;
861 }
862 
/*
 * One data-range record held in an inode snapshot. Each record is turned
 * into one TLV by ext4_fc_write_inode_data().
 */
struct ext4_fc_range {
	struct list_head list;	/* link in ext4_fc_inode_snap::data_list */
	/*
	 * EXT4_FC_TAG_DEL_RANGE produces a del-range TLV; any other value is
	 * written out as an EXT4_FC_TAG_ADD_RANGE TLV.
	 */
	u16 tag;
	ext4_lblk_t lblk;	/* first logical block of the range */
	ext4_lblk_t len;	/* length of the range in blocks */
	ext4_fsblk_t pblk;	/* physical start block (add-range only) */
	bool unwritten;		/* write the extent as unwritten (add-range only) */
};
871 
/*
 * Per-inode snapshot (ei->i_fc_snap) consumed when writing the fast commit
 * log: ext4_fc_write_inode() copies inode_buf out and
 * ext4_fc_write_inode_data() walks data_list.
 */
struct ext4_fc_inode_snap {
	struct list_head data_list;	/* list of struct ext4_fc_range */
	unsigned int inode_len;		/* number of valid bytes in inode_buf */
	u8 inode_buf[];			/* inode bytes written after fc_ino in the INODE TLV */
};
877 
878 /*
879  * Writes inode in the fast commit space under TLV with tag @tag.
880  * Returns 0 on success, error on failure.
881  */
882 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
883 {
884 	struct ext4_inode_info *ei = EXT4_I(inode);
885 	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
886 	struct ext4_fc_snap_stats *stats =
887 		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
888 	struct ext4_fc_inode fc_inode;
889 	struct ext4_fc_tl tl;
890 	u8 *dst;
891 	u8 *src;
892 	int inode_len;
893 	int ret;
894 
895 	if (!snap) {
896 		atomic64_inc(&stats->snap_fail_no_snap);
897 		return -ECANCELED;
898 	}
899 
900 	src = snap->inode_buf;
901 	inode_len = snap->inode_len;
902 	if (!src || inode_len == 0) {
903 		atomic64_inc(&stats->snap_fail_no_snap);
904 		return -ECANCELED;
905 	}
906 
907 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
908 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
909 	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
910 
911 	ret = -ECANCELED;
912 	dst = ext4_fc_reserve_space(inode->i_sb,
913 		EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
914 	if (!dst)
915 		goto err;
916 
917 	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
918 	dst += EXT4_FC_TAG_BASE_LEN;
919 	memcpy(dst, &fc_inode, sizeof(fc_inode));
920 	dst += sizeof(fc_inode);
921 	memcpy(dst, src, inode_len);
922 	ret = 0;
923 err:
924 	return ret;
925 }
926 
927 /*
928  * Writes updated data ranges for the inode in question. Updates CRC.
929  * Returns 0 on success, error otherwise.
930  */
931 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
932 {
933 	struct ext4_inode_info *ei = EXT4_I(inode);
934 	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
935 	struct ext4_fc_snap_stats *stats =
936 		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
937 	struct ext4_fc_add_range fc_ext;
938 	struct ext4_fc_del_range lrange;
939 	struct ext4_extent *ex;
940 	struct ext4_fc_range *range;
941 
942 	if (!snap) {
943 		atomic64_inc(&stats->snap_fail_no_snap);
944 		return -ECANCELED;
945 	}
946 
947 	list_for_each_entry(range, &snap->data_list, list) {
948 		if (range->tag == EXT4_FC_TAG_DEL_RANGE) {
949 			lrange.fc_ino = cpu_to_le32(inode->i_ino);
950 			lrange.fc_lblk = cpu_to_le32(range->lblk);
951 			lrange.fc_len = cpu_to_le32(range->len);
952 			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
953 					     sizeof(lrange), (u8 *)&lrange, crc))
954 				return -ENOSPC;
955 			continue;
956 		}
957 
958 		fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
959 		ex = (struct ext4_extent *)&fc_ext.fc_ex;
960 		ex->ee_block = cpu_to_le32(range->lblk);
961 		ex->ee_len = cpu_to_le16(range->len);
962 		ext4_ext_store_pblock(ex, range->pblk);
963 		if (range->unwritten)
964 			ext4_ext_mark_unwritten(ex);
965 		else
966 			ext4_ext_mark_initialized(ex);
967 
968 		if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
969 				     sizeof(fc_ext), (u8 *)&fc_ext, crc))
970 			return -ENOSPC;
971 	}
972 
973 	return 0;
974 }
975 
976 static void ext4_fc_free_ranges(struct list_head *head)
977 {
978 	struct ext4_fc_range *range, *range_n;
979 
980 	list_for_each_entry_safe(range, range_n, head, list) {
981 		list_del(&range->list);
982 		kmem_cache_free(ext4_fc_range_cachep, range);
983 	}
984 }
985 
986 static void ext4_fc_free_inode_snap(struct inode *inode)
987 {
988 	struct ext4_inode_info *ei = EXT4_I(inode);
989 	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
990 
991 	if (!snap)
992 		return;
993 
994 	ext4_fc_free_ranges(&snap->data_list);
995 	kfree(snap);
996 	ei->i_fc_snap = NULL;
997 }
998 
999 static int ext4_fc_snapshot_inode_data(struct inode *inode,
1000 				       struct list_head *ranges,
1001 				       unsigned int nr_ranges_total,
1002 				       unsigned int *nr_rangesp,
1003 				       int *snap_err)
1004 {
1005 	struct ext4_inode_info *ei = EXT4_I(inode);
1006 	struct ext4_fc_snap_stats *stats =
1007 		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
1008 	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
1009 	unsigned int nr_ranges = 0;
1010 
1011 	spin_lock(&ei->i_fc_lock);
1012 	if (ei->i_fc_lblk_len == 0) {
1013 		spin_unlock(&ei->i_fc_lock);
1014 		if (nr_rangesp)
1015 			*nr_rangesp = 0;
1016 		return 0;
1017 	}
1018 	start_lblk = ei->i_fc_lblk_start;
1019 	end_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
1020 	ei->i_fc_lblk_len = 0;
1021 	spin_unlock(&ei->i_fc_lock);
1022 
1023 	cur_lblk = start_lblk;
1024 	ext4_debug("snapshot data ranges %u-%u for inode %llu\n",
1025 		   start_lblk, end_lblk,
1026 		   (unsigned long long)inode->i_ino);
1027 
1028 	while (cur_lblk <= end_lblk) {
1029 		struct extent_status es;
1030 		struct ext4_fc_range *range;
1031 		ext4_lblk_t len;
1032 		u64 remaining = (u64)end_lblk - cur_lblk + 1;
1033 
1034 		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
1035 			atomic64_inc(&stats->snap_fail_es_miss);
1036 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
1037 			return -EAGAIN;
1038 		}
1039 
1040 		if (ext4_es_is_delayed(&es)) {
1041 			atomic64_inc(&stats->snap_fail_es_delayed);
1042 			ext4_fc_set_snap_err(snap_err,
1043 					     EXT4_FC_SNAP_ERR_ES_DELAYED);
1044 			return -EAGAIN;
1045 		}
1046 
1047 		len = es.es_len - (cur_lblk - es.es_lblk);
1048 		if (len > remaining)
1049 			len = remaining;
1050 		if (len == 0) {
1051 			cur_lblk++;
1052 			continue;
1053 		}
1054 
1055 		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
1056 			atomic64_inc(&stats->snap_fail_ranges_cap);
1057 			ext4_fc_set_snap_err(snap_err,
1058 					     EXT4_FC_SNAP_ERR_RANGES_CAP);
1059 			return -E2BIG;
1060 		}
1061 
1062 		range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
1063 		if (!range) {
1064 			atomic64_inc(&stats->snap_fail_nomem);
1065 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
1066 			return -ENOMEM;
1067 		}
1068 		nr_ranges++;
1069 
1070 		range->lblk = cur_lblk;
1071 		range->len = len;
1072 		range->pblk = 0;
1073 		range->unwritten = false;
1074 
1075 		if (ext4_es_is_hole(&es)) {
1076 			range->tag = EXT4_FC_TAG_DEL_RANGE;
1077 		} else if (ext4_es_is_written(&es) ||
1078 			   ext4_es_is_unwritten(&es)) {
1079 			unsigned int max;
1080 
1081 			range->tag = EXT4_FC_TAG_ADD_RANGE;
1082 			range->pblk = ext4_es_pblock(&es) +
1083 				      (cur_lblk - es.es_lblk);
1084 			range->unwritten = ext4_es_is_unwritten(&es);
1085 
1086 			max = range->unwritten ? EXT_UNWRITTEN_MAX_LEN :
1087 						 EXT_INIT_MAX_LEN;
1088 			if (range->len > max)
1089 				range->len = max;
1090 		} else {
1091 			kmem_cache_free(ext4_fc_range_cachep, range);
1092 			atomic64_inc(&stats->snap_fail_es_other);
1093 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
1094 			return -EAGAIN;
1095 		}
1096 
1097 		INIT_LIST_HEAD(&range->list);
1098 		list_add_tail(&range->list, ranges);
1099 
1100 		if ((u64)range->len > (u64)end_lblk - cur_lblk)
1101 			break;
1102 
1103 		cur_lblk += range->len;
1104 	}
1105 
1106 	if (nr_rangesp)
1107 		*nr_rangesp = nr_ranges;
1108 	return 0;
1109 }
1110 
1111 static int ext4_fc_snapshot_inode(struct inode *inode,
1112 				  unsigned int nr_ranges_total,
1113 				  unsigned int *nr_rangesp, int *snap_err)
1114 {
1115 	struct ext4_inode_info *ei = EXT4_I(inode);
1116 	struct ext4_fc_snap_stats *stats =
1117 		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
1118 	struct ext4_fc_inode_snap *snap;
1119 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
1120 	struct ext4_iloc iloc;
1121 	LIST_HEAD(ranges);
1122 	unsigned int nr_ranges = 0;
1123 	int ret;
1124 	int alloc_ctx;
1125 
1126 	ret = ext4_get_inode_loc_noio(inode, &iloc);
1127 	if (ret) {
1128 		atomic64_inc(&stats->snap_fail_inode_loc);
1129 		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
1130 		return ret;
1131 	}
1132 
1133 	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1134 		inode_len = EXT4_INODE_SIZE(inode->i_sb);
1135 	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
1136 		inode_len += ei->i_extra_isize;
1137 
1138 	snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
1139 	if (!snap) {
1140 		atomic64_inc(&stats->snap_fail_nomem);
1141 		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
1142 		brelse(iloc.bh);
1143 		return -ENOMEM;
1144 	}
1145 	INIT_LIST_HEAD(&snap->data_list);
1146 	snap->inode_len = inode_len;
1147 
1148 	memcpy(snap->inode_buf, (u8 *)ext4_raw_inode(&iloc), inode_len);
1149 	brelse(iloc.bh);
1150 
1151 	ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
1152 					  &nr_ranges, snap_err);
1153 	if (ret) {
1154 		kfree(snap);
1155 		ext4_fc_free_ranges(&ranges);
1156 		return ret;
1157 	}
1158 
1159 	alloc_ctx = ext4_fc_lock(inode->i_sb);
1160 	ext4_fc_free_inode_snap(inode);
1161 	ei->i_fc_snap = snap;
1162 	list_splice_tail_init(&ranges, &snap->data_list);
1163 	ext4_fc_unlock(inode->i_sb, alloc_ctx);
1164 
1165 	atomic64_inc(&stats->snap_inodes);
1166 	atomic64_add(nr_ranges, &stats->snap_ranges);
1167 	if (nr_rangesp)
1168 		*nr_rangesp = nr_ranges;
1169 	return 0;
1170 }
1171 
1172 /* Flushes data of all the inodes in the commit queue. */
1173 static int ext4_fc_flush_data(journal_t *journal)
1174 {
1175 	struct super_block *sb = journal->j_private;
1176 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1177 	struct ext4_inode_info *ei;
1178 	int ret = 0;
1179 
1180 	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1181 		ret = jbd2_submit_inode_data(journal, READ_ONCE(ei->jinode));
1182 		if (ret)
1183 			return ret;
1184 	}
1185 
1186 	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1187 		ret = jbd2_wait_inode_data(journal, READ_ONCE(ei->jinode));
1188 		if (ret)
1189 			return ret;
1190 	}
1191 
1192 	return 0;
1193 }
1194 
1195 /* Commit all the directory entry updates */
1196 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1197 {
1198 	struct super_block *sb = journal->j_private;
1199 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1200 	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1201 	struct inode *inode;
1202 	struct ext4_inode_info *ei;
1203 	int ret;
1204 
1205 	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1206 		return 0;
1207 	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1208 				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1209 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1210 			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
1211 				return -ENOSPC;
1212 			continue;
1213 		}
1214 		/*
1215 		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1216 		 * corresponding inode. Also, the corresponding inode could have been
1217 		 * deleted, in which case, we don't need to do anything.
1218 		 */
1219 		if (list_empty(&fc_dentry->fcd_dilist))
1220 			continue;
1221 		/*
1222 		 * For EXT4_FC_TAG_CREAT, fcd_dilist is linked on the created
1223 		 * inode's i_fc_dilist list (kept singular), so we can recover the
1224 		 * inode through it.
1225 		 */
1226 		ei = list_first_entry(&fc_dentry->fcd_dilist,
1227 				struct ext4_inode_info, i_fc_dilist);
1228 		inode = &ei->vfs_inode;
1229 		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1230 
1231 		/*
1232 		 * We first write the inode and then the create dirent. This
1233 		 * allows the recovery code to create an unnamed inode first
1234 		 * and then link it to a directory entry. This allows us
1235 		 * to use namei.c routines almost as is and simplifies
1236 		 * the recovery code.
1237 		 */
1238 		ret = ext4_fc_write_inode(inode, crc);
1239 		if (ret)
1240 			return ret;
1241 		ret = ext4_fc_write_inode_data(inode, crc);
1242 		if (ret)
1243 			return ret;
1244 		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
1245 			return -ENOSPC;
1246 	}
1247 	return 0;
1248 }
1249 
/*
 * Forward declaration: the function is defined further down in this file
 * but is called from ext4_fc_perform_commit() before its definition.
 */
static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
					 struct inode ***inodesp,
					 unsigned int *nr_inodesp);
1253 
1254 static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
1255 				   unsigned int inodes_size,
1256 				   unsigned int *nr_inodesp,
1257 				   unsigned int *nr_rangesp,
1258 				   int *snap_err)
1259 {
1260 	struct super_block *sb = journal->j_private;
1261 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1262 	struct ext4_inode_info *iter;
1263 	struct ext4_fc_dentry_update *fc_dentry;
1264 	unsigned int i = 0;
1265 	unsigned int idx;
1266 	unsigned int nr_ranges = 0;
1267 	int ret = 0;
1268 	int alloc_ctx;
1269 
1270 	alloc_ctx = ext4_fc_lock(sb);
1271 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1272 		if (i >= inodes_size) {
1273 			atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap);
1274 			ext4_fc_set_snap_err(snap_err,
1275 					     EXT4_FC_SNAP_ERR_INODES_CAP);
1276 			ret = -E2BIG;
1277 			goto unlock;
1278 		}
1279 		inodes[i++] = &iter->vfs_inode;
1280 	}
1281 
1282 	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1283 		struct ext4_inode_info *ei;
1284 		struct inode *inode;
1285 
1286 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
1287 			continue;
1288 		if (list_empty(&fc_dentry->fcd_dilist))
1289 			continue;
1290 
1291 		/* See the comment in ext4_fc_commit_dentry_updates(). */
1292 		ei = list_first_entry(&fc_dentry->fcd_dilist,
1293 				      struct ext4_inode_info, i_fc_dilist);
1294 		inode = &ei->vfs_inode;
1295 		if (!list_empty(&ei->i_fc_list))
1296 			continue;
1297 
1298 		if (i >= inodes_size) {
1299 			atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap);
1300 			ext4_fc_set_snap_err(snap_err,
1301 					     EXT4_FC_SNAP_ERR_INODES_CAP);
1302 			ret = -E2BIG;
1303 			goto unlock;
1304 		}
1305 		/*
1306 		 * Create-only inodes may only be referenced via fcd_dilist and
1307 		 * not appear on s_fc_q[MAIN]. They may hit the last iput while
1308 		 * we are snapshotting, but inode eviction calls ext4_fc_del(),
1309 		 * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING
1310 		 * so the inode stays pinned and the snapshot stays valid until
1311 		 * ext4_fc_cleanup().
1312 		 */
1313 		ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
1314 		inodes[i++] = inode;
1315 	}
1316 unlock:
1317 	ext4_fc_unlock(sb, alloc_ctx);
1318 
1319 	if (ret)
1320 		return ret;
1321 
1322 	for (idx = 0; idx < i; idx++) {
1323 		unsigned int inode_ranges = 0;
1324 
1325 		ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
1326 					     &inode_ranges, snap_err);
1327 		if (ret)
1328 			break;
1329 		nr_ranges += inode_ranges;
1330 	}
1331 
1332 	if (nr_inodesp)
1333 		*nr_inodesp = idx;
1334 	if (nr_rangesp)
1335 		*nr_rangesp = nr_ranges;
1336 	return ret;
1337 }
1338 
/*
 * Performs the body of one fast commit for @commit_tid.
 *
 * The work is done in the numbered steps commented below: flush the data of
 * the queued inodes, snapshot the participating inodes while journal
 * updates are locked, and then write the head tag (first fast commit of the
 * transaction only), the dentry updates, the inode TLVs and the tail tag.
 *
 * Returns 0 on success or a negative error from the first step that fails.
 */
static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct inode **inodes;
	unsigned int inodes_size;
	unsigned int snap_inodes = 0;
	unsigned int snap_ranges = 0;
	int snap_err = EXT4_FC_SNAP_ERR_NONE;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;
	int alloc_ctx;
	ktime_t lock_start;
	u64 locked_ns;

	/*
	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
	 * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
	 * freed until the data flush is over.
	 */
	alloc_ctx = ext4_fc_lock(sb);
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&iter->vfs_inode,
				     EXT4_STATE_FC_FLUSHING_DATA);
	}
	ext4_fc_unlock(sb, alloc_ctx);

	/* Step 2: Flush data for all the eligible inodes. */
	ret = ext4_fc_flush_data(journal);

	/*
	 * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
	 * any error from step 2. This ensures that waiters waiting on
	 * EXT4_STATE_FC_FLUSHING_DATA can resume.
	 */
	alloc_ctx = ext4_fc_lock(sb);
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_FLUSHING_DATA);
		ext4_fc_wake_inode_state(&iter->vfs_inode,
					 EXT4_STATE_FC_FLUSHING_DATA);
	}

	/*
	 * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
	 * the waiter checks the bit. Pairs with implicit barrier in
	 * prepare_to_wait() in ext4_fc_del().
	 */
	smp_mb();
	ext4_fc_unlock(sb, alloc_ctx);

	/*
	 * If we encountered error in Step 2, return it now after clearing
	 * EXT4_STATE_FC_FLUSHING_DATA bit.
	 */
	if (ret)
		return ret;

	/*
	 * Allocate the inode pointer array before journal updates are locked
	 * below; allocation failures are accounted in the snapshot stats.
	 */
	ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
	if (ret) {
		if (ret == -E2BIG)
			atomic64_inc(&snap_stats->snap_fail_inodes_cap);
		else if (ret == -ENOMEM)
			atomic64_inc(&snap_stats->snap_fail_nomem);
		return ret;
	}

	/* Step 4: Mark all inodes as being committed. */
	jbd2_journal_lock_updates(journal);
	/* Start of the interval reported through the lock_updates stats. */
	lock_start = ktime_get();
	/*
	 * The journal is now locked. No more handles can start and all the
	 * previous handles are now drained. Snapshotting happens in this
	 * window so log writing can consume only stable snapshots without
	 * doing logical-to-physical mapping.
	 */
	alloc_ctx = ext4_fc_lock(sb);
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&iter->vfs_inode,
				     EXT4_STATE_FC_COMMITTING);
	}
	ext4_fc_unlock(sb, alloc_ctx);

	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
				      &snap_inodes, &snap_ranges, &snap_err);
	jbd2_journal_unlock_updates(journal);
	/* Record how long journal updates were locked (total, count, max). */
	locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
	atomic64_add(locked_ns, &snap_stats->lock_updates_ns_total);
	atomic64_inc(&snap_stats->lock_updates_samples);
	ext4_fc_snap_stats_update_max(&snap_stats->lock_updates_ns_max,
				      locked_ns);
	if (trace_ext4_fc_lock_updates_enabled())
		trace_call__ext4_fc_lock_updates(sb, commit_tid, locked_ns,
						 snap_inodes, snap_ranges,
						 ret, snap_err);
	/* The array is not used past this point, on success or failure. */
	kvfree(inodes);
	if (ret)
		return ret;

	/*
	 * Step 5: If file system device is different from journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	alloc_ctx = ext4_fc_lock(sb);
	/* Step 6: Write fast commit blocks to disk. */
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Step 6.1: Add a head tag only if this is the first fast
		 * commit in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	/* Step 6.2: Now write all the dentry updates. */
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret)
		goto out;

	/* Step 6.3: Now write all the changed inodes to disk. */
	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		/* Only inodes marked in Step 4 are written. */
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
	}
	/* Step 6.4: Finally write tail tag to conclude this fast commit. */
	ret = ext4_fc_write_tail(sb, crc);

out:
	/* Common exit for Step 6: drop the lock, then finish the plug. */
	ext4_fc_unlock(sb, alloc_ctx);
	blk_finish_plug(&plug);
	return ret;
}
1494 
1495 static unsigned int ext4_fc_count_snapshot_inodes(struct super_block *sb)
1496 {
1497 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1498 	struct ext4_inode_info *iter;
1499 	struct ext4_fc_dentry_update *fc_dentry;
1500 	unsigned int nr_inodes = 0;
1501 	int alloc_ctx;
1502 
1503 	alloc_ctx = ext4_fc_lock(sb);
1504 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list)
1505 		nr_inodes++;
1506 
1507 	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1508 		struct ext4_inode_info *ei;
1509 
1510 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
1511 			continue;
1512 		if (list_empty(&fc_dentry->fcd_dilist))
1513 			continue;
1514 
1515 		/* See the comment in ext4_fc_commit_dentry_updates(). */
1516 		ei = list_first_entry(&fc_dentry->fcd_dilist,
1517 				      struct ext4_inode_info, i_fc_dilist);
1518 		if (!list_empty(&ei->i_fc_list))
1519 			continue;
1520 
1521 		nr_inodes++;
1522 	}
1523 	ext4_fc_unlock(sb, alloc_ctx);
1524 
1525 	return nr_inodes;
1526 }
1527 
1528 static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
1529 					 struct inode ***inodesp,
1530 					 unsigned int *nr_inodesp)
1531 {
1532 	unsigned int nr_inodes = ext4_fc_count_snapshot_inodes(sb);
1533 	struct inode **inodes;
1534 
1535 	*inodesp = NULL;
1536 	*nr_inodesp = 0;
1537 
1538 	if (!nr_inodes)
1539 		return 0;
1540 
1541 	if (nr_inodes > EXT4_FC_SNAPSHOT_MAX_INODES)
1542 		return -E2BIG;
1543 
1544 	inodes = kvcalloc(nr_inodes, sizeof(*inodes), GFP_NOFS);
1545 	if (!inodes)
1546 		return -ENOMEM;
1547 
1548 	*inodesp = inodes;
1549 	*nr_inodesp = nr_inodes;
1550 	return 0;
1551 }
1552 
1553 static void ext4_fc_update_stats(struct super_block *sb, int status,
1554 				 u64 commit_time, int nblks, tid_t commit_tid)
1555 {
1556 	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1557 
1558 	ext4_debug("Fast commit ended with status = %d for tid %u",
1559 			status, commit_tid);
1560 	if (status == EXT4_FC_STATUS_OK) {
1561 		stats->fc_num_commits++;
1562 		stats->fc_numblks += nblks;
1563 		if (likely(stats->s_fc_avg_commit_time))
1564 			stats->s_fc_avg_commit_time =
1565 				(commit_time +
1566 				 stats->s_fc_avg_commit_time * 3) / 4;
1567 		else
1568 			stats->s_fc_avg_commit_time = commit_time;
1569 	} else if (status == EXT4_FC_STATUS_FAILED ||
1570 		   status == EXT4_FC_STATUS_INELIGIBLE) {
1571 		if (status == EXT4_FC_STATUS_FAILED)
1572 			stats->fc_failed_commits++;
1573 		stats->fc_ineligible_commits++;
1574 	} else {
1575 		stats->fc_skipped_commits++;
1576 	}
1577 	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1578 }
1579 
1580 /*
1581  * The main commit entry point. Performs a fast commit for transaction
1582  * commit_tid if needed. If it's not possible to perform a fast commit
1583  * due to various reasons, we fall back to full commit. Returns 0
1584  * on success, error otherwise.
1585  */
1586 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1587 {
1588 	struct super_block *sb = journal->j_private;
1589 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1590 	int nblks = 0, ret, bsize = journal->j_blocksize;
1591 	int subtid = atomic_read(&sbi->s_fc_subtid);
1592 	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1593 	ktime_t start_time, commit_time;
1594 	int old_ioprio, journal_ioprio;
1595 
1596 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1597 		return jbd2_complete_transaction(journal, commit_tid);
1598 
1599 	trace_ext4_fc_commit_start(sb, commit_tid);
1600 
1601 	start_time = ktime_get();
1602 	old_ioprio = get_current_ioprio();
1603 
1604 restart_fc:
1605 	ret = jbd2_fc_begin_commit(journal, commit_tid);
1606 	if (ret == -EALREADY) {
1607 		/* There was an ongoing commit, check if we need to restart */
1608 		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1609 		    tid_gt(commit_tid, journal->j_commit_sequence))
1610 			goto restart_fc;
1611 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1612 				commit_tid);
1613 		return 0;
1614 	} else if (ret) {
1615 		/*
1616 		 * Commit couldn't start. Just update stats and perform a
1617 		 * full commit.
1618 		 */
1619 		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1620 				commit_tid);
1621 		return jbd2_complete_transaction(journal, commit_tid);
1622 	}
1623 
1624 	/*
1625 	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
1626 	 * if we are fast commit ineligible.
1627 	 */
1628 	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1629 		status = EXT4_FC_STATUS_INELIGIBLE;
1630 		goto fallback;
1631 	}
1632 
1633 	/*
1634 	 * Now that we know that this thread is going to do a fast commit,
1635 	 * elevate the priority to match that of the journal thread.
1636 	 */
1637 	if (journal->j_task->io_context)
1638 		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
1639 	else
1640 		journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
1641 	set_task_ioprio(current, journal_ioprio);
1642 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1643 	ret = ext4_fc_perform_commit(journal, commit_tid);
1644 	if (ret < 0) {
1645 		if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
1646 			status = EXT4_FC_STATUS_INELIGIBLE;
1647 		else
1648 			status = EXT4_FC_STATUS_FAILED;
1649 		goto fallback;
1650 	}
1651 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1652 	ret = jbd2_fc_wait_bufs(journal, nblks);
1653 	if (ret < 0) {
1654 		status = EXT4_FC_STATUS_FAILED;
1655 		goto fallback;
1656 	}
1657 	atomic_inc(&sbi->s_fc_subtid);
1658 	ret = jbd2_fc_end_commit(journal);
1659 	set_task_ioprio(current, old_ioprio);
1660 	/*
1661 	 * weight the commit time higher than the average time so we
1662 	 * don't react too strongly to vast changes in the commit time
1663 	 */
1664 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1665 	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1666 	return ret;
1667 
1668 fallback:
1669 	set_task_ioprio(current, old_ioprio);
1670 	ret = jbd2_fc_end_commit_fallback(journal);
1671 	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1672 	return ret;
1673 }
1674 
/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 *
 * Under the fast commit lock this drains the main inode and dentry queues
 * (freeing snapshots, clearing per-inode commit state and waking waiters),
 * then moves the staging queues over to the main queues.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	struct ext4_fc_dentry_update *fc_dentry;
	int alloc_ctx;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	alloc_ctx = ext4_fc_lock(sb);
	/* Drain the main inode queue. */
	while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
		bool requeue;

		ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
					struct ext4_inode_info,
					i_fc_list);
		list_del_init(&ei->i_fc_list);
		ext4_fc_free_inode_snap(&ei->vfs_inode);
		spin_lock(&ei->i_fc_lock);
		/*
		 * After a full commit the inode is requeued only if its
		 * i_sync_tid is newer than the committed tid; after a fast
		 * commit, only if EXT4_STATE_FC_REQUEUE was set on it.
		 */
		if (full)
			requeue = !tid_geq(tid, ei->i_sync_tid);
		else
			requeue = ext4_test_inode_state(&ei->vfs_inode,
							EXT4_STATE_FC_REQUEUE);
		if (!requeue)
			ext4_fc_reset_inode(&ei->vfs_inode);
		ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_REQUEUE);
		ext4_clear_inode_state(&ei->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		spin_unlock(&ei->i_fc_lock);
		if (requeue)
			list_add_tail(&ei->i_fc_list,
				      &sbi->s_fc_q[FC_Q_STAGING]);
		/*
		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
		 * visible before we send the wakeup. Pairs with implicit
		 * barrier in prepare_to_wait() in ext4_fc_del().
		 */
		smp_mb();
		ext4_fc_wake_inode_state(&ei->vfs_inode,
					 EXT4_STATE_FC_COMMITTING);
	}

	/* Drain and free the main dentry update queue. */
	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
						 struct ext4_fc_dentry_update,
						 fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		if (fc_dentry->fcd_op == EXT4_FC_TAG_CREAT &&
			!list_empty(&fc_dentry->fcd_dilist)) {
			/* See the comment in ext4_fc_commit_dentry_updates(). */
			ei = list_first_entry(&fc_dentry->fcd_dilist,
						  struct ext4_inode_info,
						  i_fc_dilist);
			ext4_fc_free_inode_snap(&ei->vfs_inode);
			spin_lock(&ei->i_fc_lock);
			ext4_clear_inode_state(&ei->vfs_inode,
						   EXT4_STATE_FC_REQUEUE);
			ext4_clear_inode_state(&ei->vfs_inode,
						   EXT4_STATE_FC_COMMITTING);
			spin_unlock(&ei->i_fc_lock);
			/*
			 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
			 * visible before we send the wakeup. Pairs with
			 * implicit barrier in prepare_to_wait() in
			 * ext4_fc_del().
			 */
			smp_mb();
			ext4_fc_wake_inode_state(&ei->vfs_inode,
						 EXT4_STATE_FC_COMMITTING);
		}
		list_del_init(&fc_dentry->fcd_dilist);

		release_dentry_name_snapshot(&fc_dentry->fcd_name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
	}

	/* Whatever was staged becomes the main queues. */
	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	/* Drop ineligibility once the recorded ineligible tid is covered. */
	if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	if (full)
		sbi->s_fc_bytes = 0;
	ext4_fc_unlock(sb, alloc_ctx);
	trace_ext4_fc_stats(sb);
}
1776 
1777 /* Ext4 Replay Path Routines */
1778 
/*
 * Decoded, native-endian view of a dentry TLV, filled in by tl_to_darg()
 * for the dentry replay routines.
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of the name at @dname */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the TLV value */
};
1784 
/*
 * Same as struct ext4_fc_tl, but uses native endianness fields.
 * Filled in from the on-disk form by ext4_fc_get_tl().
 */
struct ext4_fc_tl_mem {
	u16 fc_tag;	/* TLV tag, CPU byte order */
	u16 fc_len;	/* TLV length, CPU byte order */
};
1790 
1791 static inline void tl_to_darg(struct dentry_info_args *darg,
1792 			      struct ext4_fc_tl_mem *tl, u8 *val)
1793 {
1794 	struct ext4_fc_dentry_info fcd;
1795 
1796 	memcpy(&fcd, val, sizeof(fcd));
1797 
1798 	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1799 	darg->ino = le32_to_cpu(fcd.fc_ino);
1800 	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1801 	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
1802 }
1803 
1804 static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
1805 {
1806 	struct ext4_fc_tl tl_disk;
1807 
1808 	memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
1809 	tl->fc_len = le16_to_cpu(tl_disk.fc_len);
1810 	tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
1811 }
1812 
1813 /* Unlink replay function */
1814 static int ext4_fc_replay_unlink(struct super_block *sb,
1815 				 struct ext4_fc_tl_mem *tl, u8 *val)
1816 {
1817 	struct inode *inode, *old_parent;
1818 	struct qstr entry;
1819 	struct dentry_info_args darg;
1820 	int ret = 0;
1821 
1822 	tl_to_darg(&darg, tl, val);
1823 
1824 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1825 			darg.parent_ino, darg.dname_len);
1826 
1827 	entry.name = darg.dname;
1828 	entry.len = darg.dname_len;
1829 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1830 
1831 	if (IS_ERR(inode)) {
1832 		ext4_debug("Inode %d not found", darg.ino);
1833 		return 0;
1834 	}
1835 
1836 	old_parent = ext4_iget(sb, darg.parent_ino,
1837 				EXT4_IGET_NORMAL);
1838 	if (IS_ERR(old_parent)) {
1839 		ext4_debug("Dir with inode %d not found", darg.parent_ino);
1840 		iput(inode);
1841 		return 0;
1842 	}
1843 
1844 	ret = __ext4_unlink(old_parent, &entry, inode, NULL);
1845 	/* -ENOENT ok coz it might not exist anymore. */
1846 	if (ret == -ENOENT)
1847 		ret = 0;
1848 	iput(old_parent);
1849 	iput(inode);
1850 	return ret;
1851 }
1852 
1853 static int ext4_fc_replay_link_internal(struct super_block *sb,
1854 				struct dentry_info_args *darg,
1855 				struct inode *inode)
1856 {
1857 	struct inode *dir = NULL;
1858 	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1859 	int ret = 0;
1860 
1861 	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1862 	if (IS_ERR(dir)) {
1863 		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1864 		dir = NULL;
1865 		goto out;
1866 	}
1867 
1868 	ret = __ext4_link(dir, inode, &qstr_dname, NULL);
1869 	/*
1870 	 * It's possible that link already existed since data blocks
1871 	 * for the dir in question got persisted before we crashed OR
1872 	 * we replayed this tag and crashed before the entire replay
1873 	 * could complete.
1874 	 */
1875 	if (ret && ret != -EEXIST) {
1876 		ext4_debug("Failed to link\n");
1877 		goto out;
1878 	}
1879 
1880 	ret = 0;
1881 out:
1882 	if (dir)
1883 		iput(dir);
1884 
1885 	return ret;
1886 }
1887 
1888 /* Link replay function */
1889 static int ext4_fc_replay_link(struct super_block *sb,
1890 			       struct ext4_fc_tl_mem *tl, u8 *val)
1891 {
1892 	struct inode *inode;
1893 	struct dentry_info_args darg;
1894 	int ret = 0;
1895 
1896 	tl_to_darg(&darg, tl, val);
1897 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1898 			darg.parent_ino, darg.dname_len);
1899 
1900 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1901 	if (IS_ERR(inode)) {
1902 		ext4_debug("Inode not found.");
1903 		return 0;
1904 	}
1905 
1906 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1907 	iput(inode);
1908 	return ret;
1909 }
1910 
1911 /*
1912  * Record all the modified inodes during replay. We use this later to setup
1913  * block bitmaps correctly.
1914  */
1915 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1916 {
1917 	struct ext4_fc_replay_state *state;
1918 	int i;
1919 
1920 	state = &EXT4_SB(sb)->s_fc_replay_state;
1921 	for (i = 0; i < state->fc_modified_inodes_used; i++)
1922 		if (state->fc_modified_inodes[i] == ino)
1923 			return 0;
1924 	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1925 		int *fc_modified_inodes;
1926 
1927 		fc_modified_inodes = krealloc(state->fc_modified_inodes,
1928 				sizeof(int) * (state->fc_modified_inodes_size +
1929 				EXT4_FC_REPLAY_REALLOC_INCREMENT),
1930 				GFP_KERNEL);
1931 		if (!fc_modified_inodes)
1932 			return -ENOMEM;
1933 		state->fc_modified_inodes = fc_modified_inodes;
1934 		state->fc_modified_inodes_size +=
1935 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1936 	}
1937 	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1938 	return 0;
1939 }
1940 
1941 /*
1942  * Inode replay function
1943  */
1944 static int ext4_fc_replay_inode(struct super_block *sb,
1945 				struct ext4_fc_tl_mem *tl, u8 *val)
1946 {
1947 	struct ext4_fc_inode fc_inode;
1948 	struct ext4_inode *raw_inode;
1949 	struct ext4_inode *raw_fc_inode;
1950 	struct inode *inode = NULL;
1951 	struct ext4_iloc iloc;
1952 	int inode_len, ino, ret, tag = tl->fc_tag;
1953 	struct ext4_extent_header *eh;
1954 	size_t off_gen = offsetof(struct ext4_inode, i_generation);
1955 
1956 	memcpy(&fc_inode, val, sizeof(fc_inode));
1957 
1958 	ino = le32_to_cpu(fc_inode.fc_ino);
1959 	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1960 
1961 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1962 	if (!IS_ERR(inode)) {
1963 		ext4_ext_clear_bb(inode);
1964 		iput(inode);
1965 	}
1966 	inode = NULL;
1967 
1968 	ret = ext4_fc_record_modified_inode(sb, ino);
1969 	if (ret)
1970 		goto out;
1971 
1972 	raw_fc_inode = (struct ext4_inode *)
1973 		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1974 	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1975 	if (ret)
1976 		goto out;
1977 
1978 	inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
1979 	raw_inode = ext4_raw_inode(&iloc);
1980 
1981 	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1982 	memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
1983 	       inode_len - off_gen);
1984 	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1985 		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1986 		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1987 			memset(eh, 0, sizeof(*eh));
1988 			eh->eh_magic = EXT4_EXT_MAGIC;
1989 			eh->eh_max = cpu_to_le16(
1990 				(sizeof(raw_inode->i_block) -
1991 				 sizeof(struct ext4_extent_header))
1992 				 / sizeof(struct ext4_extent));
1993 		}
1994 	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1995 		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1996 			sizeof(raw_inode->i_block));
1997 	}
1998 
1999 	/* Immediately update the inode on disk. */
2000 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
2001 	if (ret)
2002 		goto out_brelse;
2003 	ret = sync_dirty_buffer(iloc.bh);
2004 	if (ret)
2005 		goto out_brelse;
2006 	ret = ext4_mark_inode_used(sb, ino);
2007 	if (ret)
2008 		goto out_brelse;
2009 
2010 	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
2011 	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
2012 	if (IS_ERR(inode)) {
2013 		ext4_debug("Inode not found.");
2014 		inode = NULL;
2015 		ret = -EFSCORRUPTED;
2016 		goto out_brelse;
2017 	}
2018 
2019 	/*
2020 	 * Our allocator could have made different decisions than before
2021 	 * crashing. This should be fixed but until then, we calculate
2022 	 * the number of blocks the inode.
2023 	 */
2024 	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
2025 		ext4_ext_replay_set_iblocks(inode);
2026 
2027 	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
2028 	ext4_reset_inode_seed(inode);
2029 
2030 	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
2031 	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
2032 	sync_dirty_buffer(iloc.bh);
2033 out_brelse:
2034 	brelse(iloc.bh);
2035 out:
2036 	iput(inode);
2037 	if (!ret)
2038 		blkdev_issue_flush(sb->s_bdev);
2039 
2040 	return ret;
2041 }
2042 
2043 /*
2044  * Dentry create replay function.
2045  *
2046  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
2047  * inode for which we are trying to create a dentry here, should already have
2048  * been replayed before we start here.
2049  */
2050 static int ext4_fc_replay_create(struct super_block *sb,
2051 				 struct ext4_fc_tl_mem *tl, u8 *val)
2052 {
2053 	int ret = 0;
2054 	struct inode *inode = NULL;
2055 	struct inode *dir = NULL;
2056 	struct dentry_info_args darg;
2057 
2058 	tl_to_darg(&darg, tl, val);
2059 
2060 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
2061 			darg.parent_ino, darg.dname_len);
2062 
2063 	/* This takes care of update group descriptor and other metadata */
2064 	ret = ext4_mark_inode_used(sb, darg.ino);
2065 	if (ret)
2066 		goto out;
2067 
2068 	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
2069 	if (IS_ERR(inode)) {
2070 		ext4_debug("inode %d not found.", darg.ino);
2071 		inode = NULL;
2072 		ret = -EINVAL;
2073 		goto out;
2074 	}
2075 
2076 	if (S_ISDIR(inode->i_mode)) {
2077 		/*
2078 		 * If we are creating a directory, we need to make sure that the
2079 		 * dot and dot dot dirents are setup properly.
2080 		 */
2081 		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
2082 		if (IS_ERR(dir)) {
2083 			ext4_debug("Dir %d not found.", darg.ino);
2084 			goto out;
2085 		}
2086 		ret = ext4_init_new_dir(NULL, dir, inode);
2087 		iput(dir);
2088 		if (ret) {
2089 			ret = 0;
2090 			goto out;
2091 		}
2092 	}
2093 	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
2094 	if (ret)
2095 		goto out;
2096 	set_nlink(inode, 1);
2097 	ext4_mark_inode_dirty(NULL, inode);
2098 out:
2099 	iput(inode);
2100 	return ret;
2101 }
2102 
2103 /*
2104  * Record physical disk regions which are in use as per fast commit area,
2105  * and used by inodes during replay phase. Our simple replay phase
2106  * allocator excludes these regions from allocation.
2107  */
2108 int ext4_fc_record_regions(struct super_block *sb, int ino,
2109 		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
2110 {
2111 	struct ext4_fc_replay_state *state;
2112 	struct ext4_fc_alloc_region *region;
2113 
2114 	state = &EXT4_SB(sb)->s_fc_replay_state;
2115 	/*
2116 	 * during replay phase, the fc_regions_valid may not same as
2117 	 * fc_regions_used, update it when do new additions.
2118 	 */
2119 	if (replay && state->fc_regions_used != state->fc_regions_valid)
2120 		state->fc_regions_used = state->fc_regions_valid;
2121 	if (state->fc_regions_used == state->fc_regions_size) {
2122 		struct ext4_fc_alloc_region *fc_regions;
2123 
2124 		fc_regions = krealloc(state->fc_regions,
2125 				      sizeof(struct ext4_fc_alloc_region) *
2126 				      (state->fc_regions_size +
2127 				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
2128 				      GFP_KERNEL);
2129 		if (!fc_regions)
2130 			return -ENOMEM;
2131 		state->fc_regions_size +=
2132 			EXT4_FC_REPLAY_REALLOC_INCREMENT;
2133 		state->fc_regions = fc_regions;
2134 	}
2135 	region = &state->fc_regions[state->fc_regions_used++];
2136 	region->ino = ino;
2137 	region->lblk = lblk;
2138 	region->pblk = pblk;
2139 	region->len = len;
2140 
2141 	if (replay)
2142 		state->fc_regions_valid++;
2143 
2144 	return 0;
2145 }
2146 
/*
 * Replay add range tag
 *
 * @val holds a struct ext4_fc_add_range: an inode number plus one extent
 * (logical start, physical start, length, written/unwritten state). The
 * logged range is walked piece by piece, as reported by ext4_map_blocks(),
 * and each piece is handled in one of three ways:
 *
 *  - not mapped at all: a new extent is inserted,
 *  - mapped to different physical blocks: the mapping is rewritten and the
 *    old blocks are marked free,
 *  - mapped to the logged physical blocks: only the written/unwritten state
 *    is updated.
 *
 * NOTE: this function returns 0 on every path; errors from the helpers
 * abort processing of this tag but are not propagated to the caller.
 */
static int ext4_fc_replay_add_range(struct super_block *sb, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	/* Work on a local copy of the record rather than on @val directly. */
	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
		ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		/* A missing inode is not an error; skip this record. */
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %llu\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);

	/* Each iteration consumes map.m_len blocks starting at cur. */
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, path, 0);
			/*
			 * NOTE(review): on error, path is an ERR_PTR when it
			 * reaches ext4_free_ext_path() at "out" -- this relies
			 * on that helper tolerating error pointers; confirm.
			 */
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			/* Physical block that corresponds to logical block cur. */
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			path = ext4_ext_insert_extent(NULL, inode,
						      path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			if (IS_ERR(path))
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed. This can happen
			 * if this range was removed and then reallocated to
			 * map to new physical blocks during a fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
			goto next;
		}

		/* Range is mapped and needs a state change */
		ext4_debug("Converting from %ld to %d %lld",
				map.m_flags & EXT4_MAP_UNWRITTEN,
			ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	/* Shrink again using the block index derived from i_size. */
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
					sb->s_blocksize_bits);
out:
	ext4_free_ext_path(path);
	iput(inode);
	/* Errors recorded in ret are intentionally not returned here. */
	return 0;
}
2265 
2266 /* Replay DEL_RANGE tag */
2267 static int
2268 ext4_fc_replay_del_range(struct super_block *sb, u8 *val)
2269 {
2270 	struct inode *inode;
2271 	struct ext4_fc_del_range lrange;
2272 	struct ext4_map_blocks map;
2273 	ext4_lblk_t cur, remaining;
2274 	int ret;
2275 
2276 	memcpy(&lrange, val, sizeof(lrange));
2277 	cur = le32_to_cpu(lrange.fc_lblk);
2278 	remaining = le32_to_cpu(lrange.fc_len);
2279 
2280 	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
2281 		le32_to_cpu(lrange.fc_ino), cur, remaining);
2282 
2283 	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
2284 	if (IS_ERR(inode)) {
2285 		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
2286 		return 0;
2287 	}
2288 
2289 	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
2290 	if (ret)
2291 		goto out;
2292 
2293 	ext4_debug("DEL_RANGE, inode %llu, lblk %d, len %d\n",
2294 			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
2295 			le32_to_cpu(lrange.fc_len));
2296 	while (remaining > 0) {
2297 		map.m_lblk = cur;
2298 		map.m_len = remaining;
2299 
2300 		ret = ext4_map_blocks(NULL, inode, &map, 0);
2301 		if (ret < 0)
2302 			goto out;
2303 		if (ret > 0) {
2304 			remaining -= ret;
2305 			cur += ret;
2306 			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
2307 		} else {
2308 			remaining -= map.m_len;
2309 			cur += map.m_len;
2310 		}
2311 	}
2312 
2313 	down_write(&EXT4_I(inode)->i_data_sem);
2314 	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
2315 				le32_to_cpu(lrange.fc_lblk) +
2316 				le32_to_cpu(lrange.fc_len) - 1);
2317 	up_write(&EXT4_I(inode)->i_data_sem);
2318 	if (ret)
2319 		goto out;
2320 	ext4_ext_replay_shrink_inode(inode,
2321 		i_size_read(inode) >> sb->s_blocksize_bits);
2322 	ext4_mark_inode_dirty(NULL, inode);
2323 out:
2324 	iput(inode);
2325 	return 0;
2326 }
2327 
2328 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
2329 {
2330 	struct ext4_fc_replay_state *state;
2331 	struct inode *inode;
2332 	struct ext4_ext_path *path = NULL;
2333 	struct ext4_map_blocks map;
2334 	int i, ret, j;
2335 	ext4_lblk_t cur, end;
2336 
2337 	state = &EXT4_SB(sb)->s_fc_replay_state;
2338 	for (i = 0; i < state->fc_modified_inodes_used; i++) {
2339 		inode = ext4_iget(sb, state->fc_modified_inodes[i],
2340 			EXT4_IGET_NORMAL);
2341 		if (IS_ERR(inode)) {
2342 			ext4_debug("Inode %d not found.",
2343 				state->fc_modified_inodes[i]);
2344 			continue;
2345 		}
2346 		cur = 0;
2347 		end = EXT_MAX_BLOCKS;
2348 		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
2349 			iput(inode);
2350 			continue;
2351 		}
2352 		while (cur < end) {
2353 			map.m_lblk = cur;
2354 			map.m_len = end - cur;
2355 
2356 			ret = ext4_map_blocks(NULL, inode, &map, 0);
2357 			if (ret < 0)
2358 				break;
2359 
2360 			if (ret > 0) {
2361 				path = ext4_find_extent(inode, map.m_lblk, path, 0);
2362 				if (!IS_ERR(path)) {
2363 					for (j = 0; j < path->p_depth; j++)
2364 						ext4_mb_mark_bb(inode->i_sb,
2365 							path[j].p_block, 1, true);
2366 				} else {
2367 					path = NULL;
2368 				}
2369 				cur += ret;
2370 				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
2371 							map.m_len, true);
2372 			} else {
2373 				cur = cur + (map.m_len ? map.m_len : 1);
2374 			}
2375 		}
2376 		iput(inode);
2377 	}
2378 
2379 	ext4_free_ext_path(path);
2380 }
2381 
2382 /*
2383  * Check if block is in excluded regions for block allocation. The simple
2384  * allocator that runs during replay phase is calls this function to see
2385  * if it is okay to use a block.
2386  */
2387 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
2388 {
2389 	int i;
2390 	struct ext4_fc_replay_state *state;
2391 
2392 	state = &EXT4_SB(sb)->s_fc_replay_state;
2393 	for (i = 0; i < state->fc_regions_valid; i++) {
2394 		if (state->fc_regions[i].ino == 0 ||
2395 			state->fc_regions[i].len == 0)
2396 			continue;
2397 		if (in_range(blk, state->fc_regions[i].pblk,
2398 					state->fc_regions[i].len))
2399 			return true;
2400 	}
2401 	return false;
2402 }
2403 
2404 /* Cleanup function called after replay */
2405 void ext4_fc_replay_cleanup(struct super_block *sb)
2406 {
2407 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2408 
2409 	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
2410 	kfree(sbi->s_fc_replay_state.fc_regions);
2411 	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
2412 }
2413 
2414 static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
2415 				      int tag, int len)
2416 {
2417 	switch (tag) {
2418 	case EXT4_FC_TAG_ADD_RANGE:
2419 		return len == sizeof(struct ext4_fc_add_range);
2420 	case EXT4_FC_TAG_DEL_RANGE:
2421 		return len == sizeof(struct ext4_fc_del_range);
2422 	case EXT4_FC_TAG_CREAT:
2423 	case EXT4_FC_TAG_LINK:
2424 	case EXT4_FC_TAG_UNLINK:
2425 		len -= sizeof(struct ext4_fc_dentry_info);
2426 		return len >= 1 && len <= EXT4_NAME_LEN;
2427 	case EXT4_FC_TAG_INODE:
2428 		len -= sizeof(struct ext4_fc_inode);
2429 		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
2430 			len <= sbi->s_inode_size;
2431 	case EXT4_FC_TAG_PAD:
2432 		return true; /* padding can have any length */
2433 	case EXT4_FC_TAG_TAIL:
2434 		return len >= sizeof(struct ext4_fc_tail);
2435 	case EXT4_FC_TAG_HEAD:
2436 		return len == sizeof(struct ext4_fc_head);
2437 	}
2438 	return false;
2439 }
2440 
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that scan has finished and JBD2 can now start replay phase.
 * It returns a negative error to indicate that there was an error. At the end
 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
 * to indicate the number of tags that need to be replayed during the replay
 * phase.
 *
 * @bh is one block of the fast commit area, @off is its index within that
 * area and @expected_tid is the transaction id the HEAD and TAIL tags must
 * carry.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl_mem tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	/* First block of the scan: reset the per-scan state. */
	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			/* presumably equals JBD2_FC_REPLAY_STOP -- confirm */
			return 0;
	}

	/* Blocks must arrive strictly in order. */
	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	/* Walk the TLVs in this block while a tag header still fits. */
	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;
		/*
		 * Reject values that run past the block or whose length does
		 * not match the tag. This only stops the scan cleanly if an
		 * earlier TAIL already validated some tags.
		 */
		if (tl.fc_len > end - val ||
		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
			goto out_err;
		}
		ext4_debug("Scan phase, tag:%s, blk %lld\n",
			   tag2str(tl.fc_tag), bh->b_blocknr);
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_ADD_RANGE:
			/* Remember the physical range so replay avoids it. */
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			/* ret was overwritten above; restore it. */
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			/* Count the tag and fold the whole TLV into the CRC. */
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			/* The CRC covers the tail only up to its fc_crc field. */
			state->fc_crc = ext4_chksum(state->fc_crc, cur,
						EXT4_FC_TAG_BASE_LEN +
						offsetof(struct ext4_fc_tail,
						fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				/*
				 * Everything scanned so far is valid and may
				 * be replayed.
				 */
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				/*
				 * Bad tail: keep what earlier tails validated,
				 * or fail if nothing has been validated yet.
				 */
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			/* The CRC starts from 0 again after every tail. */
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			/* Refuse logs that use features we do not support. */
			if (le32_to_cpu(head.fc_features) &
				~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			/* A head with a different tid ends the scan. */
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		default:
			/* Unknown tag: same policy as an invalid length. */
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
2576 
/*
 * Main recovery path entry point.
 * The meaning of return codes is similar to that of ext4_fc_replay_scan().
 *
 * Installed as the journal's fast commit replay callback. For PASS_SCAN
 * the block is handed to ext4_fc_replay_scan(). For any other pass the
 * TLVs in @bh are dispatched to the per-tag replay handlers until the
 * number of tags validated by the scan phase has been consumed.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl_mem tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	/* First block of a new pass: flag the filesystem as replaying. */
	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	/* Nothing (left) to replay: fix up the bitmaps and stop. */
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		ext4_debug("Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	/* Debug knob: drop every block at or beyond s_fc_debug_max_replay. */
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	/*
	 * NOTE(review): unlike the scan loop, tl.fc_len is not re-validated
	 * here; this presumably relies on the scan phase having vetted the
	 * same tags -- confirm.
	 */
	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;

		/* All validated tags consumed: finish up and stop. */
		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}

		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
		state->fc_replay_num_tags--;
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			/* Padding carries no data; only trace it. */
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     tl.fc_len, 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
					     0, tl.fc_len, 0);
			memcpy(&tail, val, sizeof(tail));
			/* A tid mismatch only warns; replay continues. */
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		/* Handlers return 0 on success; keep asking for more blocks. */
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
2672 
2673 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2674 {
2675 	/*
2676 	 * We set replay callback even if fast commit disabled because we may
2677 	 * could still have fast commit blocks that need to be replayed even if
2678 	 * fast commit has now been turned off.
2679 	 */
2680 	journal->j_fc_replay_callback = ext4_fc_replay;
2681 	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2682 		return;
2683 	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2684 }
2685 
/*
 * Human-readable description for each fast commit ineligibility reason,
 * indexed by the EXT4_FC_REASON_* value. ext4_fc_info_show() prints one
 * line per index in [0, EXT4_FC_REASON_MAX) using these strings.
 *
 * NOTE(review): any EXT4_FC_REASON_* value without an entry here is left
 * as a NULL pointer by the designated initializers; keep this table in
 * sync with the enum.
 */
static const char * const fc_ineligible_reasons[] = {
	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
	[EXT4_FC_REASON_RESIZE] = "Resize",
	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
	[EXT4_FC_REASON_MIGRATE] = "Inode format migration",
	[EXT4_FC_REASON_VERITY] = "fs-verity enable",
	[EXT4_FC_REASON_MOVE_EXT] = "Move extents",
};
2701 
2702 int ext4_fc_info_show(struct seq_file *seq, void *v)
2703 {
2704 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2705 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2706 	struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
2707 	u64 lock_avg_ns = 0;
2708 	u64 lock_updates_samples;
2709 	u64 lock_updates_ns_total;
2710 	u64 lock_updates_ns_max;
2711 	int i;
2712 
2713 	if (v != SEQ_START_TOKEN)
2714 		return 0;
2715 
2716 	lock_updates_samples =
2717 		atomic64_read(&snap_stats->lock_updates_samples);
2718 	lock_updates_ns_total =
2719 		atomic64_read(&snap_stats->lock_updates_ns_total);
2720 	lock_updates_ns_max =
2721 		atomic64_read(&snap_stats->lock_updates_ns_max);
2722 	if (lock_updates_samples)
2723 		lock_avg_ns = div64_u64(lock_updates_ns_total,
2724 					lock_updates_samples);
2725 
2726 	seq_printf(seq,
2727 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2728 		   stats->fc_num_commits, stats->fc_ineligible_commits,
2729 		   stats->fc_numblks,
2730 		   div_u64(stats->s_fc_avg_commit_time, 1000));
2731 	seq_puts(seq, "Ineligible reasons:\n");
2732 	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2733 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2734 			stats->fc_ineligible_reason_count[i]);
2735 
2736 	seq_printf(seq,
2737 		   "Snapshot stats:\n%llu inodes\n%llu ranges\n%lluus lock_updates_avg\n%lluus lock_updates_max\n",
2738 		   atomic64_read(&snap_stats->snap_inodes),
2739 		   atomic64_read(&snap_stats->snap_ranges),
2740 		   div_u64(lock_avg_ns, 1000),
2741 		   div_u64(lock_updates_ns_max, 1000));
2742 	seq_printf(seq,
2743 		   "Snapshot failures:\n%llu es_miss\n%llu es_delayed\n%llu es_other\n%llu inodes_cap\n%llu ranges_cap\n%llu nomem\n%llu inode_loc\n%llu no_snap\n",
2744 		   atomic64_read(&snap_stats->snap_fail_es_miss),
2745 		   atomic64_read(&snap_stats->snap_fail_es_delayed),
2746 		   atomic64_read(&snap_stats->snap_fail_es_other),
2747 		   atomic64_read(&snap_stats->snap_fail_inodes_cap),
2748 		   atomic64_read(&snap_stats->snap_fail_ranges_cap),
2749 		   atomic64_read(&snap_stats->snap_fail_nomem),
2750 		   atomic64_read(&snap_stats->snap_fail_inode_loc),
2751 		   atomic64_read(&snap_stats->snap_fail_no_snap));
2752 
2753 	return 0;
2754 }
2755 
2756 int __init ext4_fc_init_dentry_cache(void)
2757 {
2758 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2759 					   SLAB_RECLAIM_ACCOUNT);
2760 
2761 	if (!ext4_fc_dentry_cachep)
2762 		return -ENOMEM;
2763 
2764 	ext4_fc_range_cachep = KMEM_CACHE(ext4_fc_range, SLAB_RECLAIM_ACCOUNT);
2765 	if (!ext4_fc_range_cachep) {
2766 		kmem_cache_destroy(ext4_fc_dentry_cachep);
2767 		return -ENOMEM;
2768 	}
2769 
2770 	return 0;
2771 }
2772 
/*
 * Destroy the slab caches created by ext4_fc_init_dentry_cache(), in the
 * reverse order of their creation.
 */
void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_range_cachep);
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}
2778