1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 #include <linux/lockdep.h>
16 /*
17 * Ext4 Fast Commits
18 * -----------------
19 *
20 * Ext4 fast commits implement fine grained journalling for Ext4.
21 *
22 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
23 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
24 * TLV during the recovery phase. For the scenarios for which we currently
25 * don't have replay code, fast commit falls back to full commits.
26 * Fast commits record delta in one of the following three categories.
27 *
28 * (A) Directory entry updates:
29 *
30 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
31 * - EXT4_FC_TAG_LINK - records directory entry link
32 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
33 *
34 * (B) File specific data range updates:
35 *
36 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
37 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
38 *
39 * (C) Inode metadata (mtime / ctime etc):
40 *
41 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
42 * during recovery. Note that iblocks field is
43 * not replayed and instead derived during
44 * replay.
45 * Commit Operation
46 * ----------------
47 * With fast commits, we maintain all the directory entry operations in the
48 * order in which they are issued in an in-memory queue. This queue is flushed
49 * to disk during the commit operation. We also maintain a list of inodes
50 * that need to be committed during a fast commit in another in memory queue of
51 * inodes. During the commit operation, we commit in the following order:
52 *
53 * [1] Prepare all the inodes to write out their data by setting
54 * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
55 * deleted while it is being flushed.
56 * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
57 * state.
58 * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
59 * all the existing handles finish and no new handles can start.
60 * [4] Mark all the fast commit eligible inodes as undergoing fast commit
61 * by setting "EXT4_STATE_FC_COMMITTING" state.
62 * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
63 * starting of new handles. If new handles try to start an update on
64 * any of the inodes that are being committed, ext4_fc_track_inode()
65 * will block until those inodes have finished the fast commit.
66 * [6] Commit all the directory entry updates in the fast commit space.
67 * [7] Commit all the changed inodes in the fast commit space and clear
68 * "EXT4_STATE_FC_COMMITTING" for these inodes.
69 * [8] Write tail tag (this tag ensures the atomicity, please read the following
70 * section for more details).
71 *
72 * All the inode updates must be enclosed within jbd2_journal_start()
73 * and jbd2_journal_stop() similar to JBD2 journaling.
74 *
75 * Fast Commit Ineligibility
76 * -------------------------
77 *
78 * Not all operations are supported by fast commits today (e.g extended
79 * attributes). Fast commit ineligibility is marked by calling
80 * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back
81 * to full commit.
82 *
83 * Atomicity of commits
84 * --------------------
85 * In order to guarantee atomicity during the commit operation, fast commit
86 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
87 * tag contains CRC of the contents and TID of the transaction after which
88 * this fast commit should be applied. Recovery code replays fast commit
89 * logs only if there's at least 1 valid tail present. For every fast commit
90 * operation, there is 1 tail. This means, we may end up with multiple tails
91 * in the fast commit space. Here's an example:
92 *
93 * - Create a new file A and remove existing file B
94 * - fsync()
95 * - Append contents to file A
96 * - Truncate file A
97 * - fsync()
98 *
99 * The fast commit space at the end of above operations would look like this:
100 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
101 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
102 *
103 * Replay code should thus check for all the valid tails in the FC area.
104 *
105 * Fast Commit Replay Idempotence
106 * ------------------------------
107 *
108 * Fast commits tags are idempotent in nature provided the recovery code follows
109 * certain rules. The guiding principle that the commit path follows while
110 * committing is that it stores the result of a particular operation instead of
111 * storing the procedure.
112 *
113 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
114 * was associated with inode 10. During fast commit, instead of storing this
115 * operation as a procedure "rename a to b", we store the resulting file system
116 * state as a "series" of outcomes:
117 *
118 * - Link dirent b to inode 10
119 * - Unlink dirent a
120 * - Inode <10> with valid refcount
121 *
122 * Now when recovery code runs, it needs "enforce" this state on the file
123 * system. This is what guarantees idempotence of fast commit replay.
124 *
125 * Let's take an example of a procedure that is not idempotent and see how fast
126 * commits make it idempotent. Consider following sequence of operations:
127 *
128 * rm A; mv B A; read A
129 * (x) (y) (z)
130 *
131 * (x), (y) and (z) are the points at which we can crash. If we store this
132 * sequence of operations as is then the replay is not idempotent. Let's say
133 * while in replay, we crash at (z). During the second replay, file A (which was
134 * actually created as a result of "mv B A" operation) would get deleted. Thus,
135 * file named A would be absent when we try to read A. So, this sequence of
136 * operations is not idempotent. However, as mentioned above, instead of storing
137 * the procedure fast commits store the outcome of each procedure. Thus the fast
138 * commit log for above procedure would be as follows:
139 *
140 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
141 * inode 11 before the replay)
142 *
143 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
144 * (w) (x) (y) (z)
145 *
146 * If we crash at (z), we will have file A linked to inode 11. During the second
147 * replay, we will remove file A (inode 11). But we will create it back and make
148 * it point to inode 11. We won't find B, so we'll just skip that step. At this
149 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
150 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
151 * similarly. Thus, by converting a non-idempotent procedure into a series of
152 * idempotent outcomes, fast commits ensured idempotence during the replay.
153 *
154 * Locking
155 * -------
156 * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
157 * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
158 * inode. Most of the code avoids acquiring both the locks, but if one must do
159 * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
160 *
161 * TODOs
162 * -----
163 *
164 * 0) Fast commit replay path hardening: Fast commit replay code should use
165 * journal handles to make sure all the updates it does during the replay
166 * path are atomic. With that if we crash during fast commit replay, after
167 * trying to do recovery again, we will find a file system where fast commit
168 * area is invalid (because new full commit would be found). In order to deal
169 * with that, fast commit replay code should ensure that the "FC_REPLAY"
170 * superblock state is persisted before starting the replay, so that after
171 * the crash, fast commit recovery code can look at that flag and perform
172 * fast commit recovery even if that area is invalidated by later full
173 * commits.
174 *
175 * 1) Handle more ineligible cases.
176 *
177 * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
178 * status tree. This would get rid of the need to call ext4_fc_track_inode()
179 * before acquiring i_data_sem. To do that we would need to ensure that
180 * modified extents from the extent status tree are not evicted from memory.
181 */
182
183 #include <trace/events/ext4.h>
184 static struct kmem_cache *ext4_fc_dentry_cachep;
185
ext4_end_buffer_io_sync(struct buffer_head * bh,int uptodate)186 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
187 {
188 BUFFER_TRACE(bh, "");
189 if (uptodate) {
190 ext4_debug("%s: Block %lld up-to-date",
191 __func__, bh->b_blocknr);
192 set_buffer_uptodate(bh);
193 } else {
194 ext4_debug("%s: Block %lld not up-to-date",
195 __func__, bh->b_blocknr);
196 clear_buffer_uptodate(bh);
197 }
198
199 unlock_buffer(bh);
200 }
201
ext4_fc_reset_inode(struct inode * inode)202 static inline void ext4_fc_reset_inode(struct inode *inode)
203 {
204 struct ext4_inode_info *ei = EXT4_I(inode);
205
206 ei->i_fc_lblk_start = 0;
207 ei->i_fc_lblk_len = 0;
208 }
209
ext4_fc_init_inode(struct inode * inode)210 void ext4_fc_init_inode(struct inode *inode)
211 {
212 struct ext4_inode_info *ei = EXT4_I(inode);
213
214 ext4_fc_reset_inode(inode);
215 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
216 INIT_LIST_HEAD(&ei->i_fc_list);
217 INIT_LIST_HEAD(&ei->i_fc_dilist);
218 init_waitqueue_head(&ei->i_fc_wait);
219 }
220
ext4_fc_disabled(struct super_block * sb)221 static bool ext4_fc_disabled(struct super_block *sb)
222 {
223 return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
224 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
225 }
226
227 /*
228 * Remove inode from fast commit list. If the inode is being committed
229 * we wait until inode commit is done.
230 */
ext4_fc_del(struct inode * inode)231 void ext4_fc_del(struct inode *inode)
232 {
233 struct ext4_inode_info *ei = EXT4_I(inode);
234 struct ext4_fc_dentry_update *fc_dentry;
235 wait_queue_head_t *wq;
236 int alloc_ctx;
237
238 if (ext4_fc_disabled(inode->i_sb))
239 return;
240
241 alloc_ctx = ext4_fc_lock(inode->i_sb);
242 if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
243 ext4_fc_unlock(inode->i_sb, alloc_ctx);
244 return;
245 }
246
247 /*
248 * Since ext4_fc_del is called from ext4_evict_inode while having a
249 * handle open, there is no need for us to wait here even if a fast
250 * commit is going on. That is because, if this inode is being
251 * committed, ext4_mark_inode_dirty would have waited for inode commit
252 * operation to finish before we come here. So, by the time we come
253 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
254 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
255 * here.
256 *
257 * We may come here without any handles open in the "no_delete" case of
258 * ext4_evict_inode as well. However, if that happens, we first mark the
259 * file system as fast commit ineligible anyway. So, even in that case,
260 * it is okay to remove the inode from the fc list.
261 */
262 WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
263 && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
264 while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
265 #if (BITS_PER_LONG < 64)
266 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
267 EXT4_STATE_FC_FLUSHING_DATA);
268 wq = bit_waitqueue(&ei->i_state_flags,
269 EXT4_STATE_FC_FLUSHING_DATA);
270 #else
271 DEFINE_WAIT_BIT(wait, &ei->i_flags,
272 EXT4_STATE_FC_FLUSHING_DATA);
273 wq = bit_waitqueue(&ei->i_flags,
274 EXT4_STATE_FC_FLUSHING_DATA);
275 #endif
276 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
277 if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
278 ext4_fc_unlock(inode->i_sb, alloc_ctx);
279 schedule();
280 alloc_ctx = ext4_fc_lock(inode->i_sb);
281 }
282 finish_wait(wq, &wait.wq_entry);
283 }
284 list_del_init(&ei->i_fc_list);
285
286 /*
287 * Since this inode is getting removed, let's also remove all FC
288 * dentry create references, since it is not needed to log it anyways.
289 */
290 if (list_empty(&ei->i_fc_dilist)) {
291 ext4_fc_unlock(inode->i_sb, alloc_ctx);
292 return;
293 }
294
295 fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
296 WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
297 list_del_init(&fc_dentry->fcd_list);
298 list_del_init(&fc_dentry->fcd_dilist);
299
300 WARN_ON(!list_empty(&ei->i_fc_dilist));
301 ext4_fc_unlock(inode->i_sb, alloc_ctx);
302
303 release_dentry_name_snapshot(&fc_dentry->fcd_name);
304 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
305 }
306
307 /*
308 * Mark file system as fast commit ineligible, and record latest
309 * ineligible transaction tid. This means until the recorded
310 * transaction, commit operation would result in a full jbd2 commit.
311 */
ext4_fc_mark_ineligible(struct super_block * sb,int reason,handle_t * handle)312 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
313 {
314 struct ext4_sb_info *sbi = EXT4_SB(sb);
315 tid_t tid;
316 bool has_transaction = true;
317 bool is_ineligible;
318 int alloc_ctx;
319
320 if (ext4_fc_disabled(sb))
321 return;
322
323 if (handle && !IS_ERR(handle))
324 tid = handle->h_transaction->t_tid;
325 else {
326 read_lock(&sbi->s_journal->j_state_lock);
327 if (sbi->s_journal->j_running_transaction)
328 tid = sbi->s_journal->j_running_transaction->t_tid;
329 else
330 has_transaction = false;
331 read_unlock(&sbi->s_journal->j_state_lock);
332 }
333 alloc_ctx = ext4_fc_lock(sb);
334 is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
335 if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
336 sbi->s_fc_ineligible_tid = tid;
337 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
338 ext4_fc_unlock(sb, alloc_ctx);
339 WARN_ON(reason >= EXT4_FC_REASON_MAX);
340 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
341 }
342
343 /*
344 * Generic fast commit tracking function. If this is the first time this we are
345 * called after a full commit, we initialize fast commit fields and then call
346 * __fc_track_fn() with update = 0. If we have already been called after a full
347 * commit, we pass update = 1. Based on that, the track function can determine
348 * if it needs to track a field for the first time or if it needs to just
349 * update the previously tracked value.
350 *
351 * If enqueue is set, this function enqueues the inode in fast commit list.
352 */
ext4_fc_track_template(handle_t * handle,struct inode * inode,int (* __fc_track_fn)(handle_t * handle,struct inode *,void *,bool),void * args,int enqueue)353 static int ext4_fc_track_template(
354 handle_t *handle, struct inode *inode,
355 int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
356 void *args, int enqueue)
357 {
358 bool update = false;
359 struct ext4_inode_info *ei = EXT4_I(inode);
360 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
361 tid_t tid = 0;
362 int alloc_ctx;
363 int ret;
364
365 tid = handle->h_transaction->t_tid;
366 spin_lock(&ei->i_fc_lock);
367 if (tid == ei->i_sync_tid) {
368 update = true;
369 } else {
370 ext4_fc_reset_inode(inode);
371 ei->i_sync_tid = tid;
372 }
373 ret = __fc_track_fn(handle, inode, args, update);
374 spin_unlock(&ei->i_fc_lock);
375 if (!enqueue)
376 return ret;
377
378 alloc_ctx = ext4_fc_lock(inode->i_sb);
379 if (list_empty(&EXT4_I(inode)->i_fc_list))
380 list_add_tail(&EXT4_I(inode)->i_fc_list,
381 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
382 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
383 &sbi->s_fc_q[FC_Q_STAGING] :
384 &sbi->s_fc_q[FC_Q_MAIN]);
385 ext4_fc_unlock(inode->i_sb, alloc_ctx);
386
387 return ret;
388 }
389
/* Argument bundle passed through the tracking template for dentry updates. */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry the operation applies to */
	int op;			/* EXT4_FC_TAG_* value recorded as the operation */
};
394
395 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
__track_dentry_update(handle_t * handle,struct inode * inode,void * arg,bool update)396 static int __track_dentry_update(handle_t *handle, struct inode *inode,
397 void *arg, bool update)
398 {
399 struct ext4_fc_dentry_update *node;
400 struct ext4_inode_info *ei = EXT4_I(inode);
401 struct __track_dentry_update_args *dentry_update =
402 (struct __track_dentry_update_args *)arg;
403 struct dentry *dentry = dentry_update->dentry;
404 struct inode *dir = dentry->d_parent->d_inode;
405 struct super_block *sb = inode->i_sb;
406 struct ext4_sb_info *sbi = EXT4_SB(sb);
407 int alloc_ctx;
408
409 spin_unlock(&ei->i_fc_lock);
410
411 if (IS_ENCRYPTED(dir)) {
412 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
413 handle);
414 spin_lock(&ei->i_fc_lock);
415 return -EOPNOTSUPP;
416 }
417
418 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
419 if (!node) {
420 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
421 spin_lock(&ei->i_fc_lock);
422 return -ENOMEM;
423 }
424
425 node->fcd_op = dentry_update->op;
426 node->fcd_parent = dir->i_ino;
427 node->fcd_ino = inode->i_ino;
428 take_dentry_name_snapshot(&node->fcd_name, dentry);
429 INIT_LIST_HEAD(&node->fcd_dilist);
430 INIT_LIST_HEAD(&node->fcd_list);
431 alloc_ctx = ext4_fc_lock(sb);
432 if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
433 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
434 list_add_tail(&node->fcd_list,
435 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
436 else
437 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
438
439 /*
440 * This helps us keep a track of all fc_dentry updates which is part of
441 * this ext4 inode. So in case the inode is getting unlinked, before
442 * even we get a chance to fsync, we could remove all fc_dentry
443 * references while evicting the inode in ext4_fc_del().
444 * Also with this, we don't need to loop over all the inodes in
445 * sbi->s_fc_q to get the corresponding inode in
446 * ext4_fc_commit_dentry_updates().
447 */
448 if (dentry_update->op == EXT4_FC_TAG_CREAT) {
449 WARN_ON(!list_empty(&ei->i_fc_dilist));
450 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
451 }
452 ext4_fc_unlock(sb, alloc_ctx);
453 spin_lock(&ei->i_fc_lock);
454
455 return 0;
456 }
457
/*
 * Queue an EXT4_FC_TAG_UNLINK dentry update for @dentry / @inode and emit
 * the matching tracepoint. Performs no eligibility checks itself and does
 * not enqueue the inode on the fast commit inode queue.
 */
void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = EXT4_FC_TAG_UNLINK,
	};
	int ret;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     &args, 0);
	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}
471
/*
 * Track the unlink of @dentry for fast commit, unless fast commits are
 * disabled or the file system is already marked fast commit ineligible.
 */
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;

	if (ext4_fc_disabled(sb) ||
	    ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_unlink(handle, inode, dentry);
}
484
/*
 * Queue an EXT4_FC_TAG_LINK dentry update for @dentry / @inode and emit
 * the matching tracepoint. Performs no eligibility checks itself and does
 * not enqueue the inode on the fast commit inode queue.
 */
void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = EXT4_FC_TAG_LINK,
	};
	int ret;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     &args, 0);
	trace_ext4_fc_track_link(handle, inode, dentry, ret);
}
498
/*
 * Track the link of @dentry for fast commit, unless fast commits are
 * disabled or the file system is already marked fast commit ineligible.
 */
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;

	if (ext4_fc_disabled(sb) ||
	    ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_link(handle, inode, dentry);
}
511
/*
 * Queue an EXT4_FC_TAG_CREAT dentry update for @dentry / @inode and emit
 * the matching tracepoint. Performs no eligibility checks itself and does
 * not enqueue the inode on the fast commit inode queue.
 */
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = EXT4_FC_TAG_CREAT,
	};
	int ret;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     &args, 0);
	trace_ext4_fc_track_create(handle, inode, dentry, ret);
}
525
/*
 * Track the creation of @dentry for fast commit, unless fast commits are
 * disabled or the file system is already marked fast commit ineligible.
 */
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;

	if (ext4_fc_disabled(sb) ||
	    ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_create(handle, inode, dentry);
}
538
539 /* __track_fn for inode tracking */
__track_inode(handle_t * handle,struct inode * inode,void * arg,bool update)540 static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
541 bool update)
542 {
543 if (update)
544 return -EEXIST;
545
546 EXT4_I(inode)->i_fc_lblk_len = 0;
547
548 return 0;
549 }
550
/*
 * Track @inode for the next fast commit and enqueue it on the fast commit
 * inode queue.
 *
 * Directories are never tracked here, and nothing happens when fast commits
 * are disabled or the file system is already fast commit ineligible. An
 * inode that journals its data makes the file system ineligible instead of
 * being tracked. If the inode is currently being committed
 * (EXT4_STATE_FC_COMMITTING) the caller sleeps until that commit is done.
 */
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct super_block *sb = inode->i_sb;
	wait_queue_head_t *waitq;
	int err;

	if (S_ISDIR(inode->i_mode) || ext4_fc_disabled(sb))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_INODE_JOURNAL_DATA,
					handle);
		return;
	}

	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
		return;

	/*
	 * We may go to sleep below waiting for the inode's commit. The
	 * commit path takes i_data_sem while committing the inode, so we
	 * must not be holding it when we sleep.
	 */
	lockdep_assert_not_held(&ei->i_data_sem);

	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
		waitq = bit_waitqueue(&ei->i_state_flags,
				      EXT4_STATE_FC_COMMITTING);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
		waitq = bit_waitqueue(&ei->i_flags,
				      EXT4_STATE_FC_COMMITTING);
#endif
		prepare_to_wait(waitq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		/* Re-check after queueing so a wakeup is not missed. */
		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			schedule();
		finish_wait(waitq, &wait.wq_entry);
	}

	/*
	 * From here on, and for as long as the handle stays open, neither a
	 * fast commit nor a full commit will commit this inode.
	 */
	err = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(handle, inode, err);
}
604
605 struct __track_range_args {
606 ext4_lblk_t start, end;
607 };
608
609 /* __track_fn for tracking data updates */
__track_range(handle_t * handle,struct inode * inode,void * arg,bool update)610 static int __track_range(handle_t *handle, struct inode *inode, void *arg,
611 bool update)
612 {
613 struct ext4_inode_info *ei = EXT4_I(inode);
614 ext4_lblk_t oldstart;
615 struct __track_range_args *__arg =
616 (struct __track_range_args *)arg;
617
618 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
619 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
620 return -ECANCELED;
621 }
622
623 oldstart = ei->i_fc_lblk_start;
624
625 if (update && ei->i_fc_lblk_len > 0) {
626 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
627 ei->i_fc_lblk_len =
628 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
629 ei->i_fc_lblk_start + 1;
630 } else {
631 ei->i_fc_lblk_start = __arg->start;
632 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
633 }
634
635 return 0;
636 }
637
/*
 * Track a change to logical blocks @start..@end (inclusive) of @inode and
 * enqueue the inode on the fast commit inode queue.
 *
 * Directories are skipped, as is everything when fast commits are disabled
 * or the file system is already fast commit ineligible. An inode with
 * inline data marks the file system ineligible (reason
 * EXT4_FC_REASON_XATTR) instead of being tracked.
 */
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args = {
		.start = start,
		.end = end,
	};
	struct super_block *sb = inode->i_sb;
	int ret;

	if (S_ISDIR(inode->i_mode) || ext4_fc_disabled(sb) ||
	    ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))
		return;

	if (ext4_has_inline_data(inode)) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_XATTR, handle);
		return;
	}

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(handle, inode, start, end, ret);
}
666
/*
 * Submit the current fast commit buffer (sbi->s_fc_bh) for writing and
 * detach it from the superblock info.
 *
 * @is_tail: true when the buffer holds the tail of a fast commit; only
 *           then, and only with the BARRIER mount option set, is the write
 *           issued with REQ_FUA | REQ_PREFLUSH.
 *
 * The buffer is locked here and unlocked by ext4_end_buffer_io_sync() when
 * the I/O completes.
 */
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh = sbi->s_fc_bh;
	blk_opf_t opf = JBD2_JOURNAL_REQ_FLAGS;

	/* Only the tail block is written with FUA and a preflush. */
	if (is_tail && test_opt(sb, BARRIER))
		opf |= REQ_FUA | REQ_PREFLUSH;

	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE | opf, bh);

	sbi->s_fc_bh = NULL;
}
682
683 /* Ext4 commit path routines */
684
685 /*
686 * Allocate len bytes on a fast commit buffer.
687 *
688 * During the commit time this function is used to manage fast commit
689 * block space. We don't split a fast commit log onto different
690 * blocks. So this function makes sure that if there's not enough space
691 * on the current block, the remaining space in the current block is
692 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
693 * new block is from jbd2 and CRC is updated to reflect the padding
694 * we added.
695 */
ext4_fc_reserve_space(struct super_block * sb,int len,u32 * crc)696 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
697 {
698 struct ext4_fc_tl tl;
699 struct ext4_sb_info *sbi = EXT4_SB(sb);
700 struct buffer_head *bh;
701 int bsize = sbi->s_journal->j_blocksize;
702 int ret, off = sbi->s_fc_bytes % bsize;
703 int remaining;
704 u8 *dst;
705
706 /*
707 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
708 * cannot fulfill the request.
709 */
710 if (len > bsize - EXT4_FC_TAG_BASE_LEN)
711 return NULL;
712
713 if (!sbi->s_fc_bh) {
714 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
715 if (ret)
716 return NULL;
717 sbi->s_fc_bh = bh;
718 }
719 dst = sbi->s_fc_bh->b_data + off;
720
721 /*
722 * Allocate the bytes in the current block if we can do so while still
723 * leaving enough space for a PAD tlv.
724 */
725 remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
726 if (len <= remaining) {
727 sbi->s_fc_bytes += len;
728 return dst;
729 }
730
731 /*
732 * Else, terminate the current block with a PAD tlv, then allocate a new
733 * block and allocate the bytes at the start of that new block.
734 */
735
736 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
737 tl.fc_len = cpu_to_le16(remaining);
738 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
739 memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
740 *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize);
741
742 ext4_fc_submit_bh(sb, false);
743
744 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
745 if (ret)
746 return NULL;
747 sbi->s_fc_bh = bh;
748 sbi->s_fc_bytes += bsize - off + len;
749 return sbi->s_fc_bh->b_data;
750 }
751
752 /*
753 * Complete a fast commit by writing tail tag.
754 *
755 * Writing tail tag marks the end of a fast commit. In order to guarantee
756 * atomicity, after writing tail tag, even if there's space remaining
757 * in the block, next commit shouldn't use it. That's why tail tag
758 * has the length as that of the remaining space on the block.
759 */
ext4_fc_write_tail(struct super_block * sb,u32 crc)760 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
761 {
762 struct ext4_sb_info *sbi = EXT4_SB(sb);
763 struct ext4_fc_tl tl;
764 struct ext4_fc_tail tail;
765 int off, bsize = sbi->s_journal->j_blocksize;
766 u8 *dst;
767
768 /*
769 * ext4_fc_reserve_space takes care of allocating an extra block if
770 * there's no enough space on this block for accommodating this tail.
771 */
772 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
773 if (!dst)
774 return -ENOSPC;
775
776 off = sbi->s_fc_bytes % bsize;
777
778 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
779 tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
780 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
781
782 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
783 dst += EXT4_FC_TAG_BASE_LEN;
784 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
785 memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
786 dst += sizeof(tail.fc_tid);
787 crc = ext4_chksum(crc, sbi->s_fc_bh->b_data,
788 dst - (u8 *)sbi->s_fc_bh->b_data);
789 tail.fc_crc = cpu_to_le32(crc);
790 memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
791 dst += sizeof(tail.fc_crc);
792 memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
793
794 ext4_fc_submit_bh(sb, true);
795
796 return 0;
797 }
798
799 /*
800 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
801 * Returns false if there's not enough space.
802 */
ext4_fc_add_tlv(struct super_block * sb,u16 tag,u16 len,u8 * val,u32 * crc)803 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
804 u32 *crc)
805 {
806 struct ext4_fc_tl tl;
807 u8 *dst;
808
809 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
810 if (!dst)
811 return false;
812
813 tl.fc_tag = cpu_to_le16(tag);
814 tl.fc_len = cpu_to_le16(len);
815
816 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
817 memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
818
819 return true;
820 }
821
822 /* Same as above, but adds dentry tlv. */
ext4_fc_add_dentry_tlv(struct super_block * sb,u32 * crc,struct ext4_fc_dentry_update * fc_dentry)823 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
824 struct ext4_fc_dentry_update *fc_dentry)
825 {
826 struct ext4_fc_dentry_info fcd;
827 struct ext4_fc_tl tl;
828 int dlen = fc_dentry->fcd_name.name.len;
829 u8 *dst = ext4_fc_reserve_space(sb,
830 EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
831
832 if (!dst)
833 return false;
834
835 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
836 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
837 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
838 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
839 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
840 dst += EXT4_FC_TAG_BASE_LEN;
841 memcpy(dst, &fcd, sizeof(fcd));
842 dst += sizeof(fcd);
843 memcpy(dst, fc_dentry->fcd_name.name.name, dlen);
844
845 return true;
846 }
847
848 /*
849 * Writes inode in the fast commit space under TLV with tag @tag.
850 * Returns 0 on success, error on failure.
851 */
ext4_fc_write_inode(struct inode * inode,u32 * crc)852 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
853 {
854 struct ext4_inode_info *ei = EXT4_I(inode);
855 int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
856 int ret;
857 struct ext4_iloc iloc;
858 struct ext4_fc_inode fc_inode;
859 struct ext4_fc_tl tl;
860 u8 *dst;
861
862 ret = ext4_get_inode_loc(inode, &iloc);
863 if (ret)
864 return ret;
865
866 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
867 inode_len = EXT4_INODE_SIZE(inode->i_sb);
868 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
869 inode_len += ei->i_extra_isize;
870
871 fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
872 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
873 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
874
875 ret = -ECANCELED;
876 dst = ext4_fc_reserve_space(inode->i_sb,
877 EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
878 if (!dst)
879 goto err;
880
881 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
882 dst += EXT4_FC_TAG_BASE_LEN;
883 memcpy(dst, &fc_inode, sizeof(fc_inode));
884 dst += sizeof(fc_inode);
885 memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
886 ret = 0;
887 err:
888 brelse(iloc.bh);
889 return ret;
890 }
891
892 /*
893 * Writes updated data ranges for the inode in question. Updates CRC.
894 * Returns 0 on success, error otherwise.
895 */
ext4_fc_write_inode_data(struct inode * inode,u32 * crc)896 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
897 {
898 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
899 struct ext4_inode_info *ei = EXT4_I(inode);
900 struct ext4_map_blocks map;
901 struct ext4_fc_add_range fc_ext;
902 struct ext4_fc_del_range lrange;
903 struct ext4_extent *ex;
904 int ret;
905
906 spin_lock(&ei->i_fc_lock);
907 if (ei->i_fc_lblk_len == 0) {
908 spin_unlock(&ei->i_fc_lock);
909 return 0;
910 }
911 old_blk_size = ei->i_fc_lblk_start;
912 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
913 ei->i_fc_lblk_len = 0;
914 spin_unlock(&ei->i_fc_lock);
915
916 cur_lblk_off = old_blk_size;
917 ext4_debug("will try writing %d to %d for inode %ld\n",
918 cur_lblk_off, new_blk_size, inode->i_ino);
919
920 while (cur_lblk_off <= new_blk_size) {
921 map.m_lblk = cur_lblk_off;
922 map.m_len = new_blk_size - cur_lblk_off + 1;
923 ret = ext4_map_blocks(NULL, inode, &map,
924 EXT4_GET_BLOCKS_IO_SUBMIT |
925 EXT4_EX_NOCACHE);
926 if (ret < 0)
927 return -ECANCELED;
928
929 if (map.m_len == 0) {
930 cur_lblk_off++;
931 continue;
932 }
933
934 if (ret == 0) {
935 lrange.fc_ino = cpu_to_le32(inode->i_ino);
936 lrange.fc_lblk = cpu_to_le32(map.m_lblk);
937 lrange.fc_len = cpu_to_le32(map.m_len);
938 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
939 sizeof(lrange), (u8 *)&lrange, crc))
940 return -ENOSPC;
941 } else {
942 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
943 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
944
945 /* Limit the number of blocks in one extent */
946 map.m_len = min(max, map.m_len);
947
948 fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
949 ex = (struct ext4_extent *)&fc_ext.fc_ex;
950 ex->ee_block = cpu_to_le32(map.m_lblk);
951 ex->ee_len = cpu_to_le16(map.m_len);
952 ext4_ext_store_pblock(ex, map.m_pblk);
953 if (map.m_flags & EXT4_MAP_UNWRITTEN)
954 ext4_ext_mark_unwritten(ex);
955 else
956 ext4_ext_mark_initialized(ex);
957 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
958 sizeof(fc_ext), (u8 *)&fc_ext, crc))
959 return -ENOSPC;
960 }
961
962 cur_lblk_off += map.m_len;
963 }
964
965 return 0;
966 }
967
968
969 /* Flushes data of all the inodes in the commit queue. */
ext4_fc_flush_data(journal_t * journal)970 static int ext4_fc_flush_data(journal_t *journal)
971 {
972 struct super_block *sb = journal->j_private;
973 struct ext4_sb_info *sbi = EXT4_SB(sb);
974 struct ext4_inode_info *ei;
975 int ret = 0;
976
977 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
978 ret = jbd2_submit_inode_data(journal, READ_ONCE(ei->jinode));
979 if (ret)
980 return ret;
981 }
982
983 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
984 ret = jbd2_wait_inode_data(journal, READ_ONCE(ei->jinode));
985 if (ret)
986 return ret;
987 }
988
989 return 0;
990 }
991
992 /* Commit all the directory entry updates */
ext4_fc_commit_dentry_updates(journal_t * journal,u32 * crc)993 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
994 {
995 struct super_block *sb = journal->j_private;
996 struct ext4_sb_info *sbi = EXT4_SB(sb);
997 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
998 struct inode *inode;
999 struct ext4_inode_info *ei;
1000 int ret;
1001
1002 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1003 return 0;
1004 list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1005 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1006 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1007 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
1008 return -ENOSPC;
1009 continue;
1010 }
1011 /*
1012 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1013 * corresponding inode. Also, the corresponding inode could have been
1014 * deleted, in which case, we don't need to do anything.
1015 */
1016 if (list_empty(&fc_dentry->fcd_dilist))
1017 continue;
1018 ei = list_first_entry(&fc_dentry->fcd_dilist,
1019 struct ext4_inode_info, i_fc_dilist);
1020 inode = &ei->vfs_inode;
1021 WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1022
1023 /*
1024 * We first write the inode and then the create dirent. This
1025 * allows the recovery code to create an unnamed inode first
1026 * and then link it to a directory entry. This allows us
1027 * to use namei.c routines almost as is and simplifies
1028 * the recovery code.
1029 */
1030 ret = ext4_fc_write_inode(inode, crc);
1031 if (ret)
1032 return ret;
1033 ret = ext4_fc_write_inode_data(inode, crc);
1034 if (ret)
1035 return ret;
1036 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
1037 return -ENOSPC;
1038 }
1039 return 0;
1040 }
1041
/*
 * Carry out one fast commit: flush the data of every inode on
 * s_fc_q[FC_Q_MAIN], mark those inodes as committing, then log the head tag
 * (only when s_fc_bytes is 0), the dentry updates, the inode updates and the
 * tail tag. Returns 0 on success or the error of the first failing step.
 */
static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct list_head *main_q = &sbi->s_fc_q[FC_Q_MAIN];
	struct ext4_inode_info *ei;
	struct ext4_fc_head head;
	struct blk_plug plug;
	struct inode *inode;
	int alloc_ctx;
	u32 crc = 0;
	int ret;

	/*
	 * Step 1: Set EXT4_STATE_FC_FLUSHING_DATA on every inode on
	 * s_fc_q[MAIN]. This prevents these inodes from being freed until
	 * the data flush is over.
	 */
	alloc_ctx = ext4_fc_lock(sb);
	list_for_each_entry(ei, main_q, i_fc_list)
		ext4_set_inode_state(&ei->vfs_inode,
				     EXT4_STATE_FC_FLUSHING_DATA);
	ext4_fc_unlock(sb, alloc_ctx);

	/* Step 2: Flush data for all the eligible inodes. */
	ret = ext4_fc_flush_data(journal);

	/*
	 * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA before acting on any
	 * error from step 2, so that waiters on that bit can resume.
	 */
	alloc_ctx = ext4_fc_lock(sb);
	list_for_each_entry(ei, main_q, i_fc_list) {
		ext4_clear_inode_state(&ei->vfs_inode,
				       EXT4_STATE_FC_FLUSHING_DATA);
#if (BITS_PER_LONG < 64)
		wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
#else
		wake_up_bit(&ei->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
#endif
	}

	/*
	 * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
	 * the waiter checks the bit. Pairs with implicit barrier in
	 * prepare_to_wait() in ext4_fc_del().
	 */
	smp_mb();
	ext4_fc_unlock(sb, alloc_ctx);

	/* Report a step 2 failure only now that the bit has been cleared. */
	if (ret)
		return ret;

	/* Step 4: Mark all inodes as being committed. */
	jbd2_journal_lock_updates(journal);
	/*
	 * The journal is now locked. No more handles can start and all the
	 * previous handles are now drained. We now mark the inodes on the
	 * commit queue as being committed.
	 */
	alloc_ctx = ext4_fc_lock(sb);
	list_for_each_entry(ei, main_q, i_fc_list)
		ext4_set_inode_state(&ei->vfs_inode,
				     EXT4_STATE_FC_COMMITTING);
	ext4_fc_unlock(sb, alloc_ctx);
	jbd2_journal_unlock_updates(journal);

	/*
	 * Step 5: If file system device is different from journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	alloc_ctx = ext4_fc_lock(sb);

	/* Step 6: Write fast commit blocks to disk. */
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Step 6.1: Add a head tag only if this is the first fast
		 * commit in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
				     (u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto unlock;
		}
	}

	/* Step 6.2: Now write all the dentry updates. */
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret)
		goto unlock;

	/* Step 6.3: Now write all the changed inodes to disk. */
	list_for_each_entry(ei, main_q, i_fc_list) {
		inode = &ei->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto unlock;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto unlock;
	}

	/* Step 6.4: Finally write tail tag to conclude this fast commit. */
	ret = ext4_fc_write_tail(sb, crc);

unlock:
	ext4_fc_unlock(sb, alloc_ctx);
	blk_finish_plug(&plug);
	return ret;
}
1167
/*
 * Account the outcome of one fast commit attempt in s_fc_stats and emit the
 * commit-stop tracepoint.
 *
 * A successful commit bumps the commit and block counters and folds
 * @commit_time into the running average with weight 1/4 (or seeds the
 * average when it is still zero). A failed commit counts as both failed and
 * ineligible, an ineligible commit as ineligible only, and any other status
 * as skipped.
 */
static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks, tid_t commit_tid)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	ext4_debug("Fast commit ended with status = %d for tid %u",
		   status, commit_tid);

	switch (status) {
	case EXT4_FC_STATUS_OK:
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
		break;
	case EXT4_FC_STATUS_FAILED:
		stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
		break;
	case EXT4_FC_STATUS_INELIGIBLE:
		stats->fc_ineligible_commits++;
		break;
	default:
		stats->fc_skipped_commits++;
		break;
	}

	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}
1194
1195 /*
1196 * The main commit entry point. Performs a fast commit for transaction
1197 * commit_tid if needed. If it's not possible to perform a fast commit
1198 * due to various reasons, we fall back to full commit. Returns 0
1199 * on success, error otherwise.
1200 */
ext4_fc_commit(journal_t * journal,tid_t commit_tid)1201 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1202 {
1203 struct super_block *sb = journal->j_private;
1204 struct ext4_sb_info *sbi = EXT4_SB(sb);
1205 int nblks = 0, ret, bsize = journal->j_blocksize;
1206 int subtid = atomic_read(&sbi->s_fc_subtid);
1207 int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1208 ktime_t start_time, commit_time;
1209 int old_ioprio, journal_ioprio;
1210
1211 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1212 return jbd2_complete_transaction(journal, commit_tid);
1213
1214 trace_ext4_fc_commit_start(sb, commit_tid);
1215
1216 start_time = ktime_get();
1217 old_ioprio = get_current_ioprio();
1218
1219 restart_fc:
1220 ret = jbd2_fc_begin_commit(journal, commit_tid);
1221 if (ret == -EALREADY) {
1222 /* There was an ongoing commit, check if we need to restart */
1223 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1224 tid_gt(commit_tid, journal->j_commit_sequence))
1225 goto restart_fc;
1226 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1227 commit_tid);
1228 return 0;
1229 } else if (ret) {
1230 /*
1231 * Commit couldn't start. Just update stats and perform a
1232 * full commit.
1233 */
1234 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1235 commit_tid);
1236 return jbd2_complete_transaction(journal, commit_tid);
1237 }
1238
1239 /*
1240 * After establishing journal barrier via jbd2_fc_begin_commit(), check
1241 * if we are fast commit ineligible.
1242 */
1243 if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1244 status = EXT4_FC_STATUS_INELIGIBLE;
1245 goto fallback;
1246 }
1247
1248 /*
1249 * Now that we know that this thread is going to do a fast commit,
1250 * elevate the priority to match that of the journal thread.
1251 */
1252 if (journal->j_task->io_context)
1253 journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
1254 else
1255 journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
1256 set_task_ioprio(current, journal_ioprio);
1257 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1258 ret = ext4_fc_perform_commit(journal);
1259 if (ret < 0) {
1260 status = EXT4_FC_STATUS_FAILED;
1261 goto fallback;
1262 }
1263 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1264 ret = jbd2_fc_wait_bufs(journal, nblks);
1265 if (ret < 0) {
1266 status = EXT4_FC_STATUS_FAILED;
1267 goto fallback;
1268 }
1269 atomic_inc(&sbi->s_fc_subtid);
1270 ret = jbd2_fc_end_commit(journal);
1271 set_task_ioprio(current, old_ioprio);
1272 /*
1273 * weight the commit time higher than the average time so we
1274 * don't react too strongly to vast changes in the commit time
1275 */
1276 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1277 ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1278 return ret;
1279
1280 fallback:
1281 set_task_ioprio(current, old_ioprio);
1282 ret = jbd2_fc_end_commit_fallback(journal);
1283 ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1284 return ret;
1285 }
1286
1287 /*
1288 * Fast commit cleanup routine. This is called after every fast commit and
1289 * full commit. full is true if we are called after a full commit.
1290 */
ext4_fc_cleanup(journal_t * journal,int full,tid_t tid)1291 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1292 {
1293 struct super_block *sb = journal->j_private;
1294 struct ext4_sb_info *sbi = EXT4_SB(sb);
1295 struct ext4_inode_info *ei;
1296 struct ext4_fc_dentry_update *fc_dentry;
1297 int alloc_ctx;
1298
1299 if (full && sbi->s_fc_bh)
1300 sbi->s_fc_bh = NULL;
1301
1302 trace_ext4_fc_cleanup(journal, full, tid);
1303 jbd2_fc_release_bufs(journal);
1304
1305 alloc_ctx = ext4_fc_lock(sb);
1306 while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
1307 ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
1308 struct ext4_inode_info,
1309 i_fc_list);
1310 list_del_init(&ei->i_fc_list);
1311 ext4_clear_inode_state(&ei->vfs_inode,
1312 EXT4_STATE_FC_COMMITTING);
1313 if (tid_geq(tid, ei->i_sync_tid)) {
1314 ext4_fc_reset_inode(&ei->vfs_inode);
1315 } else if (full) {
1316 /*
1317 * We are called after a full commit, inode has been
1318 * modified while the commit was running. Re-enqueue
1319 * the inode into STAGING, which will then be splice
1320 * back into MAIN. This cannot happen during
1321 * fastcommit because the journal is locked all the
1322 * time in that case (and tid doesn't increase so
1323 * tid check above isn't reliable).
1324 */
1325 list_add_tail(&ei->i_fc_list,
1326 &sbi->s_fc_q[FC_Q_STAGING]);
1327 }
1328 /*
1329 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
1330 * visible before we send the wakeup. Pairs with implicit
1331 * barrier in prepare_to_wait() in ext4_fc_track_inode().
1332 */
1333 smp_mb();
1334 #if (BITS_PER_LONG < 64)
1335 wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
1336 #else
1337 wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
1338 #endif
1339 }
1340
1341 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1342 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1343 struct ext4_fc_dentry_update,
1344 fcd_list);
1345 list_del_init(&fc_dentry->fcd_list);
1346 list_del_init(&fc_dentry->fcd_dilist);
1347
1348 release_dentry_name_snapshot(&fc_dentry->fcd_name);
1349 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1350 }
1351
1352 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1353 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1354 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1355 &sbi->s_fc_q[FC_Q_MAIN]);
1356
1357 if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
1358 sbi->s_fc_ineligible_tid = 0;
1359 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1360 }
1361
1362 if (full)
1363 sbi->s_fc_bytes = 0;
1364 ext4_fc_unlock(sb, alloc_ctx);
1365 trace_ext4_fc_stats(sb);
1366 }
1367
1368 /* Ext4 Replay Path Routines */
1369
/*
 * Decoded form of a dentry TLV, used by the dentry replay routines.
 * tl_to_darg() fills every field except inode_len.
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of dname in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the TLV value */
};
1375
1376 /* Same as struct ext4_fc_tl, but uses native endianness fields */
1377 struct ext4_fc_tl_mem {
1378 u16 fc_tag;
1379 u16 fc_len;
1380 };
1381
/*
 * Decode the value of a dentry TLV (@val, described by @tl) into @darg.
 * The name is not copied: darg->dname points into @val.
 */
static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_dentry_info info;

	/* Copy the fixed-size part out of the buffer before decoding it. */
	memcpy(&info, val, sizeof(info));

	darg->ino = le32_to_cpu(info.fc_ino);
	darg->parent_ino = le32_to_cpu(info.fc_parent_ino);

	/* The name occupies whatever follows the fixed-size part. */
	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
}
1394
/*
 * Read the TLV header stored at @val and convert its little-endian tag and
 * length into the native-endian fields of @tl.
 */
static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_tl raw;

	memcpy(&raw, val, EXT4_FC_TAG_BASE_LEN);
	tl->fc_tag = le16_to_cpu(raw.fc_tag);
	tl->fc_len = le16_to_cpu(raw.fc_len);
}
1403
1404 /* Unlink replay function */
ext4_fc_replay_unlink(struct super_block * sb,struct ext4_fc_tl_mem * tl,u8 * val)1405 static int ext4_fc_replay_unlink(struct super_block *sb,
1406 struct ext4_fc_tl_mem *tl, u8 *val)
1407 {
1408 struct inode *inode, *old_parent;
1409 struct qstr entry;
1410 struct dentry_info_args darg;
1411 int ret = 0;
1412
1413 tl_to_darg(&darg, tl, val);
1414
1415 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1416 darg.parent_ino, darg.dname_len);
1417
1418 entry.name = darg.dname;
1419 entry.len = darg.dname_len;
1420 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1421
1422 if (IS_ERR(inode)) {
1423 ext4_debug("Inode %d not found", darg.ino);
1424 return 0;
1425 }
1426
1427 old_parent = ext4_iget(sb, darg.parent_ino,
1428 EXT4_IGET_NORMAL);
1429 if (IS_ERR(old_parent)) {
1430 ext4_debug("Dir with inode %d not found", darg.parent_ino);
1431 iput(inode);
1432 return 0;
1433 }
1434
1435 ret = __ext4_unlink(old_parent, &entry, inode, NULL);
1436 /* -ENOENT ok coz it might not exist anymore. */
1437 if (ret == -ENOENT)
1438 ret = 0;
1439 iput(old_parent);
1440 iput(inode);
1441 return ret;
1442 }
1443
ext4_fc_replay_link_internal(struct super_block * sb,struct dentry_info_args * darg,struct inode * inode)1444 static int ext4_fc_replay_link_internal(struct super_block *sb,
1445 struct dentry_info_args *darg,
1446 struct inode *inode)
1447 {
1448 struct inode *dir = NULL;
1449 struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1450 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1451 int ret = 0;
1452
1453 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1454 if (IS_ERR(dir)) {
1455 ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1456 dir = NULL;
1457 goto out;
1458 }
1459
1460 dentry_dir = d_obtain_alias(dir);
1461 if (IS_ERR(dentry_dir)) {
1462 ext4_debug("Failed to obtain dentry");
1463 dentry_dir = NULL;
1464 goto out;
1465 }
1466
1467 dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1468 if (!dentry_inode) {
1469 ext4_debug("Inode dentry not created.");
1470 ret = -ENOMEM;
1471 goto out;
1472 }
1473
1474 ret = __ext4_link(dir, inode, dentry_inode);
1475 /*
1476 * It's possible that link already existed since data blocks
1477 * for the dir in question got persisted before we crashed OR
1478 * we replayed this tag and crashed before the entire replay
1479 * could complete.
1480 */
1481 if (ret && ret != -EEXIST) {
1482 ext4_debug("Failed to link\n");
1483 goto out;
1484 }
1485
1486 ret = 0;
1487 out:
1488 if (dentry_dir) {
1489 d_drop(dentry_dir);
1490 dput(dentry_dir);
1491 } else if (dir) {
1492 iput(dir);
1493 }
1494 if (dentry_inode) {
1495 d_drop(dentry_inode);
1496 dput(dentry_inode);
1497 }
1498
1499 return ret;
1500 }
1501
1502 /* Link replay function */
ext4_fc_replay_link(struct super_block * sb,struct ext4_fc_tl_mem * tl,u8 * val)1503 static int ext4_fc_replay_link(struct super_block *sb,
1504 struct ext4_fc_tl_mem *tl, u8 *val)
1505 {
1506 struct inode *inode;
1507 struct dentry_info_args darg;
1508 int ret = 0;
1509
1510 tl_to_darg(&darg, tl, val);
1511 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1512 darg.parent_ino, darg.dname_len);
1513
1514 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1515 if (IS_ERR(inode)) {
1516 ext4_debug("Inode not found.");
1517 return 0;
1518 }
1519
1520 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1521 iput(inode);
1522 return ret;
1523 }
1524
1525 /*
1526 * Record all the modified inodes during replay. We use this later to setup
1527 * block bitmaps correctly.
1528 */
ext4_fc_record_modified_inode(struct super_block * sb,int ino)1529 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1530 {
1531 struct ext4_fc_replay_state *state;
1532 int i;
1533
1534 state = &EXT4_SB(sb)->s_fc_replay_state;
1535 for (i = 0; i < state->fc_modified_inodes_used; i++)
1536 if (state->fc_modified_inodes[i] == ino)
1537 return 0;
1538 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1539 int *fc_modified_inodes;
1540
1541 fc_modified_inodes = krealloc(state->fc_modified_inodes,
1542 sizeof(int) * (state->fc_modified_inodes_size +
1543 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1544 GFP_KERNEL);
1545 if (!fc_modified_inodes)
1546 return -ENOMEM;
1547 state->fc_modified_inodes = fc_modified_inodes;
1548 state->fc_modified_inodes_size +=
1549 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1550 }
1551 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1552 return 0;
1553 }
1554
1555 /*
1556 * Inode replay function
1557 */
ext4_fc_replay_inode(struct super_block * sb,struct ext4_fc_tl_mem * tl,u8 * val)1558 static int ext4_fc_replay_inode(struct super_block *sb,
1559 struct ext4_fc_tl_mem *tl, u8 *val)
1560 {
1561 struct ext4_fc_inode fc_inode;
1562 struct ext4_inode *raw_inode;
1563 struct ext4_inode *raw_fc_inode;
1564 struct inode *inode = NULL;
1565 struct ext4_iloc iloc;
1566 int inode_len, ino, ret, tag = tl->fc_tag;
1567 struct ext4_extent_header *eh;
1568 size_t off_gen = offsetof(struct ext4_inode, i_generation);
1569
1570 memcpy(&fc_inode, val, sizeof(fc_inode));
1571
1572 ino = le32_to_cpu(fc_inode.fc_ino);
1573 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1574
1575 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1576 if (!IS_ERR(inode)) {
1577 ext4_ext_clear_bb(inode);
1578 iput(inode);
1579 }
1580 inode = NULL;
1581
1582 ret = ext4_fc_record_modified_inode(sb, ino);
1583 if (ret)
1584 goto out;
1585
1586 raw_fc_inode = (struct ext4_inode *)
1587 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1588 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1589 if (ret)
1590 goto out;
1591
1592 inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
1593 raw_inode = ext4_raw_inode(&iloc);
1594
1595 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1596 memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
1597 inode_len - off_gen);
1598 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1599 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1600 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1601 memset(eh, 0, sizeof(*eh));
1602 eh->eh_magic = EXT4_EXT_MAGIC;
1603 eh->eh_max = cpu_to_le16(
1604 (sizeof(raw_inode->i_block) -
1605 sizeof(struct ext4_extent_header))
1606 / sizeof(struct ext4_extent));
1607 }
1608 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1609 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1610 sizeof(raw_inode->i_block));
1611 }
1612
1613 /* Immediately update the inode on disk. */
1614 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1615 if (ret)
1616 goto out_brelse;
1617 ret = sync_dirty_buffer(iloc.bh);
1618 if (ret)
1619 goto out_brelse;
1620 ret = ext4_mark_inode_used(sb, ino);
1621 if (ret)
1622 goto out_brelse;
1623
1624 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1625 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1626 if (IS_ERR(inode)) {
1627 ext4_debug("Inode not found.");
1628 inode = NULL;
1629 ret = -EFSCORRUPTED;
1630 goto out_brelse;
1631 }
1632
1633 /*
1634 * Our allocator could have made different decisions than before
1635 * crashing. This should be fixed but until then, we calculate
1636 * the number of blocks the inode.
1637 */
1638 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1639 ext4_ext_replay_set_iblocks(inode);
1640
1641 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1642 ext4_reset_inode_seed(inode);
1643
1644 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1645 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1646 sync_dirty_buffer(iloc.bh);
1647 out_brelse:
1648 brelse(iloc.bh);
1649 out:
1650 iput(inode);
1651 if (!ret)
1652 blkdev_issue_flush(sb->s_bdev);
1653
1654 return ret;
1655 }
1656
1657 /*
1658 * Dentry create replay function.
1659 *
1660 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1661 * inode for which we are trying to create a dentry here, should already have
1662 * been replayed before we start here.
1663 */
ext4_fc_replay_create(struct super_block * sb,struct ext4_fc_tl_mem * tl,u8 * val)1664 static int ext4_fc_replay_create(struct super_block *sb,
1665 struct ext4_fc_tl_mem *tl, u8 *val)
1666 {
1667 int ret = 0;
1668 struct inode *inode = NULL;
1669 struct inode *dir = NULL;
1670 struct dentry_info_args darg;
1671
1672 tl_to_darg(&darg, tl, val);
1673
1674 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1675 darg.parent_ino, darg.dname_len);
1676
1677 /* This takes care of update group descriptor and other metadata */
1678 ret = ext4_mark_inode_used(sb, darg.ino);
1679 if (ret)
1680 goto out;
1681
1682 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1683 if (IS_ERR(inode)) {
1684 ext4_debug("inode %d not found.", darg.ino);
1685 inode = NULL;
1686 ret = -EINVAL;
1687 goto out;
1688 }
1689
1690 if (S_ISDIR(inode->i_mode)) {
1691 /*
1692 * If we are creating a directory, we need to make sure that the
1693 * dot and dot dot dirents are setup properly.
1694 */
1695 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1696 if (IS_ERR(dir)) {
1697 ext4_debug("Dir %d not found.", darg.ino);
1698 goto out;
1699 }
1700 ret = ext4_init_new_dir(NULL, dir, inode);
1701 iput(dir);
1702 if (ret) {
1703 ret = 0;
1704 goto out;
1705 }
1706 }
1707 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1708 if (ret)
1709 goto out;
1710 set_nlink(inode, 1);
1711 ext4_mark_inode_dirty(NULL, inode);
1712 out:
1713 iput(inode);
1714 return ret;
1715 }
1716
1717 /*
1718 * Record physical disk regions which are in use as per fast commit area,
1719 * and used by inodes during replay phase. Our simple replay phase
1720 * allocator excludes these regions from allocation.
1721 */
ext4_fc_record_regions(struct super_block * sb,int ino,ext4_lblk_t lblk,ext4_fsblk_t pblk,int len,int replay)1722 int ext4_fc_record_regions(struct super_block *sb, int ino,
1723 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1724 {
1725 struct ext4_fc_replay_state *state;
1726 struct ext4_fc_alloc_region *region;
1727
1728 state = &EXT4_SB(sb)->s_fc_replay_state;
1729 /*
1730 * during replay phase, the fc_regions_valid may not same as
1731 * fc_regions_used, update it when do new additions.
1732 */
1733 if (replay && state->fc_regions_used != state->fc_regions_valid)
1734 state->fc_regions_used = state->fc_regions_valid;
1735 if (state->fc_regions_used == state->fc_regions_size) {
1736 struct ext4_fc_alloc_region *fc_regions;
1737
1738 fc_regions = krealloc(state->fc_regions,
1739 sizeof(struct ext4_fc_alloc_region) *
1740 (state->fc_regions_size +
1741 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1742 GFP_KERNEL);
1743 if (!fc_regions)
1744 return -ENOMEM;
1745 state->fc_regions_size +=
1746 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1747 state->fc_regions = fc_regions;
1748 }
1749 region = &state->fc_regions[state->fc_regions_used++];
1750 region->ino = ino;
1751 region->lblk = lblk;
1752 region->pblk = pblk;
1753 region->len = len;
1754
1755 if (replay)
1756 state->fc_regions_valid++;
1757
1758 return 0;
1759 }
1760
1761 /* Replay add range tag */
ext4_fc_replay_add_range(struct super_block * sb,struct ext4_fc_tl_mem * tl,u8 * val)1762 static int ext4_fc_replay_add_range(struct super_block *sb,
1763 struct ext4_fc_tl_mem *tl, u8 *val)
1764 {
1765 struct ext4_fc_add_range fc_add_ex;
1766 struct ext4_extent newex, *ex;
1767 struct inode *inode;
1768 ext4_lblk_t start, cur;
1769 int remaining, len;
1770 ext4_fsblk_t start_pblk;
1771 struct ext4_map_blocks map;
1772 struct ext4_ext_path *path = NULL;
1773 int ret;
1774
1775 memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1776 ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1777
1778 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1779 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1780 ext4_ext_get_actual_len(ex));
1781
1782 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1783 if (IS_ERR(inode)) {
1784 ext4_debug("Inode not found.");
1785 return 0;
1786 }
1787
1788 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1789 if (ret)
1790 goto out;
1791
1792 start = le32_to_cpu(ex->ee_block);
1793 start_pblk = ext4_ext_pblock(ex);
1794 len = ext4_ext_get_actual_len(ex);
1795
1796 cur = start;
1797 remaining = len;
1798 ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1799 start, start_pblk, len, ext4_ext_is_unwritten(ex),
1800 inode->i_ino);
1801
1802 while (remaining > 0) {
1803 map.m_lblk = cur;
1804 map.m_len = remaining;
1805 map.m_pblk = 0;
1806 ret = ext4_map_blocks(NULL, inode, &map, 0);
1807
1808 if (ret < 0)
1809 goto out;
1810
1811 if (ret == 0) {
1812 /* Range is not mapped */
1813 path = ext4_find_extent(inode, cur, path, 0);
1814 if (IS_ERR(path))
1815 goto out;
1816 memset(&newex, 0, sizeof(newex));
1817 newex.ee_block = cpu_to_le32(cur);
1818 ext4_ext_store_pblock(
1819 &newex, start_pblk + cur - start);
1820 newex.ee_len = cpu_to_le16(map.m_len);
1821 if (ext4_ext_is_unwritten(ex))
1822 ext4_ext_mark_unwritten(&newex);
1823 down_write(&EXT4_I(inode)->i_data_sem);
1824 path = ext4_ext_insert_extent(NULL, inode,
1825 path, &newex, 0);
1826 up_write((&EXT4_I(inode)->i_data_sem));
1827 if (IS_ERR(path))
1828 goto out;
1829 goto next;
1830 }
1831
1832 if (start_pblk + cur - start != map.m_pblk) {
1833 /*
1834 * Logical to physical mapping changed. This can happen
1835 * if this range was removed and then reallocated to
1836 * map to new physical blocks during a fast commit.
1837 */
1838 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1839 ext4_ext_is_unwritten(ex),
1840 start_pblk + cur - start);
1841 if (ret)
1842 goto out;
1843 /*
1844 * Mark the old blocks as free since they aren't used
1845 * anymore. We maintain an array of all the modified
1846 * inodes. In case these blocks are still used at either
1847 * a different logical range in the same inode or in
1848 * some different inode, we will mark them as allocated
1849 * at the end of the FC replay using our array of
1850 * modified inodes.
1851 */
1852 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1853 goto next;
1854 }
1855
1856 /* Range is mapped and needs a state change */
1857 ext4_debug("Converting from %ld to %d %lld",
1858 map.m_flags & EXT4_MAP_UNWRITTEN,
1859 ext4_ext_is_unwritten(ex), map.m_pblk);
1860 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1861 ext4_ext_is_unwritten(ex), map.m_pblk);
1862 if (ret)
1863 goto out;
1864 /*
1865 * We may have split the extent tree while toggling the state.
1866 * Try to shrink the extent tree now.
1867 */
1868 ext4_ext_replay_shrink_inode(inode, start + len);
1869 next:
1870 cur += map.m_len;
1871 remaining -= map.m_len;
1872 }
1873 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1874 sb->s_blocksize_bits);
1875 out:
1876 ext4_free_ext_path(path);
1877 iput(inode);
1878 return 0;
1879 }
1880
1881 /* Replay DEL_RANGE tag */
1882 static int
ext4_fc_replay_del_range(struct super_block * sb,struct ext4_fc_tl_mem * tl,u8 * val)1883 ext4_fc_replay_del_range(struct super_block *sb,
1884 struct ext4_fc_tl_mem *tl, u8 *val)
1885 {
1886 struct inode *inode;
1887 struct ext4_fc_del_range lrange;
1888 struct ext4_map_blocks map;
1889 ext4_lblk_t cur, remaining;
1890 int ret;
1891
1892 memcpy(&lrange, val, sizeof(lrange));
1893 cur = le32_to_cpu(lrange.fc_lblk);
1894 remaining = le32_to_cpu(lrange.fc_len);
1895
1896 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1897 le32_to_cpu(lrange.fc_ino), cur, remaining);
1898
1899 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1900 if (IS_ERR(inode)) {
1901 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1902 return 0;
1903 }
1904
1905 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1906 if (ret)
1907 goto out;
1908
1909 ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1910 inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1911 le32_to_cpu(lrange.fc_len));
1912 while (remaining > 0) {
1913 map.m_lblk = cur;
1914 map.m_len = remaining;
1915
1916 ret = ext4_map_blocks(NULL, inode, &map, 0);
1917 if (ret < 0)
1918 goto out;
1919 if (ret > 0) {
1920 remaining -= ret;
1921 cur += ret;
1922 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1923 } else {
1924 remaining -= map.m_len;
1925 cur += map.m_len;
1926 }
1927 }
1928
1929 down_write(&EXT4_I(inode)->i_data_sem);
1930 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1931 le32_to_cpu(lrange.fc_lblk) +
1932 le32_to_cpu(lrange.fc_len) - 1);
1933 up_write(&EXT4_I(inode)->i_data_sem);
1934 if (ret)
1935 goto out;
1936 ext4_ext_replay_shrink_inode(inode,
1937 i_size_read(inode) >> sb->s_blocksize_bits);
1938 ext4_mark_inode_dirty(NULL, inode);
1939 out:
1940 iput(inode);
1941 return 0;
1942 }
1943
ext4_fc_set_bitmaps_and_counters(struct super_block * sb)1944 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1945 {
1946 struct ext4_fc_replay_state *state;
1947 struct inode *inode;
1948 struct ext4_ext_path *path = NULL;
1949 struct ext4_map_blocks map;
1950 int i, ret, j;
1951 ext4_lblk_t cur, end;
1952
1953 state = &EXT4_SB(sb)->s_fc_replay_state;
1954 for (i = 0; i < state->fc_modified_inodes_used; i++) {
1955 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1956 EXT4_IGET_NORMAL);
1957 if (IS_ERR(inode)) {
1958 ext4_debug("Inode %d not found.",
1959 state->fc_modified_inodes[i]);
1960 continue;
1961 }
1962 cur = 0;
1963 end = EXT_MAX_BLOCKS;
1964 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1965 iput(inode);
1966 continue;
1967 }
1968 while (cur < end) {
1969 map.m_lblk = cur;
1970 map.m_len = end - cur;
1971
1972 ret = ext4_map_blocks(NULL, inode, &map, 0);
1973 if (ret < 0)
1974 break;
1975
1976 if (ret > 0) {
1977 path = ext4_find_extent(inode, map.m_lblk, path, 0);
1978 if (!IS_ERR(path)) {
1979 for (j = 0; j < path->p_depth; j++)
1980 ext4_mb_mark_bb(inode->i_sb,
1981 path[j].p_block, 1, true);
1982 } else {
1983 path = NULL;
1984 }
1985 cur += ret;
1986 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1987 map.m_len, true);
1988 } else {
1989 cur = cur + (map.m_len ? map.m_len : 1);
1990 }
1991 }
1992 iput(inode);
1993 }
1994
1995 ext4_free_ext_path(path);
1996 }
1997
1998 /*
1999 * Check if block is in excluded regions for block allocation. The simple
2000 * allocator that runs during replay phase is calls this function to see
2001 * if it is okay to use a block.
2002 */
ext4_fc_replay_check_excluded(struct super_block * sb,ext4_fsblk_t blk)2003 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
2004 {
2005 int i;
2006 struct ext4_fc_replay_state *state;
2007
2008 state = &EXT4_SB(sb)->s_fc_replay_state;
2009 for (i = 0; i < state->fc_regions_valid; i++) {
2010 if (state->fc_regions[i].ino == 0 ||
2011 state->fc_regions[i].len == 0)
2012 continue;
2013 if (in_range(blk, state->fc_regions[i].pblk,
2014 state->fc_regions[i].len))
2015 return true;
2016 }
2017 return false;
2018 }
2019
2020 /* Cleanup function called after replay */
ext4_fc_replay_cleanup(struct super_block * sb)2021 void ext4_fc_replay_cleanup(struct super_block *sb)
2022 {
2023 struct ext4_sb_info *sbi = EXT4_SB(sb);
2024
2025 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
2026 kfree(sbi->s_fc_replay_state.fc_regions);
2027 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
2028 }
2029
ext4_fc_value_len_isvalid(struct ext4_sb_info * sbi,int tag,int len)2030 static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
2031 int tag, int len)
2032 {
2033 switch (tag) {
2034 case EXT4_FC_TAG_ADD_RANGE:
2035 return len == sizeof(struct ext4_fc_add_range);
2036 case EXT4_FC_TAG_DEL_RANGE:
2037 return len == sizeof(struct ext4_fc_del_range);
2038 case EXT4_FC_TAG_CREAT:
2039 case EXT4_FC_TAG_LINK:
2040 case EXT4_FC_TAG_UNLINK:
2041 len -= sizeof(struct ext4_fc_dentry_info);
2042 return len >= 1 && len <= EXT4_NAME_LEN;
2043 case EXT4_FC_TAG_INODE:
2044 len -= sizeof(struct ext4_fc_inode);
2045 return len >= EXT4_GOOD_OLD_INODE_SIZE &&
2046 len <= sbi->s_inode_size;
2047 case EXT4_FC_TAG_PAD:
2048 return true; /* padding can have any length */
2049 case EXT4_FC_TAG_TAIL:
2050 return len >= sizeof(struct ext4_fc_tail);
2051 case EXT4_FC_TAG_HEAD:
2052 return len == sizeof(struct ext4_fc_head);
2053 }
2054 return false;
2055 }
2056
2057 /*
2058 * Recovery Scan phase handler
2059 *
2060 * This function is called during the scan phase and is responsible
2061 * for doing following things:
2062 * - Make sure the fast commit area has valid tags for replay
2063 * - Count number of tags that need to be replayed by the replay handler
2064 * - Verify CRC
2065 * - Create a list of excluded blocks for allocation during replay phase
2066 *
2067 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
2068 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
2069 * to indicate that scan has finished and JBD2 can now start replay phase.
2070 * It returns a negative error to indicate that there was an error. At the end
2071 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
2072 * to indicate the number of tags that need to replayed during the replay phase.
2073 */
ext4_fc_replay_scan(journal_t * journal,struct buffer_head * bh,int off,tid_t expected_tid)2074 static int ext4_fc_replay_scan(journal_t *journal,
2075 struct buffer_head *bh, int off,
2076 tid_t expected_tid)
2077 {
2078 struct super_block *sb = journal->j_private;
2079 struct ext4_sb_info *sbi = EXT4_SB(sb);
2080 struct ext4_fc_replay_state *state;
2081 int ret = JBD2_FC_REPLAY_CONTINUE;
2082 struct ext4_fc_add_range ext;
2083 struct ext4_fc_tl_mem tl;
2084 struct ext4_fc_tail tail;
2085 __u8 *start, *end, *cur, *val;
2086 struct ext4_fc_head head;
2087 struct ext4_extent *ex;
2088
2089 state = &sbi->s_fc_replay_state;
2090
2091 start = (u8 *)bh->b_data;
2092 end = start + journal->j_blocksize;
2093
2094 if (state->fc_replay_expected_off == 0) {
2095 state->fc_cur_tag = 0;
2096 state->fc_replay_num_tags = 0;
2097 state->fc_crc = 0;
2098 state->fc_regions = NULL;
2099 state->fc_regions_valid = state->fc_regions_used =
2100 state->fc_regions_size = 0;
2101 /* Check if we can stop early */
2102 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2103 != EXT4_FC_TAG_HEAD)
2104 return 0;
2105 }
2106
2107 if (off != state->fc_replay_expected_off) {
2108 ret = -EFSCORRUPTED;
2109 goto out_err;
2110 }
2111
2112 state->fc_replay_expected_off++;
2113 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2114 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2115 ext4_fc_get_tl(&tl, cur);
2116 val = cur + EXT4_FC_TAG_BASE_LEN;
2117 if (tl.fc_len > end - val ||
2118 !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
2119 ret = state->fc_replay_num_tags ?
2120 JBD2_FC_REPLAY_STOP : -ECANCELED;
2121 goto out_err;
2122 }
2123 ext4_debug("Scan phase, tag:%s, blk %lld\n",
2124 tag2str(tl.fc_tag), bh->b_blocknr);
2125 switch (tl.fc_tag) {
2126 case EXT4_FC_TAG_ADD_RANGE:
2127 memcpy(&ext, val, sizeof(ext));
2128 ex = (struct ext4_extent *)&ext.fc_ex;
2129 ret = ext4_fc_record_regions(sb,
2130 le32_to_cpu(ext.fc_ino),
2131 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2132 ext4_ext_get_actual_len(ex), 0);
2133 if (ret < 0)
2134 break;
2135 ret = JBD2_FC_REPLAY_CONTINUE;
2136 fallthrough;
2137 case EXT4_FC_TAG_DEL_RANGE:
2138 case EXT4_FC_TAG_LINK:
2139 case EXT4_FC_TAG_UNLINK:
2140 case EXT4_FC_TAG_CREAT:
2141 case EXT4_FC_TAG_INODE:
2142 case EXT4_FC_TAG_PAD:
2143 state->fc_cur_tag++;
2144 state->fc_crc = ext4_chksum(state->fc_crc, cur,
2145 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2146 break;
2147 case EXT4_FC_TAG_TAIL:
2148 state->fc_cur_tag++;
2149 memcpy(&tail, val, sizeof(tail));
2150 state->fc_crc = ext4_chksum(state->fc_crc, cur,
2151 EXT4_FC_TAG_BASE_LEN +
2152 offsetof(struct ext4_fc_tail,
2153 fc_crc));
2154 if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2155 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2156 state->fc_replay_num_tags = state->fc_cur_tag;
2157 state->fc_regions_valid =
2158 state->fc_regions_used;
2159 } else {
2160 ret = state->fc_replay_num_tags ?
2161 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2162 }
2163 state->fc_crc = 0;
2164 break;
2165 case EXT4_FC_TAG_HEAD:
2166 memcpy(&head, val, sizeof(head));
2167 if (le32_to_cpu(head.fc_features) &
2168 ~EXT4_FC_SUPPORTED_FEATURES) {
2169 ret = -EOPNOTSUPP;
2170 break;
2171 }
2172 if (le32_to_cpu(head.fc_tid) != expected_tid) {
2173 ret = JBD2_FC_REPLAY_STOP;
2174 break;
2175 }
2176 state->fc_cur_tag++;
2177 state->fc_crc = ext4_chksum(state->fc_crc, cur,
2178 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2179 break;
2180 default:
2181 ret = state->fc_replay_num_tags ?
2182 JBD2_FC_REPLAY_STOP : -ECANCELED;
2183 }
2184 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2185 break;
2186 }
2187
2188 out_err:
2189 trace_ext4_fc_replay_scan(sb, ret, off);
2190 return ret;
2191 }
2192
2193 /*
2194 * Main recovery path entry point.
2195 * The meaning of return codes is similar as above.
2196 */
ext4_fc_replay(journal_t * journal,struct buffer_head * bh,enum passtype pass,int off,tid_t expected_tid)2197 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2198 enum passtype pass, int off, tid_t expected_tid)
2199 {
2200 struct super_block *sb = journal->j_private;
2201 struct ext4_sb_info *sbi = EXT4_SB(sb);
2202 struct ext4_fc_tl_mem tl;
2203 __u8 *start, *end, *cur, *val;
2204 int ret = JBD2_FC_REPLAY_CONTINUE;
2205 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2206 struct ext4_fc_tail tail;
2207
2208 if (pass == PASS_SCAN) {
2209 state->fc_current_pass = PASS_SCAN;
2210 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2211 }
2212
2213 if (state->fc_current_pass != pass) {
2214 state->fc_current_pass = pass;
2215 sbi->s_mount_state |= EXT4_FC_REPLAY;
2216 }
2217 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2218 ext4_debug("Replay stops\n");
2219 ext4_fc_set_bitmaps_and_counters(sb);
2220 return 0;
2221 }
2222
2223 #ifdef CONFIG_EXT4_DEBUG
2224 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2225 pr_warn("Dropping fc block %d because max_replay set\n", off);
2226 return JBD2_FC_REPLAY_STOP;
2227 }
2228 #endif
2229
2230 start = (u8 *)bh->b_data;
2231 end = start + journal->j_blocksize;
2232
2233 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2234 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2235 ext4_fc_get_tl(&tl, cur);
2236 val = cur + EXT4_FC_TAG_BASE_LEN;
2237
2238 if (state->fc_replay_num_tags == 0) {
2239 ret = JBD2_FC_REPLAY_STOP;
2240 ext4_fc_set_bitmaps_and_counters(sb);
2241 break;
2242 }
2243
2244 ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
2245 state->fc_replay_num_tags--;
2246 switch (tl.fc_tag) {
2247 case EXT4_FC_TAG_LINK:
2248 ret = ext4_fc_replay_link(sb, &tl, val);
2249 break;
2250 case EXT4_FC_TAG_UNLINK:
2251 ret = ext4_fc_replay_unlink(sb, &tl, val);
2252 break;
2253 case EXT4_FC_TAG_ADD_RANGE:
2254 ret = ext4_fc_replay_add_range(sb, &tl, val);
2255 break;
2256 case EXT4_FC_TAG_CREAT:
2257 ret = ext4_fc_replay_create(sb, &tl, val);
2258 break;
2259 case EXT4_FC_TAG_DEL_RANGE:
2260 ret = ext4_fc_replay_del_range(sb, &tl, val);
2261 break;
2262 case EXT4_FC_TAG_INODE:
2263 ret = ext4_fc_replay_inode(sb, &tl, val);
2264 break;
2265 case EXT4_FC_TAG_PAD:
2266 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2267 tl.fc_len, 0);
2268 break;
2269 case EXT4_FC_TAG_TAIL:
2270 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
2271 0, tl.fc_len, 0);
2272 memcpy(&tail, val, sizeof(tail));
2273 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2274 break;
2275 case EXT4_FC_TAG_HEAD:
2276 break;
2277 default:
2278 trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
2279 ret = -ECANCELED;
2280 break;
2281 }
2282 if (ret < 0)
2283 break;
2284 ret = JBD2_FC_REPLAY_CONTINUE;
2285 }
2286 return ret;
2287 }
2288
/*
 * Hook the fast commit callbacks into the journal.
 *
 * The replay callback is installed unconditionally: the journal may still
 * contain fast commit blocks that need to be replayed even if fast commit
 * has now been turned off. The cleanup callback is only installed when the
 * JOURNAL_FAST_COMMIT mount option is set.
 */
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	journal->j_fc_replay_callback = ext4_fc_replay;

	if (test_opt2(sb, JOURNAL_FAST_COMMIT))
		journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
2301
2302 static const char * const fc_ineligible_reasons[] = {
2303 [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
2304 [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
2305 [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
2306 [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
2307 [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
2308 [EXT4_FC_REASON_RESIZE] = "Resize",
2309 [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
2310 [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
2311 [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
2312 [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
2313 [EXT4_FC_REASON_MIGRATE] = "Inode format migration",
2314 [EXT4_FC_REASON_VERITY] = "fs-verity enable",
2315 [EXT4_FC_REASON_MOVE_EXT] = "Move extents",
2316 };
2317
ext4_fc_info_show(struct seq_file * seq,void * v)2318 int ext4_fc_info_show(struct seq_file *seq, void *v)
2319 {
2320 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2321 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2322 int i;
2323
2324 if (v != SEQ_START_TOKEN)
2325 return 0;
2326
2327 seq_printf(seq,
2328 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2329 stats->fc_num_commits, stats->fc_ineligible_commits,
2330 stats->fc_numblks,
2331 div_u64(stats->s_fc_avg_commit_time, 1000));
2332 seq_puts(seq, "Ineligible reasons:\n");
2333 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2334 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2335 stats->fc_ineligible_reason_count[i]);
2336
2337 return 0;
2338 }
2339
ext4_fc_init_dentry_cache(void)2340 int __init ext4_fc_init_dentry_cache(void)
2341 {
2342 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2343 SLAB_RECLAIM_ACCOUNT);
2344
2345 if (ext4_fc_dentry_cachep == NULL)
2346 return -ENOMEM;
2347
2348 return 0;
2349 }
2350
ext4_fc_destroy_dentry_cache(void)2351 void ext4_fc_destroy_dentry_cache(void)
2352 {
2353 kmem_cache_destroy(ext4_fc_dentry_cachep);
2354 }
2355