1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * fs/ext4/fast_commit.c 5 * 6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> 7 * 8 * Ext4 fast commits routines. 9 */ 10 #include "ext4.h" 11 #include "ext4_jbd2.h" 12 #include "ext4_extents.h" 13 #include "mballoc.h" 14 15 #include <linux/lockdep.h> 16 #include <linux/wait_bit.h> 17 /* 18 * Ext4 Fast Commits 19 * ----------------- 20 * 21 * Ext4 fast commits implement fine grained journalling for Ext4. 22 * 23 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See 24 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by 25 * TLV during the recovery phase. For the scenarios for which we currently 26 * don't have replay code, fast commit falls back to full commits. 27 * Fast commits record delta in one of the following three categories. 28 * 29 * (A) Directory entry updates: 30 * 31 * - EXT4_FC_TAG_UNLINK - records directory entry unlink 32 * - EXT4_FC_TAG_LINK - records directory entry link 33 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation 34 * 35 * (B) File specific data range updates: 36 * 37 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode 38 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode 39 * 40 * (C) Inode metadata (mtime / ctime etc): 41 * 42 * - EXT4_FC_TAG_INODE - record the inode that should be replayed 43 * during recovery. Note that iblocks field is 44 * not replayed and instead derived during 45 * replay. 46 * Commit Operation 47 * ---------------- 48 * With fast commits, we maintain all the directory entry operations in the 49 * order in which they are issued in an in-memory queue. This queue is flushed 50 * to disk during the commit operation. We also maintain a list of inodes 51 * that need to be committed during a fast commit in another in memory queue of 52 * inodes. 
 * During the commit operation, we commit in the following order:
 *
 * [1] Prepare all the inodes to write out their data by setting
 *     "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
 *     deleted while it is being flushed.
 * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
 *     state.
 * [3] Lock the journal by calling jbd2_journal_lock_updates(). This ensures
 *     that all the existing handles finish and no new handles can start.
 * [4] Mark all the fast commit eligible inodes as undergoing fast commit by
 *     setting "EXT4_STATE_FC_COMMITTING" state, and snapshot the inode state
 *     needed for log writing.
 * [5] Unlock the journal by calling jbd2_journal_unlock_updates(). This allows
 *     starting of new handles. Updates to inodes being fast committed are
 *     tracked for requeue rather than blocking.
 * [6] Commit all the directory entry updates in the fast commit space.
 * [7] Commit all the changed inodes in the fast commit space.
 * [8] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [9] Clear "EXT4_STATE_FC_COMMITTING" and wake up waiters in
 *     ext4_fc_cleanup().
 *
 * All the inode updates must be enclosed within jbd2_journal_start()
 * and jbd2_journal_stop() similar to JBD2 journaling.
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(); this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied.
 * Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent.
Let's say 135 * while in replay, we crash at (z). During the second replay, file A (which was 136 * actually created as a result of "mv B A" operation) would get deleted. Thus, 137 * file named A would be absent when we try to read A. So, this sequence of 138 * operations is not idempotent. However, as mentioned above, instead of storing 139 * the procedure fast commits store the outcome of each procedure. Thus the fast 140 * commit log for above procedure would be as follows: 141 * 142 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to 143 * inode 11 before the replay) 144 * 145 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] 146 * (w) (x) (y) (z) 147 * 148 * If we crash at (z), we will have file A linked to inode 11. During the second 149 * replay, we will remove file A (inode 11). But we will create it back and make 150 * it point to inode 11. We won't find B, so we'll just skip that step. At this 151 * point, the refcount for inode 11 is not reliable, but that gets fixed by the 152 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled 153 * similarly. Thus, by converting a non-idempotent procedure into a series of 154 * idempotent outcomes, fast commits ensured idempotence during the replay. 155 * 156 * Locking 157 * ------- 158 * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit 159 * dentry queue. ei->i_fc_lock protects the fast commit related info in a given 160 * inode. Most of the code avoids acquiring both the locks, but if one must do 161 * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. 162 * 163 * TODOs 164 * ----- 165 * 166 * 0) Fast commit replay path hardening: Fast commit replay code should use 167 * journal handles to make sure all the updates it does during the replay 168 * path are atomic. 
With that if we crash during fast commit replay, after 169 * trying to do recovery again, we will find a file system where fast commit 170 * area is invalid (because new full commit would be found). In order to deal 171 * with that, fast commit replay code should ensure that the "FC_REPLAY" 172 * superblock state is persisted before starting the replay, so that after 173 * the crash, fast commit recovery code can look at that flag and perform 174 * fast commit recovery even if that area is invalidated by later full 175 * commits. 176 * 177 * 1) Handle more ineligible cases. 178 * 179 * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent 180 * status tree. This would get rid of the need to call ext4_fc_track_inode() 181 * before acquiring i_data_sem. To do that we would need to ensure that 182 * modified extents from the extent status tree are not evicted from memory. 183 */ 184 185 #include <trace/events/ext4.h> 186 static struct kmem_cache *ext4_fc_dentry_cachep; 187 static struct kmem_cache *ext4_fc_range_cachep; 188 189 /* 190 * Avoid spending unbounded time/memory snapshotting highly fragmented files 191 * under jbd2_journal_lock_updates(). If we exceed this limit, fall back to 192 * full commit. 
193 */ 194 #define EXT4_FC_SNAPSHOT_MAX_INODES 1024 195 #define EXT4_FC_SNAPSHOT_MAX_RANGES 2048 196 197 static inline void ext4_fc_set_snap_err(int *snap_err, int err) 198 { 199 if (snap_err && *snap_err == EXT4_FC_SNAP_ERR_NONE) 200 *snap_err = err; 201 } 202 203 static void ext4_end_buffer_io_sync(struct bio *bio) 204 { 205 struct buffer_head *bh; 206 bool uptodate = bio_endio_bh(bio, &bh); 207 208 BUFFER_TRACE(bh, ""); 209 if (uptodate) { 210 ext4_debug("%s: Block %lld up-to-date", 211 __func__, bh->b_blocknr); 212 set_buffer_uptodate(bh); 213 } else { 214 ext4_debug("%s: Block %lld not up-to-date", 215 __func__, bh->b_blocknr); 216 clear_buffer_uptodate(bh); 217 } 218 219 unlock_buffer(bh); 220 } 221 222 static void ext4_fc_free_inode_snap(struct inode *inode); 223 224 static inline void ext4_fc_reset_inode(struct inode *inode) 225 { 226 struct ext4_inode_info *ei = EXT4_I(inode); 227 228 ei->i_fc_lblk_start = 0; 229 ei->i_fc_lblk_len = 0; 230 } 231 232 void ext4_fc_init_inode(struct inode *inode) 233 { 234 struct ext4_inode_info *ei = EXT4_I(inode); 235 236 ext4_fc_reset_inode(inode); 237 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); 238 ext4_clear_inode_state(inode, EXT4_STATE_FC_REQUEUE); 239 INIT_LIST_HEAD(&ei->i_fc_list); 240 INIT_LIST_HEAD(&ei->i_fc_dilist); 241 ei->i_fc_snap = NULL; 242 } 243 244 static bool ext4_fc_disabled(struct super_block *sb) 245 { 246 return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 247 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); 248 } 249 250 static bool ext4_fc_eligible(struct super_block *sb) 251 { 252 return !ext4_fc_disabled(sb) && 253 !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)); 254 } 255 256 /* 257 * Wait for an inode fast-commit state bit to clear while dropping the 258 * fast-commit lock around schedule(). 
259 */ 260 static void ext4_fc_wait_inode_state(struct inode *inode, int bit, 261 int *alloc_ctx) 262 { 263 wait_queue_head_t *wq; 264 unsigned long *wait_word = ext4_inode_state_wait_word(inode); 265 int wait_bit = ext4_inode_state_wait_bit(bit); 266 267 while (ext4_test_inode_state(inode, bit)) { 268 DEFINE_WAIT_BIT(wait, wait_word, wait_bit); 269 270 wq = bit_waitqueue(wait_word, wait_bit); 271 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 272 if (ext4_test_inode_state(inode, bit)) { 273 ext4_fc_unlock(inode->i_sb, *alloc_ctx); 274 schedule(); 275 *alloc_ctx = ext4_fc_lock(inode->i_sb); 276 } 277 finish_wait(wq, &wait.wq_entry); 278 } 279 } 280 281 static inline void ext4_fc_wake_inode_state(struct inode *inode, int bit) 282 { 283 wake_up_bit(ext4_inode_state_wait_word(inode), 284 ext4_inode_state_wait_bit(bit)); 285 } 286 287 static void ext4_fc_snap_stats_update_max(atomic64_t *stat, u64 value) 288 { 289 u64 old = atomic64_read(stat); 290 291 while (value > old) { 292 u64 prev = atomic64_cmpxchg(stat, old, value); 293 294 if (prev == old) 295 break; 296 old = prev; 297 } 298 } 299 300 /* 301 * Remove inode from fast commit list. If the inode is being committed 302 * we wait until inode commit is done. 303 */ 304 void ext4_fc_del(struct inode *inode) 305 { 306 struct ext4_inode_info *ei = EXT4_I(inode); 307 struct ext4_fc_dentry_update *fc_dentry; 308 int alloc_ctx; 309 310 if (ext4_fc_disabled(inode->i_sb)) 311 return; 312 313 alloc_ctx = ext4_fc_lock(inode->i_sb); 314 if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { 315 ext4_fc_free_inode_snap(inode); 316 ext4_fc_unlock(inode->i_sb, alloc_ctx); 317 return; 318 } 319 320 /* 321 * Wait for ongoing fast commit to finish. We cannot remove the inode 322 * from fast commit lists while it is being committed. If we wake from 323 * FC_FLUSHING_DATA, re-check FC_COMMITTING before deleting because the 324 * commit thread sets FC_COMMITTING only after clearing FLUSHING_DATA. 
325 */ 326 for (;;) { 327 ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_COMMITTING, 328 &alloc_ctx); 329 330 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) 331 break; 332 333 ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA, 334 &alloc_ctx); 335 } 336 337 ext4_fc_free_inode_snap(inode); 338 list_del_init(&ei->i_fc_list); 339 340 /* 341 * Since this inode is getting removed, let's also remove all FC dentry 342 * create references, since it is not needed to log it anyways. 343 */ 344 if (list_empty(&ei->i_fc_dilist)) { 345 ext4_fc_unlock(inode->i_sb, alloc_ctx); 346 return; 347 } 348 349 fc_dentry = list_first_entry(&ei->i_fc_dilist, 350 struct ext4_fc_dentry_update, 351 fcd_dilist); 352 WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); 353 list_del_init(&fc_dentry->fcd_list); 354 list_del_init(&fc_dentry->fcd_dilist); 355 356 WARN_ON(!list_empty(&ei->i_fc_dilist)); 357 ext4_fc_unlock(inode->i_sb, alloc_ctx); 358 359 release_dentry_name_snapshot(&fc_dentry->fcd_name); 360 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 361 } 362 363 /* 364 * Mark file system as fast commit ineligible, and record latest 365 * ineligible transaction tid. This means until the recorded 366 * transaction, commit operation would result in a full jbd2 commit. 
367 */ 368 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) 369 { 370 struct ext4_sb_info *sbi = EXT4_SB(sb); 371 tid_t tid; 372 bool has_transaction = true; 373 bool is_ineligible; 374 int alloc_ctx; 375 376 if (ext4_fc_disabled(sb)) 377 return; 378 379 if (!IS_ERR_OR_NULL(handle)) 380 tid = handle->h_transaction->t_tid; 381 else { 382 read_lock(&sbi->s_journal->j_state_lock); 383 if (sbi->s_journal->j_running_transaction) 384 tid = sbi->s_journal->j_running_transaction->t_tid; 385 else 386 has_transaction = false; 387 read_unlock(&sbi->s_journal->j_state_lock); 388 } 389 alloc_ctx = ext4_fc_lock(sb); 390 is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 391 if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) 392 sbi->s_fc_ineligible_tid = tid; 393 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 394 ext4_fc_unlock(sb, alloc_ctx); 395 WARN_ON(reason >= EXT4_FC_REASON_MAX); 396 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; 397 } 398 399 /* 400 * Generic fast commit tracking function. If this is the first time this we are 401 * called after a full commit, we initialize fast commit fields and then call 402 * __fc_track_fn() with update = 0. If we have already been called after a full 403 * commit, we pass update = 1. Based on that, the track function can determine 404 * if it needs to track a field for the first time or if it needs to just 405 * update the previously tracked value. 406 * 407 * If enqueue is set, this function enqueues the inode in fast commit list. 
408 */ 409 static int ext4_fc_track_template( 410 handle_t *handle, struct inode *inode, 411 int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool), 412 void *args, int enqueue) 413 { 414 bool update = false; 415 struct ext4_inode_info *ei = EXT4_I(inode); 416 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 417 tid_t tid = 0; 418 int alloc_ctx; 419 int ret; 420 421 tid = handle->h_transaction->t_tid; 422 spin_lock(&ei->i_fc_lock); 423 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 424 ext4_set_inode_state(inode, EXT4_STATE_FC_REQUEUE); 425 if (tid == ei->i_sync_tid) { 426 update = true; 427 } else { 428 ext4_fc_reset_inode(inode); 429 ei->i_sync_tid = tid; 430 } 431 ret = __fc_track_fn(handle, inode, args, update); 432 spin_unlock(&ei->i_fc_lock); 433 if (!enqueue) 434 return ret; 435 436 alloc_ctx = ext4_fc_lock(inode->i_sb); 437 if (list_empty(&EXT4_I(inode)->i_fc_list)) 438 list_add_tail(&EXT4_I(inode)->i_fc_list, 439 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 440 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 441 &sbi->s_fc_q[FC_Q_STAGING] : 442 &sbi->s_fc_q[FC_Q_MAIN]); 443 ext4_fc_unlock(inode->i_sb, alloc_ctx); 444 445 return ret; 446 } 447 448 struct __track_dentry_update_args { 449 struct dentry *dentry; 450 int op; 451 }; 452 453 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. 
*/ 454 static int __track_dentry_update(handle_t *handle, struct inode *inode, 455 void *arg, bool update) 456 { 457 struct ext4_fc_dentry_update *node; 458 struct ext4_inode_info *ei = EXT4_I(inode); 459 struct __track_dentry_update_args *dentry_update = 460 (struct __track_dentry_update_args *)arg; 461 struct dentry *dentry = dentry_update->dentry; 462 struct inode *dir = dentry->d_parent->d_inode; 463 struct super_block *sb = inode->i_sb; 464 struct ext4_sb_info *sbi = EXT4_SB(sb); 465 int alloc_ctx; 466 467 spin_unlock(&ei->i_fc_lock); 468 469 if (IS_ENCRYPTED(dir)) { 470 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, 471 handle); 472 spin_lock(&ei->i_fc_lock); 473 return -EOPNOTSUPP; 474 } 475 476 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); 477 if (!node) { 478 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); 479 spin_lock(&ei->i_fc_lock); 480 return -ENOMEM; 481 } 482 483 node->fcd_op = dentry_update->op; 484 node->fcd_parent = dir->i_ino; 485 node->fcd_ino = inode->i_ino; 486 take_dentry_name_snapshot(&node->fcd_name, dentry); 487 INIT_LIST_HEAD(&node->fcd_dilist); 488 INIT_LIST_HEAD(&node->fcd_list); 489 alloc_ctx = ext4_fc_lock(sb); 490 if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 491 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) 492 list_add_tail(&node->fcd_list, 493 &sbi->s_fc_dentry_q[FC_Q_STAGING]); 494 else 495 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); 496 497 /* 498 * This helps us keep a track of all fc_dentry updates which is part of 499 * this ext4 inode. So in case the inode is getting unlinked, before 500 * even we get a chance to fsync, we could remove all fc_dentry 501 * references while evicting the inode in ext4_fc_del(). 502 * Also with this, we don't need to loop over all the inodes in 503 * sbi->s_fc_q to get the corresponding inode in 504 * ext4_fc_commit_dentry_updates(). 
505 */ 506 if (dentry_update->op == EXT4_FC_TAG_CREAT) { 507 WARN_ON(!list_empty(&ei->i_fc_dilist)); 508 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); 509 } 510 ext4_fc_unlock(sb, alloc_ctx); 511 spin_lock(&ei->i_fc_lock); 512 513 return 0; 514 } 515 516 void __ext4_fc_track_unlink(handle_t *handle, 517 struct inode *inode, struct dentry *dentry) 518 { 519 struct __track_dentry_update_args args; 520 int ret; 521 522 args.dentry = dentry; 523 args.op = EXT4_FC_TAG_UNLINK; 524 525 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 526 (void *)&args, 0); 527 trace_ext4_fc_track_unlink(handle, inode, dentry, ret); 528 } 529 530 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) 531 { 532 struct inode *inode = d_inode(dentry); 533 534 if (ext4_fc_eligible(inode->i_sb)) 535 __ext4_fc_track_unlink(handle, inode, dentry); 536 } 537 538 void __ext4_fc_track_link(handle_t *handle, 539 struct inode *inode, struct dentry *dentry) 540 { 541 struct __track_dentry_update_args args; 542 int ret; 543 544 args.dentry = dentry; 545 args.op = EXT4_FC_TAG_LINK; 546 547 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 548 (void *)&args, 0); 549 trace_ext4_fc_track_link(handle, inode, dentry, ret); 550 } 551 552 void ext4_fc_track_link(handle_t *handle, struct inode *inode, 553 struct dentry *dentry) 554 { 555 if (ext4_fc_eligible(inode->i_sb)) 556 __ext4_fc_track_link(handle, inode, dentry); 557 } 558 559 void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 560 struct dentry *dentry) 561 { 562 struct __track_dentry_update_args args; 563 int ret; 564 565 args.dentry = dentry; 566 args.op = EXT4_FC_TAG_CREAT; 567 568 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 569 (void *)&args, 0); 570 trace_ext4_fc_track_create(handle, inode, dentry, ret); 571 } 572 573 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 574 { 575 struct inode *inode = d_inode(dentry); 576 577 if 
(ext4_fc_eligible(inode->i_sb)) 578 __ext4_fc_track_create(handle, inode, dentry); 579 } 580 581 /* __track_fn for inode tracking */ 582 static int __track_inode(handle_t *handle, struct inode *inode, void *arg, 583 bool update) 584 { 585 if (update) 586 return -EEXIST; 587 588 EXT4_I(inode)->i_fc_lblk_len = 0; 589 590 return 0; 591 } 592 593 void ext4_fc_track_inode(handle_t *handle, struct inode *inode) 594 { 595 int ret; 596 597 if (S_ISDIR(inode->i_mode)) 598 return; 599 600 if (ext4_should_journal_data(inode)) { 601 ext4_fc_mark_ineligible(inode->i_sb, 602 EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); 603 return; 604 } 605 606 if (!ext4_fc_eligible(inode->i_sb)) 607 return; 608 609 /* 610 * Fast commit snapshots inode state at commit time, so there's no need 611 * to wait for EXT4_STATE_FC_COMMITTING here. If the inode is already 612 * on the commit queue, ext4_fc_cleanup() will requeue it for the new 613 * transaction once the current commit finishes. 614 */ 615 616 /* 617 * From this point on, this inode will not be committed either 618 * by fast or full commit as long as the handle is open. 
619 */ 620 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); 621 trace_ext4_fc_track_inode(handle, inode, ret); 622 } 623 624 struct __track_range_args { 625 ext4_lblk_t start, end; 626 }; 627 628 /* __track_fn for tracking data updates */ 629 static int __track_range(handle_t *handle, struct inode *inode, void *arg, 630 bool update) 631 { 632 struct ext4_inode_info *ei = EXT4_I(inode); 633 ext4_lblk_t oldstart; 634 struct __track_range_args *__arg = 635 (struct __track_range_args *)arg; 636 637 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { 638 ext4_debug("Special inode %llu being modified\n", inode->i_ino); 639 return -ECANCELED; 640 } 641 642 oldstart = ei->i_fc_lblk_start; 643 644 if (update && ei->i_fc_lblk_len > 0) { 645 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); 646 ei->i_fc_lblk_len = 647 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - 648 ei->i_fc_lblk_start + 1; 649 } else { 650 ei->i_fc_lblk_start = __arg->start; 651 ei->i_fc_lblk_len = __arg->end - __arg->start + 1; 652 } 653 654 return 0; 655 } 656 657 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, 658 ext4_lblk_t end) 659 { 660 struct __track_range_args args; 661 int ret; 662 663 if (S_ISDIR(inode->i_mode)) 664 return; 665 666 if (!ext4_fc_eligible(inode->i_sb)) 667 return; 668 669 if (ext4_has_inline_data(inode)) { 670 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, 671 handle); 672 return; 673 } 674 675 args.start = start; 676 args.end = end; 677 678 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); 679 680 trace_ext4_fc_track_range(handle, inode, start, end, ret); 681 } 682 683 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) 684 { 685 blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS; 686 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; 687 688 /* Add REQ_FUA | REQ_PREFLUSH only its tail */ 689 if (test_opt(sb, BARRIER) && is_tail) 690 write_flags |= REQ_FUA | REQ_PREFLUSH; 
691 lock_buffer(bh); 692 set_buffer_dirty(bh); 693 set_buffer_uptodate(bh); 694 bh_submit(bh, REQ_OP_WRITE | write_flags, ext4_end_buffer_io_sync); 695 EXT4_SB(sb)->s_fc_bh = NULL; 696 } 697 698 /* Ext4 commit path routines */ 699 700 /* 701 * Allocate len bytes on a fast commit buffer. 702 * 703 * During the commit time this function is used to manage fast commit 704 * block space. We don't split a fast commit log onto different 705 * blocks. So this function makes sure that if there's not enough space 706 * on the current block, the remaining space in the current block is 707 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, 708 * new block is from jbd2 and CRC is updated to reflect the padding 709 * we added. 710 */ 711 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) 712 { 713 struct ext4_fc_tl tl; 714 struct ext4_sb_info *sbi = EXT4_SB(sb); 715 struct buffer_head *bh; 716 int bsize = sbi->s_journal->j_blocksize; 717 int ret, off = sbi->s_fc_bytes % bsize; 718 int remaining; 719 u8 *dst; 720 721 /* 722 * If 'len' is too long to fit in any block alongside a PAD tlv, then we 723 * cannot fulfill the request. 724 */ 725 if (len > bsize - EXT4_FC_TAG_BASE_LEN) 726 return NULL; 727 728 if (!sbi->s_fc_bh) { 729 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 730 if (ret) 731 return NULL; 732 sbi->s_fc_bh = bh; 733 } 734 dst = sbi->s_fc_bh->b_data + off; 735 736 /* 737 * Allocate the bytes in the current block if we can do so while still 738 * leaving enough space for a PAD tlv. 739 */ 740 remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; 741 if (len <= remaining) { 742 sbi->s_fc_bytes += len; 743 return dst; 744 } 745 746 /* 747 * Else, terminate the current block with a PAD tlv, then allocate a new 748 * block and allocate the bytes at the start of that new block. 
749 */ 750 751 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); 752 tl.fc_len = cpu_to_le16(remaining); 753 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 754 memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); 755 *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); 756 757 ext4_fc_submit_bh(sb, false); 758 759 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 760 if (ret) 761 return NULL; 762 sbi->s_fc_bh = bh; 763 sbi->s_fc_bytes += bsize - off + len; 764 return sbi->s_fc_bh->b_data; 765 } 766 767 /* 768 * Complete a fast commit by writing tail tag. 769 * 770 * Writing tail tag marks the end of a fast commit. In order to guarantee 771 * atomicity, after writing tail tag, even if there's space remaining 772 * in the block, next commit shouldn't use it. That's why tail tag 773 * has the length as that of the remaining space on the block. 774 */ 775 static int ext4_fc_write_tail(struct super_block *sb, u32 crc) 776 { 777 struct ext4_sb_info *sbi = EXT4_SB(sb); 778 struct ext4_fc_tl tl; 779 struct ext4_fc_tail tail; 780 int off, bsize = sbi->s_journal->j_blocksize; 781 u8 *dst; 782 783 /* 784 * ext4_fc_reserve_space takes care of allocating an extra block if 785 * there's no enough space on this block for accommodating this tail. 
786 */ 787 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); 788 if (!dst) 789 return -ENOSPC; 790 791 off = sbi->s_fc_bytes % bsize; 792 793 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); 794 tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); 795 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); 796 797 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 798 dst += EXT4_FC_TAG_BASE_LEN; 799 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); 800 memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); 801 dst += sizeof(tail.fc_tid); 802 crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, 803 dst - (u8 *)sbi->s_fc_bh->b_data); 804 tail.fc_crc = cpu_to_le32(crc); 805 memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); 806 dst += sizeof(tail.fc_crc); 807 memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ 808 809 ext4_fc_submit_bh(sb, true); 810 811 return 0; 812 } 813 814 /* 815 * Adds tag, length, value and updates CRC. Returns true if tlv was added. 816 * Returns false if there's not enough space. 817 */ 818 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 819 u32 *crc) 820 { 821 struct ext4_fc_tl tl; 822 u8 *dst; 823 824 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); 825 if (!dst) 826 return false; 827 828 tl.fc_tag = cpu_to_le16(tag); 829 tl.fc_len = cpu_to_le16(len); 830 831 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 832 memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); 833 834 return true; 835 } 836 837 /* Same as above, but adds dentry tlv. 
*/ 838 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, 839 struct ext4_fc_dentry_update *fc_dentry) 840 { 841 struct ext4_fc_dentry_info fcd; 842 struct ext4_fc_tl tl; 843 int dlen = fc_dentry->fcd_name.name.len; 844 u8 *dst = ext4_fc_reserve_space(sb, 845 EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); 846 847 if (!dst) 848 return false; 849 850 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); 851 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); 852 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); 853 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); 854 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 855 dst += EXT4_FC_TAG_BASE_LEN; 856 memcpy(dst, &fcd, sizeof(fcd)); 857 dst += sizeof(fcd); 858 memcpy(dst, fc_dentry->fcd_name.name.name, dlen); 859 860 return true; 861 } 862 863 struct ext4_fc_range { 864 struct list_head list; 865 u16 tag; 866 ext4_lblk_t lblk; 867 ext4_lblk_t len; 868 ext4_fsblk_t pblk; 869 bool unwritten; 870 }; 871 872 struct ext4_fc_inode_snap { 873 struct list_head data_list; 874 unsigned int inode_len; 875 u8 inode_buf[]; 876 }; 877 878 /* 879 * Writes inode in the fast commit space under TLV with tag @tag. 880 * Returns 0 on success, error on failure. 
881 */ 882 static int ext4_fc_write_inode(struct inode *inode, u32 *crc) 883 { 884 struct ext4_inode_info *ei = EXT4_I(inode); 885 struct ext4_fc_inode_snap *snap = ei->i_fc_snap; 886 struct ext4_fc_snap_stats *stats = 887 &EXT4_SB(inode->i_sb)->s_fc_snap_stats; 888 struct ext4_fc_inode fc_inode; 889 struct ext4_fc_tl tl; 890 u8 *dst; 891 u8 *src; 892 int inode_len; 893 int ret; 894 895 if (!snap) { 896 atomic64_inc(&stats->snap_fail_no_snap); 897 return -ECANCELED; 898 } 899 900 src = snap->inode_buf; 901 inode_len = snap->inode_len; 902 if (!src || inode_len == 0) { 903 atomic64_inc(&stats->snap_fail_no_snap); 904 return -ECANCELED; 905 } 906 907 fc_inode.fc_ino = cpu_to_le32(inode->i_ino); 908 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); 909 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); 910 911 ret = -ECANCELED; 912 dst = ext4_fc_reserve_space(inode->i_sb, 913 EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); 914 if (!dst) 915 goto err; 916 917 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 918 dst += EXT4_FC_TAG_BASE_LEN; 919 memcpy(dst, &fc_inode, sizeof(fc_inode)); 920 dst += sizeof(fc_inode); 921 memcpy(dst, src, inode_len); 922 ret = 0; 923 err: 924 return ret; 925 } 926 927 /* 928 * Writes updated data ranges for the inode in question. Updates CRC. 929 * Returns 0 on success, error otherwise. 
930 */ 931 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 932 { 933 struct ext4_inode_info *ei = EXT4_I(inode); 934 struct ext4_fc_inode_snap *snap = ei->i_fc_snap; 935 struct ext4_fc_snap_stats *stats = 936 &EXT4_SB(inode->i_sb)->s_fc_snap_stats; 937 struct ext4_fc_add_range fc_ext; 938 struct ext4_fc_del_range lrange; 939 struct ext4_extent *ex; 940 struct ext4_fc_range *range; 941 942 if (!snap) { 943 atomic64_inc(&stats->snap_fail_no_snap); 944 return -ECANCELED; 945 } 946 947 list_for_each_entry(range, &snap->data_list, list) { 948 if (range->tag == EXT4_FC_TAG_DEL_RANGE) { 949 lrange.fc_ino = cpu_to_le32(inode->i_ino); 950 lrange.fc_lblk = cpu_to_le32(range->lblk); 951 lrange.fc_len = cpu_to_le32(range->len); 952 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 953 sizeof(lrange), (u8 *)&lrange, crc)) 954 return -ENOSPC; 955 continue; 956 } 957 958 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 959 ex = (struct ext4_extent *)&fc_ext.fc_ex; 960 ex->ee_block = cpu_to_le32(range->lblk); 961 ex->ee_len = cpu_to_le16(range->len); 962 ext4_ext_store_pblock(ex, range->pblk); 963 if (range->unwritten) 964 ext4_ext_mark_unwritten(ex); 965 else 966 ext4_ext_mark_initialized(ex); 967 968 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 969 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 970 return -ENOSPC; 971 } 972 973 return 0; 974 } 975 976 static void ext4_fc_free_ranges(struct list_head *head) 977 { 978 struct ext4_fc_range *range, *range_n; 979 980 list_for_each_entry_safe(range, range_n, head, list) { 981 list_del(&range->list); 982 kmem_cache_free(ext4_fc_range_cachep, range); 983 } 984 } 985 986 static void ext4_fc_free_inode_snap(struct inode *inode) 987 { 988 struct ext4_inode_info *ei = EXT4_I(inode); 989 struct ext4_fc_inode_snap *snap = ei->i_fc_snap; 990 991 if (!snap) 992 return; 993 994 ext4_fc_free_ranges(&snap->data_list); 995 kfree(snap); 996 ei->i_fc_snap = NULL; 997 } 998 999 static int ext4_fc_snapshot_inode_data(struct inode 
*inode, 1000 struct list_head *ranges, 1001 unsigned int nr_ranges_total, 1002 unsigned int *nr_rangesp, 1003 int *snap_err) 1004 { 1005 struct ext4_inode_info *ei = EXT4_I(inode); 1006 struct ext4_fc_snap_stats *stats = 1007 &EXT4_SB(inode->i_sb)->s_fc_snap_stats; 1008 ext4_lblk_t start_lblk, end_lblk, cur_lblk; 1009 unsigned int nr_ranges = 0; 1010 1011 spin_lock(&ei->i_fc_lock); 1012 if (ei->i_fc_lblk_len == 0) { 1013 spin_unlock(&ei->i_fc_lock); 1014 if (nr_rangesp) 1015 *nr_rangesp = 0; 1016 return 0; 1017 } 1018 start_lblk = ei->i_fc_lblk_start; 1019 end_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 1020 ei->i_fc_lblk_len = 0; 1021 spin_unlock(&ei->i_fc_lock); 1022 1023 cur_lblk = start_lblk; 1024 ext4_debug("snapshot data ranges %u-%u for inode %llu\n", 1025 start_lblk, end_lblk, 1026 (unsigned long long)inode->i_ino); 1027 1028 while (cur_lblk <= end_lblk) { 1029 struct extent_status es; 1030 struct ext4_fc_range *range; 1031 ext4_lblk_t len; 1032 u64 remaining = (u64)end_lblk - cur_lblk + 1; 1033 1034 if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) { 1035 atomic64_inc(&stats->snap_fail_es_miss); 1036 ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS); 1037 return -EAGAIN; 1038 } 1039 1040 if (ext4_es_is_delayed(&es)) { 1041 atomic64_inc(&stats->snap_fail_es_delayed); 1042 ext4_fc_set_snap_err(snap_err, 1043 EXT4_FC_SNAP_ERR_ES_DELAYED); 1044 return -EAGAIN; 1045 } 1046 1047 len = es.es_len - (cur_lblk - es.es_lblk); 1048 if (len > remaining) 1049 len = remaining; 1050 if (len == 0) { 1051 cur_lblk++; 1052 continue; 1053 } 1054 1055 if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) { 1056 atomic64_inc(&stats->snap_fail_ranges_cap); 1057 ext4_fc_set_snap_err(snap_err, 1058 EXT4_FC_SNAP_ERR_RANGES_CAP); 1059 return -E2BIG; 1060 } 1061 1062 range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS); 1063 if (!range) { 1064 atomic64_inc(&stats->snap_fail_nomem); 1065 ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM); 
1066 return -ENOMEM; 1067 } 1068 nr_ranges++; 1069 1070 range->lblk = cur_lblk; 1071 range->len = len; 1072 range->pblk = 0; 1073 range->unwritten = false; 1074 1075 if (ext4_es_is_hole(&es)) { 1076 range->tag = EXT4_FC_TAG_DEL_RANGE; 1077 } else if (ext4_es_is_written(&es) || 1078 ext4_es_is_unwritten(&es)) { 1079 unsigned int max; 1080 1081 range->tag = EXT4_FC_TAG_ADD_RANGE; 1082 range->pblk = ext4_es_pblock(&es) + 1083 (cur_lblk - es.es_lblk); 1084 range->unwritten = ext4_es_is_unwritten(&es); 1085 1086 max = range->unwritten ? EXT_UNWRITTEN_MAX_LEN : 1087 EXT_INIT_MAX_LEN; 1088 if (range->len > max) 1089 range->len = max; 1090 } else { 1091 kmem_cache_free(ext4_fc_range_cachep, range); 1092 atomic64_inc(&stats->snap_fail_es_other); 1093 ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER); 1094 return -EAGAIN; 1095 } 1096 1097 INIT_LIST_HEAD(&range->list); 1098 list_add_tail(&range->list, ranges); 1099 1100 if ((u64)range->len > (u64)end_lblk - cur_lblk) 1101 break; 1102 1103 cur_lblk += range->len; 1104 } 1105 1106 if (nr_rangesp) 1107 *nr_rangesp = nr_ranges; 1108 return 0; 1109 } 1110 1111 static int ext4_fc_snapshot_inode(struct inode *inode, 1112 unsigned int nr_ranges_total, 1113 unsigned int *nr_rangesp, int *snap_err) 1114 { 1115 struct ext4_inode_info *ei = EXT4_I(inode); 1116 struct ext4_fc_snap_stats *stats = 1117 &EXT4_SB(inode->i_sb)->s_fc_snap_stats; 1118 struct ext4_fc_inode_snap *snap; 1119 int inode_len = EXT4_GOOD_OLD_INODE_SIZE; 1120 struct ext4_iloc iloc; 1121 LIST_HEAD(ranges); 1122 unsigned int nr_ranges = 0; 1123 int ret; 1124 int alloc_ctx; 1125 1126 ret = ext4_get_inode_loc_noio(inode, &iloc); 1127 if (ret) { 1128 atomic64_inc(&stats->snap_fail_inode_loc); 1129 ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC); 1130 return ret; 1131 } 1132 1133 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 1134 inode_len = EXT4_INODE_SIZE(inode->i_sb); 1135 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 1136 
inode_len += ei->i_extra_isize; 1137 1138 snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS); 1139 if (!snap) { 1140 atomic64_inc(&stats->snap_fail_nomem); 1141 ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM); 1142 brelse(iloc.bh); 1143 return -ENOMEM; 1144 } 1145 INIT_LIST_HEAD(&snap->data_list); 1146 snap->inode_len = inode_len; 1147 1148 memcpy(snap->inode_buf, (u8 *)ext4_raw_inode(&iloc), inode_len); 1149 brelse(iloc.bh); 1150 1151 ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total, 1152 &nr_ranges, snap_err); 1153 if (ret) { 1154 kfree(snap); 1155 ext4_fc_free_ranges(&ranges); 1156 return ret; 1157 } 1158 1159 alloc_ctx = ext4_fc_lock(inode->i_sb); 1160 ext4_fc_free_inode_snap(inode); 1161 ei->i_fc_snap = snap; 1162 list_splice_tail_init(&ranges, &snap->data_list); 1163 ext4_fc_unlock(inode->i_sb, alloc_ctx); 1164 1165 atomic64_inc(&stats->snap_inodes); 1166 atomic64_add(nr_ranges, &stats->snap_ranges); 1167 if (nr_rangesp) 1168 *nr_rangesp = nr_ranges; 1169 return 0; 1170 } 1171 1172 /* Flushes data of all the inodes in the commit queue. 
*/ 1173 static int ext4_fc_flush_data(journal_t *journal) 1174 { 1175 struct super_block *sb = journal->j_private; 1176 struct ext4_sb_info *sbi = EXT4_SB(sb); 1177 struct ext4_inode_info *ei; 1178 int ret = 0; 1179 1180 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1181 ret = jbd2_submit_inode_data(journal, READ_ONCE(ei->jinode)); 1182 if (ret) 1183 return ret; 1184 } 1185 1186 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1187 ret = jbd2_wait_inode_data(journal, READ_ONCE(ei->jinode)); 1188 if (ret) 1189 return ret; 1190 } 1191 1192 return 0; 1193 } 1194 1195 /* Commit all the directory entry updates */ 1196 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) 1197 { 1198 struct super_block *sb = journal->j_private; 1199 struct ext4_sb_info *sbi = EXT4_SB(sb); 1200 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; 1201 struct inode *inode; 1202 struct ext4_inode_info *ei; 1203 int ret; 1204 1205 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) 1206 return 0; 1207 list_for_each_entry_safe(fc_dentry, fc_dentry_n, 1208 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { 1209 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { 1210 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) 1211 return -ENOSPC; 1212 continue; 1213 } 1214 /* 1215 * With fcd_dilist we need not loop in sbi->s_fc_q to get the 1216 * corresponding inode. Also, the corresponding inode could have been 1217 * deleted, in which case, we don't need to do anything. 1218 */ 1219 if (list_empty(&fc_dentry->fcd_dilist)) 1220 continue; 1221 /* 1222 * For EXT4_FC_TAG_CREAT, fcd_dilist is linked on the created 1223 * inode's i_fc_dilist list (kept singular), so we can recover the 1224 * inode through it. 1225 */ 1226 ei = list_first_entry(&fc_dentry->fcd_dilist, 1227 struct ext4_inode_info, i_fc_dilist); 1228 inode = &ei->vfs_inode; 1229 WARN_ON(inode->i_ino != fc_dentry->fcd_ino); 1230 1231 /* 1232 * We first write the inode and then the create dirent. 
This 1233 * allows the recovery code to create an unnamed inode first 1234 * and then link it to a directory entry. This allows us 1235 * to use namei.c routines almost as is and simplifies 1236 * the recovery code. 1237 */ 1238 ret = ext4_fc_write_inode(inode, crc); 1239 if (ret) 1240 return ret; 1241 ret = ext4_fc_write_inode_data(inode, crc); 1242 if (ret) 1243 return ret; 1244 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) 1245 return -ENOSPC; 1246 } 1247 return 0; 1248 } 1249 1250 static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb, 1251 struct inode ***inodesp, 1252 unsigned int *nr_inodesp); 1253 1254 static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes, 1255 unsigned int inodes_size, 1256 unsigned int *nr_inodesp, 1257 unsigned int *nr_rangesp, 1258 int *snap_err) 1259 { 1260 struct super_block *sb = journal->j_private; 1261 struct ext4_sb_info *sbi = EXT4_SB(sb); 1262 struct ext4_inode_info *iter; 1263 struct ext4_fc_dentry_update *fc_dentry; 1264 unsigned int i = 0; 1265 unsigned int idx; 1266 unsigned int nr_ranges = 0; 1267 int ret = 0; 1268 int alloc_ctx; 1269 1270 alloc_ctx = ext4_fc_lock(sb); 1271 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1272 if (i >= inodes_size) { 1273 atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap); 1274 ext4_fc_set_snap_err(snap_err, 1275 EXT4_FC_SNAP_ERR_INODES_CAP); 1276 ret = -E2BIG; 1277 goto unlock; 1278 } 1279 inodes[i++] = &iter->vfs_inode; 1280 } 1281 1282 list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { 1283 struct ext4_inode_info *ei; 1284 struct inode *inode; 1285 1286 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) 1287 continue; 1288 if (list_empty(&fc_dentry->fcd_dilist)) 1289 continue; 1290 1291 /* See the comment in ext4_fc_commit_dentry_updates(). 
*/ 1292 ei = list_first_entry(&fc_dentry->fcd_dilist, 1293 struct ext4_inode_info, i_fc_dilist); 1294 inode = &ei->vfs_inode; 1295 if (!list_empty(&ei->i_fc_list)) 1296 continue; 1297 1298 if (i >= inodes_size) { 1299 atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap); 1300 ext4_fc_set_snap_err(snap_err, 1301 EXT4_FC_SNAP_ERR_INODES_CAP); 1302 ret = -E2BIG; 1303 goto unlock; 1304 } 1305 /* 1306 * Create-only inodes may only be referenced via fcd_dilist and 1307 * not appear on s_fc_q[MAIN]. They may hit the last iput while 1308 * we are snapshotting, but inode eviction calls ext4_fc_del(), 1309 * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING 1310 * so the inode stays pinned and the snapshot stays valid until 1311 * ext4_fc_cleanup(). 1312 */ 1313 ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING); 1314 inodes[i++] = inode; 1315 } 1316 unlock: 1317 ext4_fc_unlock(sb, alloc_ctx); 1318 1319 if (ret) 1320 return ret; 1321 1322 for (idx = 0; idx < i; idx++) { 1323 unsigned int inode_ranges = 0; 1324 1325 ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges, 1326 &inode_ranges, snap_err); 1327 if (ret) 1328 break; 1329 nr_ranges += inode_ranges; 1330 } 1331 1332 if (nr_inodesp) 1333 *nr_inodesp = idx; 1334 if (nr_rangesp) 1335 *nr_rangesp = nr_ranges; 1336 return ret; 1337 } 1338 1339 static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid) 1340 { 1341 struct super_block *sb = journal->j_private; 1342 struct ext4_sb_info *sbi = EXT4_SB(sb); 1343 struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats; 1344 struct ext4_inode_info *iter; 1345 struct ext4_fc_head head; 1346 struct inode *inode; 1347 struct inode **inodes; 1348 unsigned int inodes_size; 1349 unsigned int snap_inodes = 0; 1350 unsigned int snap_ranges = 0; 1351 int snap_err = EXT4_FC_SNAP_ERR_NONE; 1352 struct blk_plug plug; 1353 int ret = 0; 1354 u32 crc = 0; 1355 int alloc_ctx; 1356 ktime_t lock_start; 1357 u64 locked_ns; 1358 1359 /* 1360 * Step 1: Mark all 
inodes on s_fc_q[MAIN] with 1361 * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being 1362 * freed until the data flush is over. 1363 */ 1364 alloc_ctx = ext4_fc_lock(sb); 1365 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1366 ext4_set_inode_state(&iter->vfs_inode, 1367 EXT4_STATE_FC_FLUSHING_DATA); 1368 } 1369 ext4_fc_unlock(sb, alloc_ctx); 1370 1371 /* Step 2: Flush data for all the eligible inodes. */ 1372 ret = ext4_fc_flush_data(journal); 1373 1374 /* 1375 * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning 1376 * any error from step 2. This ensures that waiters waiting on 1377 * EXT4_STATE_FC_FLUSHING_DATA can resume. 1378 */ 1379 alloc_ctx = ext4_fc_lock(sb); 1380 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1381 ext4_clear_inode_state(&iter->vfs_inode, 1382 EXT4_STATE_FC_FLUSHING_DATA); 1383 ext4_fc_wake_inode_state(&iter->vfs_inode, 1384 EXT4_STATE_FC_FLUSHING_DATA); 1385 } 1386 1387 /* 1388 * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before 1389 * the waiter checks the bit. Pairs with implicit barrier in 1390 * prepare_to_wait() in ext4_fc_del(). 1391 */ 1392 smp_mb(); 1393 ext4_fc_unlock(sb, alloc_ctx); 1394 1395 /* 1396 * If we encountered error in Step 2, return it now after clearing 1397 * EXT4_STATE_FC_FLUSHING_DATA bit. 1398 */ 1399 if (ret) 1400 return ret; 1401 1402 ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size); 1403 if (ret) { 1404 if (ret == -E2BIG) 1405 atomic64_inc(&snap_stats->snap_fail_inodes_cap); 1406 else if (ret == -ENOMEM) 1407 atomic64_inc(&snap_stats->snap_fail_nomem); 1408 return ret; 1409 } 1410 1411 /* Step 4: Mark all inodes as being committed. */ 1412 jbd2_journal_lock_updates(journal); 1413 lock_start = ktime_get(); 1414 /* 1415 * The journal is now locked. No more handles can start and all the 1416 * previous handles are now drained. 
Snapshotting happens in this 1417 * window so log writing can consume only stable snapshots without 1418 * doing logical-to-physical mapping. 1419 */ 1420 alloc_ctx = ext4_fc_lock(sb); 1421 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1422 ext4_set_inode_state(&iter->vfs_inode, 1423 EXT4_STATE_FC_COMMITTING); 1424 } 1425 ext4_fc_unlock(sb, alloc_ctx); 1426 1427 ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size, 1428 &snap_inodes, &snap_ranges, &snap_err); 1429 jbd2_journal_unlock_updates(journal); 1430 locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start)); 1431 atomic64_add(locked_ns, &snap_stats->lock_updates_ns_total); 1432 atomic64_inc(&snap_stats->lock_updates_samples); 1433 ext4_fc_snap_stats_update_max(&snap_stats->lock_updates_ns_max, 1434 locked_ns); 1435 if (trace_ext4_fc_lock_updates_enabled()) 1436 trace_call__ext4_fc_lock_updates(sb, commit_tid, locked_ns, 1437 snap_inodes, snap_ranges, 1438 ret, snap_err); 1439 kvfree(inodes); 1440 if (ret) 1441 return ret; 1442 1443 /* 1444 * Step 5: If file system device is different from journal device, 1445 * issue a cache flush before we start writing fast commit blocks. 1446 */ 1447 if (journal->j_fs_dev != journal->j_dev) 1448 blkdev_issue_flush(journal->j_fs_dev); 1449 1450 blk_start_plug(&plug); 1451 alloc_ctx = ext4_fc_lock(sb); 1452 /* Step 6: Write fast commit blocks to disk. */ 1453 if (sbi->s_fc_bytes == 0) { 1454 /* 1455 * Step 6.1: Add a head tag only if this is the first fast 1456 * commit in this TID. 1457 */ 1458 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1459 head.fc_tid = cpu_to_le32( 1460 sbi->s_journal->j_running_transaction->t_tid); 1461 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1462 (u8 *)&head, &crc)) { 1463 ret = -ENOSPC; 1464 goto out; 1465 } 1466 } 1467 1468 /* Step 6.2: Now write all the dentry updates. 
*/ 1469 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1470 if (ret) 1471 goto out; 1472 1473 /* Step 6.3: Now write all the changed inodes to disk. */ 1474 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1475 inode = &iter->vfs_inode; 1476 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1477 continue; 1478 1479 ret = ext4_fc_write_inode_data(inode, &crc); 1480 if (ret) 1481 goto out; 1482 ret = ext4_fc_write_inode(inode, &crc); 1483 if (ret) 1484 goto out; 1485 } 1486 /* Step 6.4: Finally write tail tag to conclude this fast commit. */ 1487 ret = ext4_fc_write_tail(sb, crc); 1488 1489 out: 1490 ext4_fc_unlock(sb, alloc_ctx); 1491 blk_finish_plug(&plug); 1492 return ret; 1493 } 1494 1495 static unsigned int ext4_fc_count_snapshot_inodes(struct super_block *sb) 1496 { 1497 struct ext4_sb_info *sbi = EXT4_SB(sb); 1498 struct ext4_inode_info *iter; 1499 struct ext4_fc_dentry_update *fc_dentry; 1500 unsigned int nr_inodes = 0; 1501 int alloc_ctx; 1502 1503 alloc_ctx = ext4_fc_lock(sb); 1504 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) 1505 nr_inodes++; 1506 1507 list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { 1508 struct ext4_inode_info *ei; 1509 1510 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) 1511 continue; 1512 if (list_empty(&fc_dentry->fcd_dilist)) 1513 continue; 1514 1515 /* See the comment in ext4_fc_commit_dentry_updates(). 
*/ 1516 ei = list_first_entry(&fc_dentry->fcd_dilist, 1517 struct ext4_inode_info, i_fc_dilist); 1518 if (!list_empty(&ei->i_fc_list)) 1519 continue; 1520 1521 nr_inodes++; 1522 } 1523 ext4_fc_unlock(sb, alloc_ctx); 1524 1525 return nr_inodes; 1526 } 1527 1528 static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb, 1529 struct inode ***inodesp, 1530 unsigned int *nr_inodesp) 1531 { 1532 unsigned int nr_inodes = ext4_fc_count_snapshot_inodes(sb); 1533 struct inode **inodes; 1534 1535 *inodesp = NULL; 1536 *nr_inodesp = 0; 1537 1538 if (!nr_inodes) 1539 return 0; 1540 1541 if (nr_inodes > EXT4_FC_SNAPSHOT_MAX_INODES) 1542 return -E2BIG; 1543 1544 inodes = kvcalloc(nr_inodes, sizeof(*inodes), GFP_NOFS); 1545 if (!inodes) 1546 return -ENOMEM; 1547 1548 *inodesp = inodes; 1549 *nr_inodesp = nr_inodes; 1550 return 0; 1551 } 1552 1553 static void ext4_fc_update_stats(struct super_block *sb, int status, 1554 u64 commit_time, int nblks, tid_t commit_tid) 1555 { 1556 struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; 1557 1558 ext4_debug("Fast commit ended with status = %d for tid %u", 1559 status, commit_tid); 1560 if (status == EXT4_FC_STATUS_OK) { 1561 stats->fc_num_commits++; 1562 stats->fc_numblks += nblks; 1563 if (likely(stats->s_fc_avg_commit_time)) 1564 stats->s_fc_avg_commit_time = 1565 (commit_time + 1566 stats->s_fc_avg_commit_time * 3) / 4; 1567 else 1568 stats->s_fc_avg_commit_time = commit_time; 1569 } else if (status == EXT4_FC_STATUS_FAILED || 1570 status == EXT4_FC_STATUS_INELIGIBLE) { 1571 if (status == EXT4_FC_STATUS_FAILED) 1572 stats->fc_failed_commits++; 1573 stats->fc_ineligible_commits++; 1574 } else { 1575 stats->fc_skipped_commits++; 1576 } 1577 trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); 1578 } 1579 1580 /* 1581 * The main commit entry point. Performs a fast commit for transaction 1582 * commit_tid if needed. If it's not possible to perform a fast commit 1583 * due to various reasons, we fall back to full commit. 
Returns 0 1584 * on success, error otherwise. 1585 */ 1586 int ext4_fc_commit(journal_t *journal, tid_t commit_tid) 1587 { 1588 struct super_block *sb = journal->j_private; 1589 struct ext4_sb_info *sbi = EXT4_SB(sb); 1590 int nblks = 0, ret, bsize = journal->j_blocksize; 1591 int subtid = atomic_read(&sbi->s_fc_subtid); 1592 int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; 1593 ktime_t start_time, commit_time; 1594 int old_ioprio, journal_ioprio; 1595 1596 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 1597 return jbd2_complete_transaction(journal, commit_tid); 1598 1599 trace_ext4_fc_commit_start(sb, commit_tid); 1600 1601 start_time = ktime_get(); 1602 old_ioprio = get_current_ioprio(); 1603 1604 restart_fc: 1605 ret = jbd2_fc_begin_commit(journal, commit_tid); 1606 if (ret == -EALREADY) { 1607 /* There was an ongoing commit, check if we need to restart */ 1608 if (atomic_read(&sbi->s_fc_subtid) <= subtid && 1609 tid_gt(commit_tid, journal->j_commit_sequence)) 1610 goto restart_fc; 1611 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, 1612 commit_tid); 1613 return 0; 1614 } else if (ret) { 1615 /* 1616 * Commit couldn't start. Just update stats and perform a 1617 * full commit. 1618 */ 1619 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, 1620 commit_tid); 1621 return jbd2_complete_transaction(journal, commit_tid); 1622 } 1623 1624 /* 1625 * After establishing journal barrier via jbd2_fc_begin_commit(), check 1626 * if we are fast commit ineligible. 1627 */ 1628 if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { 1629 status = EXT4_FC_STATUS_INELIGIBLE; 1630 goto fallback; 1631 } 1632 1633 /* 1634 * Now that we know that this thread is going to do a fast commit, 1635 * elevate the priority to match that of the journal thread. 
1636 */ 1637 if (journal->j_task->io_context) 1638 journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; 1639 else 1640 journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; 1641 set_task_ioprio(current, journal_ioprio); 1642 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; 1643 ret = ext4_fc_perform_commit(journal, commit_tid); 1644 if (ret < 0) { 1645 if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED) 1646 status = EXT4_FC_STATUS_INELIGIBLE; 1647 else 1648 status = EXT4_FC_STATUS_FAILED; 1649 goto fallback; 1650 } 1651 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; 1652 ret = jbd2_fc_wait_bufs(journal, nblks); 1653 if (ret < 0) { 1654 status = EXT4_FC_STATUS_FAILED; 1655 goto fallback; 1656 } 1657 atomic_inc(&sbi->s_fc_subtid); 1658 ret = jbd2_fc_end_commit(journal); 1659 set_task_ioprio(current, old_ioprio); 1660 /* 1661 * weight the commit time higher than the average time so we 1662 * don't react too strongly to vast changes in the commit time 1663 */ 1664 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1665 ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); 1666 return ret; 1667 1668 fallback: 1669 set_task_ioprio(current, old_ioprio); 1670 ret = jbd2_fc_end_commit_fallback(journal); 1671 ext4_fc_update_stats(sb, status, 0, 0, commit_tid); 1672 return ret; 1673 } 1674 1675 /* 1676 * Fast commit cleanup routine. This is called after every fast commit and 1677 * full commit. full is true if we are called after a full commit. 
1678 */ 1679 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) 1680 { 1681 struct super_block *sb = journal->j_private; 1682 struct ext4_sb_info *sbi = EXT4_SB(sb); 1683 struct ext4_inode_info *ei; 1684 struct ext4_fc_dentry_update *fc_dentry; 1685 int alloc_ctx; 1686 1687 if (full && sbi->s_fc_bh) 1688 sbi->s_fc_bh = NULL; 1689 1690 trace_ext4_fc_cleanup(journal, full, tid); 1691 jbd2_fc_release_bufs(journal); 1692 1693 alloc_ctx = ext4_fc_lock(sb); 1694 while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { 1695 bool requeue; 1696 1697 ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], 1698 struct ext4_inode_info, 1699 i_fc_list); 1700 list_del_init(&ei->i_fc_list); 1701 ext4_fc_free_inode_snap(&ei->vfs_inode); 1702 spin_lock(&ei->i_fc_lock); 1703 if (full) 1704 requeue = !tid_geq(tid, ei->i_sync_tid); 1705 else 1706 requeue = ext4_test_inode_state(&ei->vfs_inode, 1707 EXT4_STATE_FC_REQUEUE); 1708 if (!requeue) 1709 ext4_fc_reset_inode(&ei->vfs_inode); 1710 ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_REQUEUE); 1711 ext4_clear_inode_state(&ei->vfs_inode, 1712 EXT4_STATE_FC_COMMITTING); 1713 spin_unlock(&ei->i_fc_lock); 1714 if (requeue) 1715 list_add_tail(&ei->i_fc_list, 1716 &sbi->s_fc_q[FC_Q_STAGING]); 1717 /* 1718 * Make sure clearing of EXT4_STATE_FC_COMMITTING is 1719 * visible before we send the wakeup. Pairs with implicit 1720 * barrier in prepare_to_wait() in ext4_fc_del(). 1721 */ 1722 smp_mb(); 1723 ext4_fc_wake_inode_state(&ei->vfs_inode, 1724 EXT4_STATE_FC_COMMITTING); 1725 } 1726 1727 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1728 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1729 struct ext4_fc_dentry_update, 1730 fcd_list); 1731 list_del_init(&fc_dentry->fcd_list); 1732 if (fc_dentry->fcd_op == EXT4_FC_TAG_CREAT && 1733 !list_empty(&fc_dentry->fcd_dilist)) { 1734 /* See the comment in ext4_fc_commit_dentry_updates(). 
*/ 1735 ei = list_first_entry(&fc_dentry->fcd_dilist, 1736 struct ext4_inode_info, 1737 i_fc_dilist); 1738 ext4_fc_free_inode_snap(&ei->vfs_inode); 1739 spin_lock(&ei->i_fc_lock); 1740 ext4_clear_inode_state(&ei->vfs_inode, 1741 EXT4_STATE_FC_REQUEUE); 1742 ext4_clear_inode_state(&ei->vfs_inode, 1743 EXT4_STATE_FC_COMMITTING); 1744 spin_unlock(&ei->i_fc_lock); 1745 /* 1746 * Make sure clearing of EXT4_STATE_FC_COMMITTING is 1747 * visible before we send the wakeup. Pairs with 1748 * implicit barrier in prepare_to_wait() in 1749 * ext4_fc_del(). 1750 */ 1751 smp_mb(); 1752 ext4_fc_wake_inode_state(&ei->vfs_inode, 1753 EXT4_STATE_FC_COMMITTING); 1754 } 1755 list_del_init(&fc_dentry->fcd_dilist); 1756 1757 release_dentry_name_snapshot(&fc_dentry->fcd_name); 1758 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1759 } 1760 1761 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1762 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1763 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1764 &sbi->s_fc_q[FC_Q_MAIN]); 1765 1766 if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { 1767 sbi->s_fc_ineligible_tid = 0; 1768 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1769 } 1770 1771 if (full) 1772 sbi->s_fc_bytes = 0; 1773 ext4_fc_unlock(sb, alloc_ctx); 1774 trace_ext4_fc_stats(sb); 1775 } 1776 1777 /* Ext4 Replay Path Routines */ 1778 1779 /* Helper struct for dentry replay routines */ 1780 struct dentry_info_args { 1781 int parent_ino, dname_len, ino, inode_len; 1782 char *dname; 1783 }; 1784 1785 /* Same as struct ext4_fc_tl, but uses native endianness fields */ 1786 struct ext4_fc_tl_mem { 1787 u16 fc_tag; 1788 u16 fc_len; 1789 }; 1790 1791 static inline void tl_to_darg(struct dentry_info_args *darg, 1792 struct ext4_fc_tl_mem *tl, u8 *val) 1793 { 1794 struct ext4_fc_dentry_info fcd; 1795 1796 memcpy(&fcd, val, sizeof(fcd)); 1797 1798 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); 1799 darg->ino = le32_to_cpu(fcd.fc_ino); 1800 darg->dname = val + offsetof(struct ext4_fc_dentry_info, 
fc_dname); 1801 darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); 1802 } 1803 1804 static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) 1805 { 1806 struct ext4_fc_tl tl_disk; 1807 1808 memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); 1809 tl->fc_len = le16_to_cpu(tl_disk.fc_len); 1810 tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); 1811 } 1812 1813 /* Unlink replay function */ 1814 static int ext4_fc_replay_unlink(struct super_block *sb, 1815 struct ext4_fc_tl_mem *tl, u8 *val) 1816 { 1817 struct inode *inode, *old_parent; 1818 struct qstr entry; 1819 struct dentry_info_args darg; 1820 int ret = 0; 1821 1822 tl_to_darg(&darg, tl, val); 1823 1824 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, 1825 darg.parent_ino, darg.dname_len); 1826 1827 entry.name = darg.dname; 1828 entry.len = darg.dname_len; 1829 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1830 1831 if (IS_ERR(inode)) { 1832 ext4_debug("Inode %d not found", darg.ino); 1833 return 0; 1834 } 1835 1836 old_parent = ext4_iget(sb, darg.parent_ino, 1837 EXT4_IGET_NORMAL); 1838 if (IS_ERR(old_parent)) { 1839 ext4_debug("Dir with inode %d not found", darg.parent_ino); 1840 iput(inode); 1841 return 0; 1842 } 1843 1844 ret = __ext4_unlink(old_parent, &entry, inode, NULL); 1845 /* -ENOENT ok coz it might not exist anymore. 
*/ 1846 if (ret == -ENOENT) 1847 ret = 0; 1848 iput(old_parent); 1849 iput(inode); 1850 return ret; 1851 } 1852 1853 static int ext4_fc_replay_link_internal(struct super_block *sb, 1854 struct dentry_info_args *darg, 1855 struct inode *inode) 1856 { 1857 struct inode *dir = NULL; 1858 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1859 int ret = 0; 1860 1861 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); 1862 if (IS_ERR(dir)) { 1863 ext4_debug("Dir with inode %d not found.", darg->parent_ino); 1864 dir = NULL; 1865 goto out; 1866 } 1867 1868 ret = __ext4_link(dir, inode, &qstr_dname, NULL); 1869 /* 1870 * It's possible that link already existed since data blocks 1871 * for the dir in question got persisted before we crashed OR 1872 * we replayed this tag and crashed before the entire replay 1873 * could complete. 1874 */ 1875 if (ret && ret != -EEXIST) { 1876 ext4_debug("Failed to link\n"); 1877 goto out; 1878 } 1879 1880 ret = 0; 1881 out: 1882 if (dir) 1883 iput(dir); 1884 1885 return ret; 1886 } 1887 1888 /* Link replay function */ 1889 static int ext4_fc_replay_link(struct super_block *sb, 1890 struct ext4_fc_tl_mem *tl, u8 *val) 1891 { 1892 struct inode *inode; 1893 struct dentry_info_args darg; 1894 int ret = 0; 1895 1896 tl_to_darg(&darg, tl, val); 1897 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, 1898 darg.parent_ino, darg.dname_len); 1899 1900 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1901 if (IS_ERR(inode)) { 1902 ext4_debug("Inode not found."); 1903 return 0; 1904 } 1905 1906 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1907 iput(inode); 1908 return ret; 1909 } 1910 1911 /* 1912 * Record all the modified inodes during replay. We use this later to setup 1913 * block bitmaps correctly. 
1914 */ 1915 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1916 { 1917 struct ext4_fc_replay_state *state; 1918 int i; 1919 1920 state = &EXT4_SB(sb)->s_fc_replay_state; 1921 for (i = 0; i < state->fc_modified_inodes_used; i++) 1922 if (state->fc_modified_inodes[i] == ino) 1923 return 0; 1924 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1925 int *fc_modified_inodes; 1926 1927 fc_modified_inodes = krealloc(state->fc_modified_inodes, 1928 sizeof(int) * (state->fc_modified_inodes_size + 1929 EXT4_FC_REPLAY_REALLOC_INCREMENT), 1930 GFP_KERNEL); 1931 if (!fc_modified_inodes) 1932 return -ENOMEM; 1933 state->fc_modified_inodes = fc_modified_inodes; 1934 state->fc_modified_inodes_size += 1935 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1936 } 1937 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1938 return 0; 1939 } 1940 1941 /* 1942 * Inode replay function 1943 */ 1944 static int ext4_fc_replay_inode(struct super_block *sb, 1945 struct ext4_fc_tl_mem *tl, u8 *val) 1946 { 1947 struct ext4_fc_inode fc_inode; 1948 struct ext4_inode *raw_inode; 1949 struct ext4_inode *raw_fc_inode; 1950 struct inode *inode = NULL; 1951 struct ext4_iloc iloc; 1952 int inode_len, ino, ret, tag = tl->fc_tag; 1953 struct ext4_extent_header *eh; 1954 size_t off_gen = offsetof(struct ext4_inode, i_generation); 1955 1956 memcpy(&fc_inode, val, sizeof(fc_inode)); 1957 1958 ino = le32_to_cpu(fc_inode.fc_ino); 1959 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1960 1961 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1962 if (!IS_ERR(inode)) { 1963 ext4_ext_clear_bb(inode); 1964 iput(inode); 1965 } 1966 inode = NULL; 1967 1968 ret = ext4_fc_record_modified_inode(sb, ino); 1969 if (ret) 1970 goto out; 1971 1972 raw_fc_inode = (struct ext4_inode *) 1973 (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); 1974 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1975 if (ret) 1976 goto out; 1977 1978 inode_len = tl->fc_len - sizeof(struct 
ext4_fc_inode); 1979 raw_inode = ext4_raw_inode(&iloc); 1980 1981 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1982 memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, 1983 inode_len - off_gen); 1984 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1985 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1986 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1987 memset(eh, 0, sizeof(*eh)); 1988 eh->eh_magic = EXT4_EXT_MAGIC; 1989 eh->eh_max = cpu_to_le16( 1990 (sizeof(raw_inode->i_block) - 1991 sizeof(struct ext4_extent_header)) 1992 / sizeof(struct ext4_extent)); 1993 } 1994 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1995 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1996 sizeof(raw_inode->i_block)); 1997 } 1998 1999 /* Immediately update the inode on disk. */ 2000 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 2001 if (ret) 2002 goto out_brelse; 2003 ret = sync_dirty_buffer(iloc.bh); 2004 if (ret) 2005 goto out_brelse; 2006 ret = ext4_mark_inode_used(sb, ino); 2007 if (ret) 2008 goto out_brelse; 2009 2010 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 2011 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 2012 if (IS_ERR(inode)) { 2013 ext4_debug("Inode not found."); 2014 inode = NULL; 2015 ret = -EFSCORRUPTED; 2016 goto out_brelse; 2017 } 2018 2019 /* 2020 * Our allocator could have made different decisions than before 2021 * crashing. This should be fixed but until then, we calculate 2022 * the number of blocks the inode. 
2023 */ 2024 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 2025 ext4_ext_replay_set_iblocks(inode); 2026 2027 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); 2028 ext4_reset_inode_seed(inode); 2029 2030 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); 2031 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 2032 sync_dirty_buffer(iloc.bh); 2033 out_brelse: 2034 brelse(iloc.bh); 2035 out: 2036 iput(inode); 2037 if (!ret) 2038 blkdev_issue_flush(sb->s_bdev); 2039 2040 return ret; 2041 } 2042 2043 /* 2044 * Dentry create replay function. 2045 * 2046 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the 2047 * inode for which we are trying to create a dentry here, should already have 2048 * been replayed before we start here. 2049 */ 2050 static int ext4_fc_replay_create(struct super_block *sb, 2051 struct ext4_fc_tl_mem *tl, u8 *val) 2052 { 2053 int ret = 0; 2054 struct inode *inode = NULL; 2055 struct inode *dir = NULL; 2056 struct dentry_info_args darg; 2057 2058 tl_to_darg(&darg, tl, val); 2059 2060 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, 2061 darg.parent_ino, darg.dname_len); 2062 2063 /* This takes care of update group descriptor and other metadata */ 2064 ret = ext4_mark_inode_used(sb, darg.ino); 2065 if (ret) 2066 goto out; 2067 2068 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 2069 if (IS_ERR(inode)) { 2070 ext4_debug("inode %d not found.", darg.ino); 2071 inode = NULL; 2072 ret = -EINVAL; 2073 goto out; 2074 } 2075 2076 if (S_ISDIR(inode->i_mode)) { 2077 /* 2078 * If we are creating a directory, we need to make sure that the 2079 * dot and dot dot dirents are setup properly. 
2080 */ 2081 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); 2082 if (IS_ERR(dir)) { 2083 ext4_debug("Dir %d not found.", darg.ino); 2084 goto out; 2085 } 2086 ret = ext4_init_new_dir(NULL, dir, inode); 2087 iput(dir); 2088 if (ret) { 2089 ret = 0; 2090 goto out; 2091 } 2092 } 2093 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 2094 if (ret) 2095 goto out; 2096 set_nlink(inode, 1); 2097 ext4_mark_inode_dirty(NULL, inode); 2098 out: 2099 iput(inode); 2100 return ret; 2101 } 2102 2103 /* 2104 * Record physical disk regions which are in use as per fast commit area, 2105 * and used by inodes during replay phase. Our simple replay phase 2106 * allocator excludes these regions from allocation. 2107 */ 2108 int ext4_fc_record_regions(struct super_block *sb, int ino, 2109 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) 2110 { 2111 struct ext4_fc_replay_state *state; 2112 struct ext4_fc_alloc_region *region; 2113 2114 state = &EXT4_SB(sb)->s_fc_replay_state; 2115 /* 2116 * during replay phase, the fc_regions_valid may not same as 2117 * fc_regions_used, update it when do new additions. 
2118 */ 2119 if (replay && state->fc_regions_used != state->fc_regions_valid) 2120 state->fc_regions_used = state->fc_regions_valid; 2121 if (state->fc_regions_used == state->fc_regions_size) { 2122 struct ext4_fc_alloc_region *fc_regions; 2123 2124 fc_regions = krealloc(state->fc_regions, 2125 sizeof(struct ext4_fc_alloc_region) * 2126 (state->fc_regions_size + 2127 EXT4_FC_REPLAY_REALLOC_INCREMENT), 2128 GFP_KERNEL); 2129 if (!fc_regions) 2130 return -ENOMEM; 2131 state->fc_regions_size += 2132 EXT4_FC_REPLAY_REALLOC_INCREMENT; 2133 state->fc_regions = fc_regions; 2134 } 2135 region = &state->fc_regions[state->fc_regions_used++]; 2136 region->ino = ino; 2137 region->lblk = lblk; 2138 region->pblk = pblk; 2139 region->len = len; 2140 2141 if (replay) 2142 state->fc_regions_valid++; 2143 2144 return 0; 2145 } 2146 2147 /* Replay add range tag */ 2148 static int ext4_fc_replay_add_range(struct super_block *sb, u8 *val) 2149 { 2150 struct ext4_fc_add_range fc_add_ex; 2151 struct ext4_extent newex, *ex; 2152 struct inode *inode; 2153 ext4_lblk_t start, cur; 2154 int remaining, len; 2155 ext4_fsblk_t start_pblk; 2156 struct ext4_map_blocks map; 2157 struct ext4_ext_path *path = NULL; 2158 int ret; 2159 2160 memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); 2161 ex = (struct ext4_extent *)&fc_add_ex.fc_ex; 2162 2163 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 2164 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), 2165 ext4_ext_get_actual_len(ex)); 2166 2167 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); 2168 if (IS_ERR(inode)) { 2169 ext4_debug("Inode not found."); 2170 return 0; 2171 } 2172 2173 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 2174 if (ret) 2175 goto out; 2176 2177 start = le32_to_cpu(ex->ee_block); 2178 start_pblk = ext4_ext_pblock(ex); 2179 len = ext4_ext_get_actual_len(ex); 2180 2181 cur = start; 2182 remaining = len; 2183 ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %llu\n", 2184 start, 
start_pblk, len, ext4_ext_is_unwritten(ex), 2185 inode->i_ino); 2186 2187 while (remaining > 0) { 2188 map.m_lblk = cur; 2189 map.m_len = remaining; 2190 map.m_pblk = 0; 2191 ret = ext4_map_blocks(NULL, inode, &map, 0); 2192 2193 if (ret < 0) 2194 goto out; 2195 2196 if (ret == 0) { 2197 /* Range is not mapped */ 2198 path = ext4_find_extent(inode, cur, path, 0); 2199 if (IS_ERR(path)) 2200 goto out; 2201 memset(&newex, 0, sizeof(newex)); 2202 newex.ee_block = cpu_to_le32(cur); 2203 ext4_ext_store_pblock( 2204 &newex, start_pblk + cur - start); 2205 newex.ee_len = cpu_to_le16(map.m_len); 2206 if (ext4_ext_is_unwritten(ex)) 2207 ext4_ext_mark_unwritten(&newex); 2208 down_write(&EXT4_I(inode)->i_data_sem); 2209 path = ext4_ext_insert_extent(NULL, inode, 2210 path, &newex, 0); 2211 up_write((&EXT4_I(inode)->i_data_sem)); 2212 if (IS_ERR(path)) 2213 goto out; 2214 goto next; 2215 } 2216 2217 if (start_pblk + cur - start != map.m_pblk) { 2218 /* 2219 * Logical to physical mapping changed. This can happen 2220 * if this range was removed and then reallocated to 2221 * map to new physical blocks during a fast commit. 2222 */ 2223 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 2224 ext4_ext_is_unwritten(ex), 2225 start_pblk + cur - start); 2226 if (ret) 2227 goto out; 2228 /* 2229 * Mark the old blocks as free since they aren't used 2230 * anymore. We maintain an array of all the modified 2231 * inodes. In case these blocks are still used at either 2232 * a different logical range in the same inode or in 2233 * some different inode, we will mark them as allocated 2234 * at the end of the FC replay using our array of 2235 * modified inodes. 
2236 */ 2237 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); 2238 goto next; 2239 } 2240 2241 /* Range is mapped and needs a state change */ 2242 ext4_debug("Converting from %ld to %d %lld", 2243 map.m_flags & EXT4_MAP_UNWRITTEN, 2244 ext4_ext_is_unwritten(ex), map.m_pblk); 2245 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 2246 ext4_ext_is_unwritten(ex), map.m_pblk); 2247 if (ret) 2248 goto out; 2249 /* 2250 * We may have split the extent tree while toggling the state. 2251 * Try to shrink the extent tree now. 2252 */ 2253 ext4_ext_replay_shrink_inode(inode, start + len); 2254 next: 2255 cur += map.m_len; 2256 remaining -= map.m_len; 2257 } 2258 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 2259 sb->s_blocksize_bits); 2260 out: 2261 ext4_free_ext_path(path); 2262 iput(inode); 2263 return 0; 2264 } 2265 2266 /* Replay DEL_RANGE tag */ 2267 static int 2268 ext4_fc_replay_del_range(struct super_block *sb, u8 *val) 2269 { 2270 struct inode *inode; 2271 struct ext4_fc_del_range lrange; 2272 struct ext4_map_blocks map; 2273 ext4_lblk_t cur, remaining; 2274 int ret; 2275 2276 memcpy(&lrange, val, sizeof(lrange)); 2277 cur = le32_to_cpu(lrange.fc_lblk); 2278 remaining = le32_to_cpu(lrange.fc_len); 2279 2280 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, 2281 le32_to_cpu(lrange.fc_ino), cur, remaining); 2282 2283 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); 2284 if (IS_ERR(inode)) { 2285 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); 2286 return 0; 2287 } 2288 2289 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 2290 if (ret) 2291 goto out; 2292 2293 ext4_debug("DEL_RANGE, inode %llu, lblk %d, len %d\n", 2294 inode->i_ino, le32_to_cpu(lrange.fc_lblk), 2295 le32_to_cpu(lrange.fc_len)); 2296 while (remaining > 0) { 2297 map.m_lblk = cur; 2298 map.m_len = remaining; 2299 2300 ret = ext4_map_blocks(NULL, inode, &map, 0); 2301 if (ret < 0) 2302 goto out; 2303 if (ret > 0) { 2304 remaining -= ret; 
2305 cur += ret; 2306 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); 2307 } else { 2308 remaining -= map.m_len; 2309 cur += map.m_len; 2310 } 2311 } 2312 2313 down_write(&EXT4_I(inode)->i_data_sem); 2314 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), 2315 le32_to_cpu(lrange.fc_lblk) + 2316 le32_to_cpu(lrange.fc_len) - 1); 2317 up_write(&EXT4_I(inode)->i_data_sem); 2318 if (ret) 2319 goto out; 2320 ext4_ext_replay_shrink_inode(inode, 2321 i_size_read(inode) >> sb->s_blocksize_bits); 2322 ext4_mark_inode_dirty(NULL, inode); 2323 out: 2324 iput(inode); 2325 return 0; 2326 } 2327 2328 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 2329 { 2330 struct ext4_fc_replay_state *state; 2331 struct inode *inode; 2332 struct ext4_ext_path *path = NULL; 2333 struct ext4_map_blocks map; 2334 int i, ret, j; 2335 ext4_lblk_t cur, end; 2336 2337 state = &EXT4_SB(sb)->s_fc_replay_state; 2338 for (i = 0; i < state->fc_modified_inodes_used; i++) { 2339 inode = ext4_iget(sb, state->fc_modified_inodes[i], 2340 EXT4_IGET_NORMAL); 2341 if (IS_ERR(inode)) { 2342 ext4_debug("Inode %d not found.", 2343 state->fc_modified_inodes[i]); 2344 continue; 2345 } 2346 cur = 0; 2347 end = EXT_MAX_BLOCKS; 2348 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { 2349 iput(inode); 2350 continue; 2351 } 2352 while (cur < end) { 2353 map.m_lblk = cur; 2354 map.m_len = end - cur; 2355 2356 ret = ext4_map_blocks(NULL, inode, &map, 0); 2357 if (ret < 0) 2358 break; 2359 2360 if (ret > 0) { 2361 path = ext4_find_extent(inode, map.m_lblk, path, 0); 2362 if (!IS_ERR(path)) { 2363 for (j = 0; j < path->p_depth; j++) 2364 ext4_mb_mark_bb(inode->i_sb, 2365 path[j].p_block, 1, true); 2366 } else { 2367 path = NULL; 2368 } 2369 cur += ret; 2370 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, 2371 map.m_len, true); 2372 } else { 2373 cur = cur + (map.m_len ? 
map.m_len : 1); 2374 } 2375 } 2376 iput(inode); 2377 } 2378 2379 ext4_free_ext_path(path); 2380 } 2381 2382 /* 2383 * Check if block is in excluded regions for block allocation. The simple 2384 * allocator that runs during replay phase is calls this function to see 2385 * if it is okay to use a block. 2386 */ 2387 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) 2388 { 2389 int i; 2390 struct ext4_fc_replay_state *state; 2391 2392 state = &EXT4_SB(sb)->s_fc_replay_state; 2393 for (i = 0; i < state->fc_regions_valid; i++) { 2394 if (state->fc_regions[i].ino == 0 || 2395 state->fc_regions[i].len == 0) 2396 continue; 2397 if (in_range(blk, state->fc_regions[i].pblk, 2398 state->fc_regions[i].len)) 2399 return true; 2400 } 2401 return false; 2402 } 2403 2404 /* Cleanup function called after replay */ 2405 void ext4_fc_replay_cleanup(struct super_block *sb) 2406 { 2407 struct ext4_sb_info *sbi = EXT4_SB(sb); 2408 2409 sbi->s_mount_state &= ~EXT4_FC_REPLAY; 2410 kfree(sbi->s_fc_replay_state.fc_regions); 2411 kfree(sbi->s_fc_replay_state.fc_modified_inodes); 2412 } 2413 2414 static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, 2415 int tag, int len) 2416 { 2417 switch (tag) { 2418 case EXT4_FC_TAG_ADD_RANGE: 2419 return len == sizeof(struct ext4_fc_add_range); 2420 case EXT4_FC_TAG_DEL_RANGE: 2421 return len == sizeof(struct ext4_fc_del_range); 2422 case EXT4_FC_TAG_CREAT: 2423 case EXT4_FC_TAG_LINK: 2424 case EXT4_FC_TAG_UNLINK: 2425 len -= sizeof(struct ext4_fc_dentry_info); 2426 return len >= 1 && len <= EXT4_NAME_LEN; 2427 case EXT4_FC_TAG_INODE: 2428 len -= sizeof(struct ext4_fc_inode); 2429 return len >= EXT4_GOOD_OLD_INODE_SIZE && 2430 len <= sbi->s_inode_size; 2431 case EXT4_FC_TAG_PAD: 2432 return true; /* padding can have any length */ 2433 case EXT4_FC_TAG_TAIL: 2434 return len >= sizeof(struct ext4_fc_tail); 2435 case EXT4_FC_TAG_HEAD: 2436 return len == sizeof(struct ext4_fc_head); 2437 } 2438 return false; 2439 } 2440 
2441 /* 2442 * Recovery Scan phase handler 2443 * 2444 * This function is called during the scan phase and is responsible 2445 * for doing following things: 2446 * - Make sure the fast commit area has valid tags for replay 2447 * - Count number of tags that need to be replayed by the replay handler 2448 * - Verify CRC 2449 * - Create a list of excluded blocks for allocation during replay phase 2450 * 2451 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is 2452 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP 2453 * to indicate that scan has finished and JBD2 can now start replay phase. 2454 * It returns a negative error to indicate that there was an error. At the end 2455 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set 2456 * to indicate the number of tags that need to replayed during the replay phase. 2457 */ 2458 static int ext4_fc_replay_scan(journal_t *journal, 2459 struct buffer_head *bh, int off, 2460 tid_t expected_tid) 2461 { 2462 struct super_block *sb = journal->j_private; 2463 struct ext4_sb_info *sbi = EXT4_SB(sb); 2464 struct ext4_fc_replay_state *state; 2465 int ret = JBD2_FC_REPLAY_CONTINUE; 2466 struct ext4_fc_add_range ext; 2467 struct ext4_fc_tl_mem tl; 2468 struct ext4_fc_tail tail; 2469 __u8 *start, *end, *cur, *val; 2470 struct ext4_fc_head head; 2471 struct ext4_extent *ex; 2472 2473 state = &sbi->s_fc_replay_state; 2474 2475 start = (u8 *)bh->b_data; 2476 end = start + journal->j_blocksize; 2477 2478 if (state->fc_replay_expected_off == 0) { 2479 state->fc_cur_tag = 0; 2480 state->fc_replay_num_tags = 0; 2481 state->fc_crc = 0; 2482 state->fc_regions = NULL; 2483 state->fc_regions_valid = state->fc_regions_used = 2484 state->fc_regions_size = 0; 2485 /* Check if we can stop early */ 2486 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) 2487 != EXT4_FC_TAG_HEAD) 2488 return 0; 2489 } 2490 2491 if (off != state->fc_replay_expected_off) { 2492 ret = 
-EFSCORRUPTED; 2493 goto out_err; 2494 } 2495 2496 state->fc_replay_expected_off++; 2497 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; 2498 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { 2499 ext4_fc_get_tl(&tl, cur); 2500 val = cur + EXT4_FC_TAG_BASE_LEN; 2501 if (tl.fc_len > end - val || 2502 !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { 2503 ret = state->fc_replay_num_tags ? 2504 JBD2_FC_REPLAY_STOP : -ECANCELED; 2505 goto out_err; 2506 } 2507 ext4_debug("Scan phase, tag:%s, blk %lld\n", 2508 tag2str(tl.fc_tag), bh->b_blocknr); 2509 switch (tl.fc_tag) { 2510 case EXT4_FC_TAG_ADD_RANGE: 2511 memcpy(&ext, val, sizeof(ext)); 2512 ex = (struct ext4_extent *)&ext.fc_ex; 2513 ret = ext4_fc_record_regions(sb, 2514 le32_to_cpu(ext.fc_ino), 2515 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), 2516 ext4_ext_get_actual_len(ex), 0); 2517 if (ret < 0) 2518 break; 2519 ret = JBD2_FC_REPLAY_CONTINUE; 2520 fallthrough; 2521 case EXT4_FC_TAG_DEL_RANGE: 2522 case EXT4_FC_TAG_LINK: 2523 case EXT4_FC_TAG_UNLINK: 2524 case EXT4_FC_TAG_CREAT: 2525 case EXT4_FC_TAG_INODE: 2526 case EXT4_FC_TAG_PAD: 2527 state->fc_cur_tag++; 2528 state->fc_crc = ext4_chksum(state->fc_crc, cur, 2529 EXT4_FC_TAG_BASE_LEN + tl.fc_len); 2530 break; 2531 case EXT4_FC_TAG_TAIL: 2532 state->fc_cur_tag++; 2533 memcpy(&tail, val, sizeof(tail)); 2534 state->fc_crc = ext4_chksum(state->fc_crc, cur, 2535 EXT4_FC_TAG_BASE_LEN + 2536 offsetof(struct ext4_fc_tail, 2537 fc_crc)); 2538 if (le32_to_cpu(tail.fc_tid) == expected_tid && 2539 le32_to_cpu(tail.fc_crc) == state->fc_crc) { 2540 state->fc_replay_num_tags = state->fc_cur_tag; 2541 state->fc_regions_valid = 2542 state->fc_regions_used; 2543 } else { 2544 ret = state->fc_replay_num_tags ? 
2545 JBD2_FC_REPLAY_STOP : -EFSBADCRC; 2546 } 2547 state->fc_crc = 0; 2548 break; 2549 case EXT4_FC_TAG_HEAD: 2550 memcpy(&head, val, sizeof(head)); 2551 if (le32_to_cpu(head.fc_features) & 2552 ~EXT4_FC_SUPPORTED_FEATURES) { 2553 ret = -EOPNOTSUPP; 2554 break; 2555 } 2556 if (le32_to_cpu(head.fc_tid) != expected_tid) { 2557 ret = JBD2_FC_REPLAY_STOP; 2558 break; 2559 } 2560 state->fc_cur_tag++; 2561 state->fc_crc = ext4_chksum(state->fc_crc, cur, 2562 EXT4_FC_TAG_BASE_LEN + tl.fc_len); 2563 break; 2564 default: 2565 ret = state->fc_replay_num_tags ? 2566 JBD2_FC_REPLAY_STOP : -ECANCELED; 2567 } 2568 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) 2569 break; 2570 } 2571 2572 out_err: 2573 trace_ext4_fc_replay_scan(sb, ret, off); 2574 return ret; 2575 } 2576 2577 /* 2578 * Main recovery path entry point. 2579 * The meaning of return codes is similar as above. 2580 */ 2581 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, 2582 enum passtype pass, int off, tid_t expected_tid) 2583 { 2584 struct super_block *sb = journal->j_private; 2585 struct ext4_sb_info *sbi = EXT4_SB(sb); 2586 struct ext4_fc_tl_mem tl; 2587 __u8 *start, *end, *cur, *val; 2588 int ret = JBD2_FC_REPLAY_CONTINUE; 2589 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; 2590 struct ext4_fc_tail tail; 2591 2592 if (pass == PASS_SCAN) { 2593 state->fc_current_pass = PASS_SCAN; 2594 return ext4_fc_replay_scan(journal, bh, off, expected_tid); 2595 } 2596 2597 if (state->fc_current_pass != pass) { 2598 state->fc_current_pass = pass; 2599 sbi->s_mount_state |= EXT4_FC_REPLAY; 2600 } 2601 if (!sbi->s_fc_replay_state.fc_replay_num_tags) { 2602 ext4_debug("Replay stops\n"); 2603 ext4_fc_set_bitmaps_and_counters(sb); 2604 return 0; 2605 } 2606 2607 #ifdef CONFIG_EXT4_DEBUG 2608 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { 2609 pr_warn("Dropping fc block %d because max_replay set\n", off); 2610 return JBD2_FC_REPLAY_STOP; 2611 } 2612 #endif 2613 2614 start = 
(u8 *)bh->b_data; 2615 end = start + journal->j_blocksize; 2616 2617 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; 2618 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { 2619 ext4_fc_get_tl(&tl, cur); 2620 val = cur + EXT4_FC_TAG_BASE_LEN; 2621 2622 if (state->fc_replay_num_tags == 0) { 2623 ret = JBD2_FC_REPLAY_STOP; 2624 ext4_fc_set_bitmaps_and_counters(sb); 2625 break; 2626 } 2627 2628 ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); 2629 state->fc_replay_num_tags--; 2630 switch (tl.fc_tag) { 2631 case EXT4_FC_TAG_LINK: 2632 ret = ext4_fc_replay_link(sb, &tl, val); 2633 break; 2634 case EXT4_FC_TAG_UNLINK: 2635 ret = ext4_fc_replay_unlink(sb, &tl, val); 2636 break; 2637 case EXT4_FC_TAG_ADD_RANGE: 2638 ret = ext4_fc_replay_add_range(sb, val); 2639 break; 2640 case EXT4_FC_TAG_CREAT: 2641 ret = ext4_fc_replay_create(sb, &tl, val); 2642 break; 2643 case EXT4_FC_TAG_DEL_RANGE: 2644 ret = ext4_fc_replay_del_range(sb, val); 2645 break; 2646 case EXT4_FC_TAG_INODE: 2647 ret = ext4_fc_replay_inode(sb, &tl, val); 2648 break; 2649 case EXT4_FC_TAG_PAD: 2650 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, 2651 tl.fc_len, 0); 2652 break; 2653 case EXT4_FC_TAG_TAIL: 2654 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 2655 0, tl.fc_len, 0); 2656 memcpy(&tail, val, sizeof(tail)); 2657 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); 2658 break; 2659 case EXT4_FC_TAG_HEAD: 2660 break; 2661 default: 2662 trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); 2663 ret = -ECANCELED; 2664 break; 2665 } 2666 if (ret < 0) 2667 break; 2668 ret = JBD2_FC_REPLAY_CONTINUE; 2669 } 2670 return ret; 2671 } 2672 2673 void ext4_fc_init(struct super_block *sb, journal_t *journal) 2674 { 2675 /* 2676 * We set replay callback even if fast commit disabled because we may 2677 * could still have fast commit blocks that need to be replayed even if 2678 * fast commit has now been turned off. 
2679 */ 2680 journal->j_fc_replay_callback = ext4_fc_replay; 2681 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 2682 return; 2683 journal->j_fc_cleanup_callback = ext4_fc_cleanup; 2684 } 2685 2686 static const char * const fc_ineligible_reasons[] = { 2687 [EXT4_FC_REASON_XATTR] = "Extended attributes changed", 2688 [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", 2689 [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", 2690 [EXT4_FC_REASON_NOMEM] = "Insufficient memory", 2691 [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", 2692 [EXT4_FC_REASON_RESIZE] = "Resize", 2693 [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", 2694 [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", 2695 [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", 2696 [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", 2697 [EXT4_FC_REASON_MIGRATE] = "Inode format migration", 2698 [EXT4_FC_REASON_VERITY] = "fs-verity enable", 2699 [EXT4_FC_REASON_MOVE_EXT] = "Move extents", 2700 }; 2701 2702 int ext4_fc_info_show(struct seq_file *seq, void *v) 2703 { 2704 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); 2705 struct ext4_fc_stats *stats = &sbi->s_fc_stats; 2706 struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats; 2707 u64 lock_avg_ns = 0; 2708 u64 lock_updates_samples; 2709 u64 lock_updates_ns_total; 2710 u64 lock_updates_ns_max; 2711 int i; 2712 2713 if (v != SEQ_START_TOKEN) 2714 return 0; 2715 2716 lock_updates_samples = 2717 atomic64_read(&snap_stats->lock_updates_samples); 2718 lock_updates_ns_total = 2719 atomic64_read(&snap_stats->lock_updates_ns_total); 2720 lock_updates_ns_max = 2721 atomic64_read(&snap_stats->lock_updates_ns_max); 2722 if (lock_updates_samples) 2723 lock_avg_ns = div64_u64(lock_updates_ns_total, 2724 lock_updates_samples); 2725 2726 seq_printf(seq, 2727 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", 2728 stats->fc_num_commits, stats->fc_ineligible_commits, 2729 stats->fc_numblks, 2730 
div_u64(stats->s_fc_avg_commit_time, 1000)); 2731 seq_puts(seq, "Ineligible reasons:\n"); 2732 for (i = 0; i < EXT4_FC_REASON_MAX; i++) 2733 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], 2734 stats->fc_ineligible_reason_count[i]); 2735 2736 seq_printf(seq, 2737 "Snapshot stats:\n%llu inodes\n%llu ranges\n%lluus lock_updates_avg\n%lluus lock_updates_max\n", 2738 atomic64_read(&snap_stats->snap_inodes), 2739 atomic64_read(&snap_stats->snap_ranges), 2740 div_u64(lock_avg_ns, 1000), 2741 div_u64(lock_updates_ns_max, 1000)); 2742 seq_printf(seq, 2743 "Snapshot failures:\n%llu es_miss\n%llu es_delayed\n%llu es_other\n%llu inodes_cap\n%llu ranges_cap\n%llu nomem\n%llu inode_loc\n%llu no_snap\n", 2744 atomic64_read(&snap_stats->snap_fail_es_miss), 2745 atomic64_read(&snap_stats->snap_fail_es_delayed), 2746 atomic64_read(&snap_stats->snap_fail_es_other), 2747 atomic64_read(&snap_stats->snap_fail_inodes_cap), 2748 atomic64_read(&snap_stats->snap_fail_ranges_cap), 2749 atomic64_read(&snap_stats->snap_fail_nomem), 2750 atomic64_read(&snap_stats->snap_fail_inode_loc), 2751 atomic64_read(&snap_stats->snap_fail_no_snap)); 2752 2753 return 0; 2754 } 2755 2756 int __init ext4_fc_init_dentry_cache(void) 2757 { 2758 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, 2759 SLAB_RECLAIM_ACCOUNT); 2760 2761 if (!ext4_fc_dentry_cachep) 2762 return -ENOMEM; 2763 2764 ext4_fc_range_cachep = KMEM_CACHE(ext4_fc_range, SLAB_RECLAIM_ACCOUNT); 2765 if (!ext4_fc_range_cachep) { 2766 kmem_cache_destroy(ext4_fc_dentry_cachep); 2767 return -ENOMEM; 2768 } 2769 2770 return 0; 2771 } 2772 2773 void ext4_fc_destroy_dentry_cache(void) 2774 { 2775 kmem_cache_destroy(ext4_fc_range_cachep); 2776 kmem_cache_destroy(ext4_fc_dentry_cachep); 2777 } 2778