// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories:
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed and instead derived during
 *				  replay.
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means, we may end up with multiple
 * tails in the fast commit space.
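 *
 * For reference, the TLV header and the tail's value are small fixed-size
 * structs; the sketch below mirrors their definitions in fast_commit.h and
 * is only illustrative:
 *
 *	struct ext4_fc_tl {		(TLV header)
 *		__le16 fc_tag;		(one of the EXT4_FC_TAG_* values)
 *		__le16 fc_len;		(length of the value that follows)
 *	};
 *
 *	struct ext4_fc_tail {		(value of an EXT4_FC_TAG_TAIL TLV)
 *		__le32 fc_tid;		(TID this fast commit belongs to)
 *		__le32 fc_crc;		(CRC of the fast commit contents)
 *	};
 *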
 * Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *      |<-------- Fast Commit 1 ------->|<-------- Fast Commit 2 ------->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *		rm A;    mv B A;    read A
 *		(x)	 (y)	    (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which was
 * actually created as a result of the "mv B A" operation) would get deleted.
 * Thus, file named A would be absent when we try to read A. So, this sequence
 * of operations is not idempotent. However, as mentioned above, instead of
 * storing the procedure fast commits store the outcome of each procedure. Thus
 * the fast commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *	[Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *	    (w)		    (x)			(y)	     (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found).
 *    In order to deal
 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 *    superblock state is persisted before starting the replay, so that after
 *    the crash, fast commit recovery code can look at that flag and perform
 *    fast commit recovery even if that area is invalidated by later full
 *    commits.
 *
 * 1) Fast commit's commit path locks the entire file system during fast
 *    commit. This has significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate) {
                ext4_debug("%s: Block %lld up-to-date",
                           __func__, bh->b_blocknr);
                set_buffer_uptodate(bh);
        } else {
                ext4_debug("%s: Block %lld not up-to-date",
                           __func__, bh->b_blocknr);
                clear_buffer_uptodate(bh);
        }

        unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        ei->i_fc_lblk_start = 0;
        ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        ext4_fc_reset_inode(inode);
        ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
        INIT_LIST_HEAD(&ei->i_fc_list);
        INIT_LIST_HEAD(&ei->i_fc_dilist);
        init_waitqueue_head(&ei->i_fc_wait);
        atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
        wait_queue_head_t *wq;
        struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_state_flags,
                           EXT4_STATE_FC_COMMITTING);
#else
        DEFINE_WAIT_BIT(wait, &ei->i_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_flags,
                           EXT4_STATE_FC_COMMITTING);
#endif
        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
        return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
                (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Inform Ext4's fast commit machinery about the start of an inode update.
 *
 * This function is called by the high level VFS callbacks before
 * performing any inode update. This function blocks if there's an ongoing
 * fast commit on the inode in question.
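 *
 * A minimal sketch of the expected calling pattern (not lifted from any
 * specific call site; the "..." stands for the caller's actual update):
 *
 *	ext4_fc_start_update(inode);
 *	... modify the inode ...
 *	ext4_fc_stop_update(inode);
 *
 * Every ext4_fc_start_update() must be paired with an ext4_fc_stop_update()
 * since the update count is a plain atomic counter.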
 */
void ext4_fc_start_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (ext4_fc_disabled(inode->i_sb))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list))
                goto out;

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }
out:
        atomic_inc(&ei->i_fc_updates);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (ext4_fc_disabled(inode->i_sb))
                return;

        if (atomic_dec_and_test(&ei->i_fc_updates))
                wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_fc_dentry_update *fc_dentry;

        if (ext4_fc_disabled(inode->i_sb))
                return;

restart:
        spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
                spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
                return;
        }

        if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
                ext4_fc_wait_committing_inode(inode);
                goto restart;
        }

        if (!list_empty(&ei->i_fc_list))
                list_del_init(&ei->i_fc_list);

        /*
         * Since this inode is getting removed, let's also remove all FC
         * dentry create references, since there is no need to log them anyway.
         */
        if (list_empty(&ei->i_fc_dilist)) {
                spin_unlock(&sbi->s_fc_lock);
                return;
        }

        fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
        WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
        list_del_init(&fc_dentry->fcd_list);
        list_del_init(&fc_dentry->fcd_dilist);

        WARN_ON(!list_empty(&ei->i_fc_dilist));
        spin_unlock(&sbi->s_fc_lock);

        if (fc_dentry->fcd_name.name &&
            fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
                kfree(fc_dentry->fcd_name.name);
        kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);

        return;
}

/*
 * Mark file system as fast commit ineligible, and record latest
 * ineligible transaction tid. This means until the recorded
 * transaction, commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        tid_t tid;

        if (ext4_fc_disabled(sb))
                return;

        ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        if (handle && !IS_ERR(handle))
                tid = handle->h_transaction->t_tid;
        else {
                read_lock(&sbi->s_journal->j_state_lock);
                tid = sbi->s_journal->j_running_transaction ?
                                sbi->s_journal->j_running_transaction->t_tid : 0;
                read_unlock(&sbi->s_journal->j_state_lock);
        }
        spin_lock(&sbi->s_fc_lock);
        if (sbi->s_fc_ineligible_tid < tid)
                sbi->s_fc_ineligible_tid = tid;
        spin_unlock(&sbi->s_fc_lock);
        WARN_ON(reason >= EXT4_FC_REASON_MAX);
        sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function.
 * If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
        handle_t *handle, struct inode *inode,
        int (*__fc_track_fn)(struct inode *, void *, bool),
        void *args, int enqueue)
{
        bool update = false;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        tid_t tid = 0;
        int ret;

        tid = handle->h_transaction->t_tid;
        mutex_lock(&ei->i_fc_lock);
        if (tid == ei->i_sync_tid) {
                update = true;
        } else {
                ext4_fc_reset_inode(inode);
                ei->i_sync_tid = tid;
        }
        ret = __fc_track_fn(inode, args, update);
        mutex_unlock(&ei->i_fc_lock);

        if (!enqueue)
                return ret;

        spin_lock(&sbi->s_fc_lock);
        if (list_empty(&EXT4_I(inode)->i_fc_list))
                list_add_tail(&EXT4_I(inode)->i_fc_list,
                                (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
                                 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
                                &sbi->s_fc_q[FC_Q_STAGING] :
                                &sbi->s_fc_q[FC_Q_MAIN]);
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

struct __track_dentry_update_args {
        struct dentry *dentry;
        int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
        struct ext4_fc_dentry_update *node;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct __track_dentry_update_args *dentry_update =
                (struct __track_dentry_update_args *)arg;
        struct dentry *dentry = dentry_update->dentry;
        struct inode *dir = dentry->d_parent->d_inode;
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        mutex_unlock(&ei->i_fc_lock);

        if (IS_ENCRYPTED(dir)) {
                ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
                                        NULL);
                mutex_lock(&ei->i_fc_lock);
                return -EOPNOTSUPP;
        }

        node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
        if (!node) {
                ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
                mutex_lock(&ei->i_fc_lock);
                return -ENOMEM;
        }

        node->fcd_op = dentry_update->op;
        node->fcd_parent = dir->i_ino;
        node->fcd_ino = inode->i_ino;
        if (dentry->d_name.len > DNAME_INLINE_LEN) {
                node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
                if (!node->fcd_name.name) {
                        kmem_cache_free(ext4_fc_dentry_cachep, node);
                        ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
                        mutex_lock(&ei->i_fc_lock);
                        return -ENOMEM;
                }
                memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
                        dentry->d_name.len);
        } else {
                memcpy(node->fcd_iname, dentry->d_name.name,
                        dentry->d_name.len);
                node->fcd_name.name = node->fcd_iname;
        }
        node->fcd_name.len = dentry->d_name.len;
        INIT_LIST_HEAD(&node->fcd_dilist);
        spin_lock(&sbi->s_fc_lock);
        if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
            sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
                list_add_tail(&node->fcd_list,
                                &sbi->s_fc_dentry_q[FC_Q_STAGING]);
        else
                list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);

        /*
         * This helps us keep track of all fc_dentry updates which are part of
         * this ext4 inode. So in case the inode is getting unlinked, before
         * we even get a chance to fsync, we can remove all fc_dentry
         * references while evicting the inode in ext4_fc_del().
         * Also with this, we don't need to loop over all the inodes in
         * sbi->s_fc_q to get the corresponding inode in
         * ext4_fc_commit_dentry_updates().
         */
        if (dentry_update->op == EXT4_FC_TAG_CREAT) {
                WARN_ON(!list_empty(&ei->i_fc_dilist));
                list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
        }
        spin_unlock(&sbi->s_fc_lock);
        mutex_lock(&ei->i_fc_lock);

        return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
                struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_UNLINK;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        if (ext4_fc_disabled(inode->i_sb))
                return;

        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

        __ext4_fc_track_unlink(handle, inode, dentry);
}

void __ext4_fc_track_link(handle_t *handle,
                struct inode *inode, struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_LINK;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        if (ext4_fc_disabled(inode->i_sb))
                return;

        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

        __ext4_fc_track_link(handle, inode, dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                          struct dentry *dentry)
{
        struct __track_dentry_update_args args;
        int ret;

        args.dentry = dentry;
        args.op = EXT4_FC_TAG_CREAT;

        ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
                                        (void *)&args, 0);
        trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        if (ext4_fc_disabled(inode->i_sb))
                return;

        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

        __ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
        if (update)
                return -EEXIST;

        EXT4_I(inode)->i_fc_lblk_len = 0;

        return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        if (ext4_fc_disabled(inode->i_sb))
                return;

        if (ext4_should_journal_data(inode)) {
                ext4_fc_mark_ineligible(inode->i_sb,
                                        EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
                return;
        }

        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

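        /*
         * Enqueue the inode on the commit queue; __track_inode() returns
         * -EEXIST (harmless, only reported via the tracepoint) if the inode
         * is already being tracked in this transaction.
         */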
        ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
        trace_ext4_fc_track_inode(handle, inode, ret);
}

struct __track_range_args {
        ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t oldstart;
        struct __track_range_args *__arg =
                (struct __track_range_args *)arg;

        if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
                ext4_debug("Special inode %ld being modified\n", inode->i_ino);
                return -ECANCELED;
        }

        oldstart = ei->i_fc_lblk_start;

        if (update && ei->i_fc_lblk_len > 0) {
                ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
                ei->i_fc_lblk_len =
                        max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
                                ei->i_fc_lblk_start + 1;
        } else {
                ei->i_fc_lblk_start = __arg->start;
                ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
        }

        return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end)
{
        struct __track_range_args args;
        int ret;

        if (S_ISDIR(inode->i_mode))
                return;

        if (ext4_fc_disabled(inode->i_sb))
                return;

        if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
                return;

        args.start = start;
        args.end = end;

        ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

        trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
        blk_opf_t write_flags = REQ_SYNC;
        struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

        /* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
        if (test_opt(sb, BARRIER) && is_tail)
                write_flags |= REQ_FUA | REQ_PREFLUSH;
        lock_buffer(bh);
        set_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = ext4_end_buffer_io_sync;
        submit_bh(REQ_OP_WRITE | write_flags, bh);
        EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is obtained from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
        struct ext4_fc_tl tl;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        int bsize = sbi->s_journal->j_blocksize;
        int ret, off = sbi->s_fc_bytes % bsize;
        int remaining;
        u8 *dst;

        /*
         * If 'len' is too long to fit in any block alongside a PAD tlv, then we
         * cannot fulfill the request.
         */
        if (len > bsize - EXT4_FC_TAG_BASE_LEN)
                return NULL;

        if (!sbi->s_fc_bh) {
                ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
                if (ret)
                        return NULL;
                sbi->s_fc_bh = bh;
        }
        dst = sbi->s_fc_bh->b_data + off;

        /*
         * Allocate the bytes in the current block if we can do so while still
         * leaving enough space for a PAD tlv.
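         *
         * For example (illustrative numbers only, assuming a 4096-byte
         * journal block and the 4-byte TLV header): with off == 4000,
         * 'remaining' below is 4096 - 4 - 4000 = 92, so a request of up to
         * 92 bytes is carved out of this block, while a larger one (up to
         * the per-block maximum checked above) pads out those 92 bytes and
         * moves on to a fresh block.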
         */
        remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
        if (len <= remaining) {
                sbi->s_fc_bytes += len;
                return dst;
        }

        /*
         * Else, terminate the current block with a PAD tlv, then allocate a new
         * block and allocate the bytes at the start of that new block.
         */

        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
        tl.fc_len = cpu_to_le16(remaining);
        memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
        memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
        *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);

        ext4_fc_submit_bh(sb, false);

        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
        if (ret)
                return NULL;
        sbi->s_fc_bh = bh;
        sbi->s_fc_bytes += bsize - off + len;
        return sbi->s_fc_bh->b_data;
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length covers the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_tl tl;
        struct ext4_fc_tail tail;
        int off, bsize = sbi->s_journal->j_blocksize;
        u8 *dst;

        /*
         * ext4_fc_reserve_space takes care of allocating an extra block if
         * there's not enough space on this block to accommodate the tail.
         */
        dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
        if (!dst)
                return -ENOSPC;

        off = sbi->s_fc_bytes % bsize;

        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
        tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

        memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
        dst += EXT4_FC_TAG_BASE_LEN;
        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
        memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
        dst += sizeof(tail.fc_tid);
        crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
                          dst - (u8 *)sbi->s_fc_bh->b_data);
        tail.fc_crc = cpu_to_le32(crc);
        memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
        dst += sizeof(tail.fc_crc);
        memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */

        ext4_fc_submit_bh(sb, true);

        return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if the tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
                            u32 *crc)
{
        struct ext4_fc_tl tl;
        u8 *dst;

        dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
        if (!dst)
                return false;

        tl.fc_tag = cpu_to_le16(tag);
        tl.fc_len = cpu_to_le16(len);

        memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
        memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);

        return true;
}

/*
 * Same as above, but adds a dentry tlv.
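 *
 * The value laid out here is a struct ext4_fc_dentry_info followed by the
 * name bytes; roughly (an illustrative sketch mirroring fast_commit.h):
 *
 *	struct ext4_fc_dentry_info {
 *		__le32 fc_parent_ino;	(parent directory's inode number)
 *		__le32 fc_ino;		(inode number the dentry refers to)
 *		__u8   fc_dname[];	(dentry name, fc_len - 8 bytes)
 *	};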
 */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
                                   struct ext4_fc_dentry_update *fc_dentry)
{
        struct ext4_fc_dentry_info fcd;
        struct ext4_fc_tl tl;
        int dlen = fc_dentry->fcd_name.len;
        u8 *dst = ext4_fc_reserve_space(sb,
                        EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);

        if (!dst)
                return false;

        fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
        fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
        tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
        tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
        memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
        dst += EXT4_FC_TAG_BASE_LEN;
        memcpy(dst, &fcd, sizeof(fcd));
        dst += sizeof(fcd);
        memcpy(dst, fc_dentry->fcd_name.name, dlen);

        return true;
}

/*
 * Writes the inode in the fast commit space under the inode TLV
 * (EXT4_FC_TAG_INODE). Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
        int ret;
        struct ext4_iloc iloc;
        struct ext4_fc_inode fc_inode;
        struct ext4_fc_tl tl;
        u8 *dst;

        ret = ext4_get_inode_loc(inode, &iloc);
        if (ret)
                return ret;

        if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                inode_len = EXT4_INODE_SIZE(inode->i_sb);
        else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
                inode_len += ei->i_extra_isize;

        fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
        tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

        ret = -ECANCELED;
        dst = ext4_fc_reserve_space(inode->i_sb,
                EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
        if (!dst)
                goto err;

        memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
        dst += EXT4_FC_TAG_BASE_LEN;
        memcpy(dst, &fc_inode, sizeof(fc_inode));
        dst += sizeof(fc_inode);
        memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
        ret = 0;
err:
        brelse(iloc.bh);
        return ret;
}

/*
 * Writes updated data ranges for the inode in question. Updates the CRC.
 * Returns 0 on success, error otherwise.
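 *
 * Illustrative example (hypothetical block numbers): if blocks [100, 107]
 * were tracked for the inode and 100-103 are currently mapped while
 * 104-107 are a hole, the loop below emits one EXT4_FC_TAG_ADD_RANGE tlv
 * for the 100-103 extent followed by one EXT4_FC_TAG_DEL_RANGE tlv for
 * 104-107, as discovered via ext4_map_blocks().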
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
        ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_map_blocks map;
        struct ext4_fc_add_range fc_ext;
        struct ext4_fc_del_range lrange;
        struct ext4_extent *ex;
        int ret;

        mutex_lock(&ei->i_fc_lock);
        if (ei->i_fc_lblk_len == 0) {
                mutex_unlock(&ei->i_fc_lock);
                return 0;
        }
        old_blk_size = ei->i_fc_lblk_start;
        new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
        ei->i_fc_lblk_len = 0;
        mutex_unlock(&ei->i_fc_lock);

        cur_lblk_off = old_blk_size;
        ext4_debug("will try writing %d to %d for inode %ld\n",
                   cur_lblk_off, new_blk_size, inode->i_ino);

        while (cur_lblk_off <= new_blk_size) {
                map.m_lblk = cur_lblk_off;
                map.m_len = new_blk_size - cur_lblk_off + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        return -ECANCELED;

                if (map.m_len == 0) {
                        cur_lblk_off++;
                        continue;
                }

                if (ret == 0) {
                        lrange.fc_ino = cpu_to_le32(inode->i_ino);
                        lrange.fc_lblk = cpu_to_le32(map.m_lblk);
                        lrange.fc_len = cpu_to_le32(map.m_len);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
                                            sizeof(lrange), (u8 *)&lrange, crc))
                                return -ENOSPC;
                } else {
                        unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
                                EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

                        /* Limit the number of blocks in one extent */
                        map.m_len = min(max, map.m_len);

                        fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
                        ex = (struct ext4_extent *)&fc_ext.fc_ex;
                        ex->ee_block = cpu_to_le32(map.m_lblk);
                        ex->ee_len = cpu_to_le16(map.m_len);
                        ext4_ext_store_pblock(ex, map.m_pblk);
                        if (map.m_flags & EXT4_MAP_UNWRITTEN)
                                ext4_ext_mark_unwritten(ex);
                        else
                                ext4_ext_mark_initialized(ex);
                        if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
                                            sizeof(fc_ext), (u8 *)&fc_ext, crc))
                                return -ENOSPC;
                }

                cur_lblk_off += map.m_len;
        }

        return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
                while (atomic_read(&ei->i_fc_updates)) {
                        DEFINE_WAIT(wait);

                        prepare_to_wait(&ei->i_fc_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&ei->i_fc_updates)) {
                                spin_unlock(&sbi->s_fc_lock);
                                schedule();
                                spin_lock(&sbi->s_fc_lock);
                        }
                        finish_wait(&ei->i_fc_wait, &wait);
                }
                spin_unlock(&sbi->s_fc_lock);
                ret = jbd2_submit_inode_data(journal, ei->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *pos, *n;
        int ret = 0;

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                if (!ext4_test_inode_state(&pos->vfs_inode,
                                           EXT4_STATE_FC_COMMITTING))
                        continue;
                spin_unlock(&sbi->s_fc_lock);

                ret = jbd2_wait_inode_data(journal, pos->jinode);
                if (ret)
                        return ret;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
        struct inode *inode;
        struct ext4_inode_info *ei;
        int ret;

        if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
                return 0;
        list_for_each_entry_safe(fc_dentry, fc_dentry_n,
                                 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
                        spin_unlock(&sbi->s_fc_lock);
                        if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                                ret = -ENOSPC;
                                goto lock_and_exit;
                        }
                        spin_lock(&sbi->s_fc_lock);
                        continue;
                }
                /*
                 * With fcd_dilist we need not loop over sbi->s_fc_q to get the
                 * corresponding inode pointer
                 */
                WARN_ON(list_empty(&fc_dentry->fcd_dilist));
                ei = list_first_entry(&fc_dentry->fcd_dilist,
                                struct ext4_inode_info, i_fc_dilist);
                inode = &ei->vfs_inode;
                WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

                spin_unlock(&sbi->s_fc_lock);

                /*
                 * We first write the inode and then the create dirent. This
                 * allows the recovery code to create an unnamed inode first
                 * and then link it to a directory entry. This allows us
                 * to use namei.c routines almost as is and simplifies
                 * the recovery code.
                 */
                ret = ext4_fc_write_inode(inode, crc);
                if (ret)
                        goto lock_and_exit;

                ret = ext4_fc_write_inode_data(inode, crc);
                if (ret)
                        goto lock_and_exit;

                if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
                        ret = -ENOSPC;
                        goto lock_and_exit;
                }

                spin_lock(&sbi->s_fc_lock);
        }
        return 0;
lock_and_exit:
        spin_lock(&sbi->s_fc_lock);
        return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter;
        struct ext4_fc_head head;
        struct inode *inode;
        struct blk_plug plug;
        int ret = 0;
        u32 crc = 0;

        ret = ext4_fc_submit_inode_data_all(journal);
        if (ret)
                return ret;

        ret = ext4_fc_wait_inode_data_all(journal);
        if (ret)
                return ret;

        /*
         * If the file system device is different from the journal device,
         * issue a cache flush before we start writing fast commit blocks.
         */
        if (journal->j_fs_dev != journal->j_dev)
                blkdev_issue_flush(journal->j_fs_dev);

        blk_start_plug(&plug);
        if (sbi->s_fc_bytes == 0) {
                /*
                 * Add a head tag only if this is the first fast commit
                 * in this TID.
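                 *
                 * The head's value carries just the supported features mask
                 * and the running transaction's TID, both __le32, assigned
                 * right below.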
                 */
                head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
                head.fc_tid = cpu_to_le32(
                        sbi->s_journal->j_running_transaction->t_tid);
                if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
                        (u8 *)&head, &crc)) {
                        ret = -ENOSPC;
                        goto out;
                }
        }

        spin_lock(&sbi->s_fc_lock);
        ret = ext4_fc_commit_dentry_updates(journal, &crc);
        if (ret) {
                spin_unlock(&sbi->s_fc_lock);
                goto out;
        }

        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
                inode = &iter->vfs_inode;
                if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
                        continue;

                spin_unlock(&sbi->s_fc_lock);
                ret = ext4_fc_write_inode_data(inode, &crc);
                if (ret)
                        goto out;
                ret = ext4_fc_write_inode(inode, &crc);
                if (ret)
                        goto out;
                spin_lock(&sbi->s_fc_lock);
        }
        spin_unlock(&sbi->s_fc_lock);

        ret = ext4_fc_write_tail(sb, crc);

out:
        blk_finish_plug(&plug);
        return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
                                 u64 commit_time, int nblks, tid_t commit_tid)
{
        struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

        ext4_debug("Fast commit ended with status = %d for tid %u",
                        status, commit_tid);
        if (status == EXT4_FC_STATUS_OK) {
                stats->fc_num_commits++;
                stats->fc_numblks += nblks;
                if (likely(stats->s_fc_avg_commit_time))
                        stats->s_fc_avg_commit_time =
                                (commit_time +
                                 stats->s_fc_avg_commit_time * 3) / 4;
                else
                        stats->s_fc_avg_commit_time = commit_time;
        } else if (status == EXT4_FC_STATUS_FAILED ||
                   status == EXT4_FC_STATUS_INELIGIBLE) {
                if (status == EXT4_FC_STATUS_FAILED)
                        stats->fc_failed_commits++;
                stats->fc_ineligible_commits++;
        } else {
                stats->fc_skipped_commits++;
        }
        trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        int nblks = 0, ret, bsize = journal->j_blocksize;
        int subtid = atomic_read(&sbi->s_fc_subtid);
        int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
        ktime_t start_time, commit_time;

        if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
                return jbd2_complete_transaction(journal, commit_tid);

        trace_ext4_fc_commit_start(sb, commit_tid);

        start_time = ktime_get();

restart_fc:
        ret = jbd2_fc_begin_commit(journal, commit_tid);
        if (ret == -EALREADY) {
                /* There was an ongoing commit, check if we need to restart */
                if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
                    commit_tid > journal->j_commit_sequence)
                        goto restart_fc;
                ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
                                commit_tid);
                return 0;
        } else if (ret) {
                /*
                 * Commit couldn't start. Just update stats and perform a
                 * full commit.
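                 * (jbd2_complete_transaction() below starts and/or waits for
                 * the regular jbd2 commit of commit_tid.)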
                 */
                ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
                                commit_tid);
                return jbd2_complete_transaction(journal, commit_tid);
        }

        /*
         * After establishing journal barrier via jbd2_fc_begin_commit(), check
         * if we are fast commit ineligible.
         */
        if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
                status = EXT4_FC_STATUS_INELIGIBLE;
                goto fallback;
        }

        fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
        ret = ext4_fc_perform_commit(journal);
        if (ret < 0) {
                status = EXT4_FC_STATUS_FAILED;
                goto fallback;
        }
        nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
        ret = jbd2_fc_wait_bufs(journal, nblks);
        if (ret < 0) {
                status = EXT4_FC_STATUS_FAILED;
                goto fallback;
        }
        atomic_inc(&sbi->s_fc_subtid);
        ret = jbd2_fc_end_commit(journal);
        /*
         * weight the commit time higher than the average time so we
         * don't react too strongly to vast changes in the commit time
         */
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
        return ret;

fallback:
        ret = jbd2_fc_end_commit_fallback(journal);
        ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
        return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
        struct super_block *sb = journal->j_private;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *iter, *iter_n;
        struct ext4_fc_dentry_update *fc_dentry;

        if (full && sbi->s_fc_bh)
                sbi->s_fc_bh = NULL;

        trace_ext4_fc_cleanup(journal, full, tid);
        jbd2_fc_release_bufs(journal);

        spin_lock(&sbi->s_fc_lock);
        list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
                                 i_fc_list) {
                list_del_init(&iter->i_fc_list);
                ext4_clear_inode_state(&iter->vfs_inode,
                                       EXT4_STATE_FC_COMMITTING);
                if (iter->i_sync_tid <= tid)
                        ext4_fc_reset_inode(&iter->vfs_inode);
                /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
                smp_mb();
#if (BITS_PER_LONG < 64)
                wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
                wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
        }

        while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
                fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
                                             struct ext4_fc_dentry_update,
                                             fcd_list);
                list_del_init(&fc_dentry->fcd_list);
                list_del_init(&fc_dentry->fcd_dilist);
                spin_unlock(&sbi->s_fc_lock);

                if (fc_dentry->fcd_name.name &&
                        fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
                        kfree(fc_dentry->fcd_name.name);
                kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
                spin_lock(&sbi->s_fc_lock);
        }

        list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
                                &sbi->s_fc_dentry_q[FC_Q_MAIN]);
        list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
                                &sbi->s_fc_q[FC_Q_MAIN]);

        if (tid >= sbi->s_fc_ineligible_tid) {
                sbi->s_fc_ineligible_tid = 0;
                ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
        }

        if (full)
                sbi->s_fc_bytes = 0;
        spin_unlock(&sbi->s_fc_lock);
        trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
        int parent_ino, dname_len, ino, inode_len;
        char *dname;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
                              struct ext4_fc_tl *tl, u8 *val)
{
        struct ext4_fc_dentry_info fcd;

        memcpy(&fcd, val, sizeof(fcd));

        darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
        darg->ino = le32_to_cpu(fcd.fc_ino);
        darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
        darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
}

static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val)
{
        memcpy(tl, val, EXT4_FC_TAG_BASE_LEN);
        tl->fc_len = le16_to_cpu(tl->fc_len);
        tl->fc_tag = le16_to_cpu(tl->fc_tag);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
                                 u8 *val)
{
        struct inode *inode, *old_parent;
        struct qstr entry;
        struct dentry_info_args darg;
        int ret = 0;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        entry.name = darg.dname;
        entry.len = darg.dname_len;
        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

        if (IS_ERR(inode)) {
                ext4_debug("Inode %d not found", darg.ino);
                return 0;
        }

        old_parent = ext4_iget(sb, darg.parent_ino,
                               EXT4_IGET_NORMAL);
        if (IS_ERR(old_parent)) {
                ext4_debug("Dir with inode %d not found", darg.parent_ino);
                iput(inode);
                return 0;
        }

        ret = __ext4_unlink(old_parent, &entry, inode, NULL);
        /* -ENOENT is ok because the dirent might not exist anymore. */
        if (ret == -ENOENT)
                ret = 0;
        iput(old_parent);
        iput(inode);
        return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
                                struct dentry_info_args *darg,
                                struct inode *inode)
{
        struct inode *dir = NULL;
        struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
        struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
        int ret = 0;

        dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
        if (IS_ERR(dir)) {
                ext4_debug("Dir with inode %d not found.", darg->parent_ino);
                dir = NULL;
                goto out;
        }

        dentry_dir = d_obtain_alias(dir);
        if (IS_ERR(dentry_dir)) {
                ext4_debug("Failed to obtain dentry");
                dentry_dir = NULL;
                goto out;
        }

        dentry_inode = d_alloc(dentry_dir, &qstr_dname);
        if (!dentry_inode) {
                ext4_debug("Inode dentry not created.");
                ret = -ENOMEM;
                goto out;
        }

        ret = __ext4_link(dir, inode, dentry_inode);
        /*
         * It's possible that the link already existed since data blocks
         * for the dir in question got persisted before we crashed OR
         * we replayed this tag and crashed before the entire replay
         * could complete.
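         * Either way, a pre-existing link is exactly the state this tag
         * enforces, so -EEXIST is treated as success below.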
         */
        if (ret && ret != -EEXIST) {
                ext4_debug("Failed to link\n");
                goto out;
        }

        ret = 0;
out:
        if (dentry_dir) {
                d_drop(dentry_dir);
                dput(dentry_dir);
        } else if (dir) {
                iput(dir);
        }
        if (dentry_inode) {
                d_drop(dentry_inode);
                dput(dentry_inode);
        }

        return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
                               u8 *val)
{
        struct inode *inode;
        struct dentry_info_args darg;
        int ret = 0;

        tl_to_darg(&darg, tl, val);
        trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
                        darg.parent_ino, darg.dname_len);

        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode not found.");
                return 0;
        }

        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
        iput(inode);
        return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to set up
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
        struct ext4_fc_replay_state *state;
        int i;

        state = &EXT4_SB(sb)->s_fc_replay_state;
        for (i = 0; i < state->fc_modified_inodes_used; i++)
                if (state->fc_modified_inodes[i] == ino)
                        return 0;
        if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
                int *fc_modified_inodes;

                fc_modified_inodes = krealloc(state->fc_modified_inodes,
                                sizeof(int) * (state->fc_modified_inodes_size +
                                EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                GFP_KERNEL);
                if (!fc_modified_inodes)
                        return -ENOMEM;
                state->fc_modified_inodes = fc_modified_inodes;
                state->fc_modified_inodes_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
        }
        state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
        return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
                                u8 *val)
{
        struct ext4_fc_inode fc_inode;
        struct ext4_inode *raw_inode;
        struct ext4_inode *raw_fc_inode;
        struct inode *inode = NULL;
        struct ext4_iloc iloc;
        int inode_len, ino, ret, tag = tl->fc_tag;
        struct ext4_extent_header *eh;
        size_t off_gen = offsetof(struct ext4_inode, i_generation);

        memcpy(&fc_inode, val, sizeof(fc_inode));

        ino = le32_to_cpu(fc_inode.fc_ino);
        trace_ext4_fc_replay(sb, tag, ino, 0, 0);

        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (!IS_ERR(inode)) {
                ext4_ext_clear_bb(inode);
                iput(inode);
        }
        inode = NULL;

        ret = ext4_fc_record_modified_inode(sb, ino);
        if (ret)
                goto out;

        raw_fc_inode = (struct ext4_inode *)
                (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
        ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
        if (ret)
                goto out;

        inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
        raw_inode = ext4_raw_inode(&iloc);

        memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
        memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
               inode_len - off_gen);
        if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
                eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
                if (eh->eh_magic != EXT4_EXT_MAGIC) {
                        memset(eh, 0, sizeof(*eh));
                        eh->eh_magic = EXT4_EXT_MAGIC;
                        eh->eh_max = cpu_to_le16(
                                (sizeof(raw_inode->i_block) -
                                 sizeof(struct ext4_extent_header))
                                 / sizeof(struct ext4_extent));
                }
        } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
                memcpy(raw_inode->i_block, raw_fc_inode->i_block,
                        sizeof(raw_inode->i_block));
        }

        /* Immediately update the inode on disk. */
        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
        if (ret)
                goto out;
        ret = sync_dirty_buffer(iloc.bh);
        if (ret)
                goto out;
        ret = ext4_mark_inode_used(sb, ino);
        if (ret)
                goto out;

        /* Given that we just wrote the inode on disk, this SHOULD succeed. */
        inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode not found.");
                return -EFSCORRUPTED;
        }

        /*
         * Our allocator could have made different decisions than before
         * crashing. This should be fixed but until then, we calculate
         * the number of blocks of the inode.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                ext4_ext_replay_set_iblocks(inode);

        inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
        ext4_reset_inode_seed(inode);

        ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
        ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
        sync_dirty_buffer(iloc.bh);
        brelse(iloc.bh);
out:
        iput(inode);
        if (!ret)
                blkdev_issue_flush(sb->s_bdev);

        return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
 * inode for which we are trying to create a dentry here, should already have
 * been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
                                 u8 *val)
{
        int ret = 0;
        struct inode *inode = NULL;
        struct inode *dir = NULL;
        struct dentry_info_args darg;

        tl_to_darg(&darg, tl, val);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
                        darg.parent_ino, darg.dname_len);

        /* This takes care of updating the group descriptor and other metadata */
        ret = ext4_mark_inode_used(sb, darg.ino);
        if (ret)
                goto out;

        inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("inode %d not found.", darg.ino);
                inode = NULL;
                ret = -EINVAL;
                goto out;
        }

        if (S_ISDIR(inode->i_mode)) {
                /*
                 * If we are creating a directory, we need to make sure that the
                 * dot and dot dot dirents are set up properly.
                 */
                dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
                if (IS_ERR(dir)) {
                        ext4_debug("Dir %d not found.", darg.ino);
                        goto out;
                }
                ret = ext4_init_new_dir(NULL, dir, inode);
                iput(dir);
                if (ret) {
                        ret = 0;
                        goto out;
                }
        }
        ret = ext4_fc_replay_link_internal(sb, &darg, inode);
        if (ret)
                goto out;
        set_nlink(inode, 1);
        ext4_mark_inode_dirty(NULL, inode);
out:
        iput(inode);
        return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
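 *
 * (Each recorded region is simply the tuple stored below: owning inode
 * number, logical start block, physical start block and length; see
 * struct ext4_fc_alloc_region.)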
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
                ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
        struct ext4_fc_replay_state *state;
        struct ext4_fc_alloc_region *region;

        state = &EXT4_SB(sb)->s_fc_replay_state;
        /*
         * During the replay phase, fc_regions_valid may not be the same as
         * fc_regions_used; bring them back in sync when recording new
         * additions.
         */
        if (replay && state->fc_regions_used != state->fc_regions_valid)
                state->fc_regions_used = state->fc_regions_valid;
        if (state->fc_regions_used == state->fc_regions_size) {
                struct ext4_fc_alloc_region *fc_regions;

                fc_regions = krealloc(state->fc_regions,
                                      sizeof(struct ext4_fc_alloc_region) *
                                      (state->fc_regions_size +
                                       EXT4_FC_REPLAY_REALLOC_INCREMENT),
                                      GFP_KERNEL);
                if (!fc_regions)
                        return -ENOMEM;
                state->fc_regions_size +=
                        EXT4_FC_REPLAY_REALLOC_INCREMENT;
                state->fc_regions = fc_regions;
        }
        region = &state->fc_regions[state->fc_regions_used++];
        region->ino = ino;
        region->lblk = lblk;
        region->pblk = pblk;
        region->len = len;

        if (replay)
                state->fc_regions_valid++;

        return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
                                    struct ext4_fc_tl *tl, u8 *val)
{
        struct ext4_fc_add_range fc_add_ex;
        struct ext4_extent newex, *ex;
        struct inode *inode;
        ext4_lblk_t start, cur;
        int remaining, len;
        ext4_fsblk_t start_pblk;
        struct ext4_map_blocks map;
        struct ext4_ext_path *path = NULL;
        int ret;

        memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
        ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
                le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
                ext4_ext_get_actual_len(ex));

        inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode not found.");
                return 0;
        }

        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret)
                goto out;

        start = le32_to_cpu(ex->ee_block);
        start_pblk = ext4_ext_pblock(ex);
        len = ext4_ext_get_actual_len(ex);

        cur = start;
        remaining = len;
        ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
                   inode->i_ino);

        while (remaining > 0) {
                map.m_lblk = cur;
                map.m_len = remaining;
                map.m_pblk = 0;
                ret = ext4_map_blocks(NULL, inode, &map, 0);

                if (ret < 0)
                        goto out;

                if (ret == 0) {
                        /* Range is not mapped */
                        path = ext4_find_extent(inode, cur, NULL, 0);
                        if (IS_ERR(path))
                                goto out;
                        memset(&newex, 0, sizeof(newex));
                        newex.ee_block = cpu_to_le32(cur);
                        ext4_ext_store_pblock(
                                &newex, start_pblk + cur - start);
                        newex.ee_len = cpu_to_le16(map.m_len);
                        if (ext4_ext_is_unwritten(ex))
                                ext4_ext_mark_unwritten(&newex);
                        down_write(&EXT4_I(inode)->i_data_sem);
                        ret = ext4_ext_insert_extent(
                                NULL, inode, &path, &newex, 0);
                        up_write((&EXT4_I(inode)->i_data_sem));
                        ext4_free_ext_path(path);
                        if (ret)
                                goto out;
                        goto next;
                }

                if (start_pblk + cur - start != map.m_pblk) {
                        /*
                         * Logical to physical mapping changed.
                         * This can happen
                         * if this range was removed and then reallocated to
                         * map to new physical blocks during a fast commit.
                         */
                        ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
                                        ext4_ext_is_unwritten(ex),
                                        start_pblk + cur - start);
                        if (ret)
                                goto out;
                        /*
                         * Mark the old blocks as free since they aren't used
                         * anymore. We maintain an array of all the modified
                         * inodes. In case these blocks are still used at either
                         * a different logical range in the same inode or in
                         * some different inode, we will mark them as allocated
                         * at the end of the FC replay using our array of
                         * modified inodes.
                         */
                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
                        goto next;
                }

                /* Range is mapped and needs a state change */
                ext4_debug("Converting from %ld to %d %lld",
                        map.m_flags & EXT4_MAP_UNWRITTEN,
                        ext4_ext_is_unwritten(ex), map.m_pblk);
                ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
                                        ext4_ext_is_unwritten(ex), map.m_pblk);
                if (ret)
                        goto out;
                /*
                 * We may have split the extent tree while toggling the state.
                 * Try to shrink the extent tree now.
                 */
                ext4_ext_replay_shrink_inode(inode, start + len);
next:
                cur += map.m_len;
                remaining -= map.m_len;
        }
        ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
                                     sb->s_blocksize_bits);
out:
        iput(inode);
        return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
                         u8 *val)
{
        struct inode *inode;
        struct ext4_fc_del_range lrange;
        struct ext4_map_blocks map;
        ext4_lblk_t cur, remaining;
        int ret;

        memcpy(&lrange, val, sizeof(lrange));
        cur = le32_to_cpu(lrange.fc_lblk);
        remaining = le32_to_cpu(lrange.fc_len);

        trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
                le32_to_cpu(lrange.fc_ino), cur, remaining);

        inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
        if (IS_ERR(inode)) {
                ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
                return 0;
        }

        ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
        if (ret)
                goto out;

        ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
                        inode->i_ino, le32_to_cpu(lrange.fc_lblk),
                        le32_to_cpu(lrange.fc_len));
        while (remaining > 0) {
                map.m_lblk = cur;
                map.m_len = remaining;

                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret < 0)
                        goto out;
                if (ret > 0) {
                        remaining -= ret;
                        cur += ret;
                        ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
                } else {
                        remaining -= map.m_len;
                        cur += map.m_len;
                }
        }

        down_write(&EXT4_I(inode)->i_data_sem);
        ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
                                le32_to_cpu(lrange.fc_lblk) +
                                le32_to_cpu(lrange.fc_len) - 1);
        up_write(&EXT4_I(inode)->i_data_sem);
        if (ret)
                goto out;
        ext4_ext_replay_shrink_inode(inode,
                i_size_read(inode) >> sb->s_blocksize_bits);
        ext4_mark_inode_dirty(NULL, inode);
out:
        iput(inode);
        return 0;
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
        struct ext4_fc_replay_state *state;
        struct inode *inode;
        struct ext4_ext_path *path = NULL;
        struct ext4_map_blocks map;
        int i, ret, j;
        ext4_lblk_t cur, end;

        state = &EXT4_SB(sb)->s_fc_replay_state;
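        /*
         * Walk every inode recorded as modified during replay and re-mark
         * its data blocks, and the blocks of its extent tree, as in use.
         */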
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
				  EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			ext4_debug("Inode %d not found.",
				   state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_free_ext_path(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to check whether it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
			state->fc_regions[i].len == 0)
			continue;
		if (in_range(blk, state->fc_regions[i].pblk,
			     state->fc_regions[i].len))
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
				      int tag, int len)
{
	switch (tag) {
	case EXT4_FC_TAG_ADD_RANGE:
		return len == sizeof(struct ext4_fc_add_range);
	case EXT4_FC_TAG_DEL_RANGE:
		return len == sizeof(struct ext4_fc_del_range);
	case EXT4_FC_TAG_CREAT:
	case EXT4_FC_TAG_LINK:
	case EXT4_FC_TAG_UNLINK:
		len -= sizeof(struct ext4_fc_dentry_info);
		return len >= 1 && len <= EXT4_NAME_LEN;
	case EXT4_FC_TAG_INODE:
		len -= sizeof(struct ext4_fc_inode);
		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
			len <= sbi->s_inode_size;
	case EXT4_FC_TAG_PAD:
		return true; /* padding can have any length */
	case EXT4_FC_TAG_TAIL:
		return len >= sizeof(struct ext4_fc_tail);
	case EXT4_FC_TAG_HEAD:
		return len == sizeof(struct ext4_fc_head);
	}
	return false;
}
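
/*
 * A fast commit block is a packed sequence of TLV records (a struct
 * ext4_fc_tl followed by tl.fc_len bytes of value):
 *
 *	+--------+--------+-----------------+--------+--------+-----
 *	| fc_tag | fc_len | fc_len value    | fc_tag | fc_len | ...
 *	|        |        | bytes           |        |        |
 *	+--------+--------+-----------------+--------+--------+-----
 *
 * The tag and length fields together take EXT4_FC_TAG_BASE_LEN bytes, so
 * the scan and replay loops below advance their cursor by
 * EXT4_FC_TAG_BASE_LEN + tl.fc_len per record.
 */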
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for the following:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns
 * JBD2_FC_REPLAY_STOP to indicate that the scan has finished and JBD2 can
 * now start the replay phase. It returns a negative error code to indicate
 * that there was an error. At the end of a successful scan phase,
 * sbi->s_fc_replay_state.fc_replay_num_tags is set to the number of tags
 * that need to be replayed during the replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
				struct buffer_head *bh, int off,
				tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;
		if (tl.fc_len > end - val ||
		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
			goto out_err;
		}
		ext4_debug("Scan phase, tag:%s, blk %lld\n",
			   tag2str(tl.fc_tag), bh->b_blocknr);
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			/*
			 * The running CRC is accumulated over the tail
			 * record only up to, but not including, the tail's
			 * own fc_crc field.
			 */
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						EXT4_FC_TAG_BASE_LEN +
						offsetof(struct ext4_fc_tail,
							 fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
			    le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
			    ~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
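
/*
 * jbd2 drives recovery in two passes: every fast commit block is first
 * handed to the scan handler above (PASS_SCAN), and only once scanning is
 * complete are the same blocks handed back for PASS_REPLAY. If a tag
 * fails validation during the scan, the replayable area is truncated at
 * the last valid tail when at least one complete fast commit has been
 * seen (JBD2_FC_REPLAY_STOP); otherwise the scan gives up with an error.
 */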
/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for the scan handler
 * above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
				enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		ext4_debug("Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}

		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
		state->fc_replay_num_tags--;
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     tl.fc_len, 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
					     0, tl.fc_len, 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
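
/*
 * Wire the fast commit callbacks into the journal. jbd2 invokes
 * j_fc_replay_callback for each fast commit block it finds during
 * recovery, and j_fc_cleanup_callback once a commit completes so that
 * the in-memory fast commit state can be reset.
 */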
void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * Set the replay callback even if fast commit is disabled, because
	 * we may still have fast commit blocks that need to be replayed
	 * even though fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char * const fc_ineligible_reasons[] = {
	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
	[EXT4_FC_REASON_RESIZE] = "Resize",
	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			   stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}