// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed; instead, it is derived
 *				  during replay.
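 *
 * As an illustration, a consumer of this log walks it as a simple TLV
 * stream (a minimal sketch only; the real scan loop lives in
 * ext4_fc_replay_scan(), and handle_tag() here is a hypothetical helper):
 *
 *	u8 *cur = start, *end = start + blocksize;
 *
 *	while (cur + EXT4_FC_TAG_BASE_LEN <= end) {
 *		struct ext4_fc_tl tl;
 *
 *		memcpy(&tl, cur, EXT4_FC_TAG_BASE_LEN);
 *		handle_tag(le16_to_cpu(tl.fc_tag), cur + EXT4_FC_TAG_BASE_LEN,
 *			   le16_to_cpu(tl.fc_len));
 *		cur += EXT4_FC_TAG_BASE_LEN + le16_to_cpu(tl.fc_len);
 *	}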
 *
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the
 *     following section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains the CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least 1 valid tail present. For every fast
 * commit operation, there is 1 tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of the above operations would look like
 * this:
 *	[HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *	|<---- Fast Commit 1 ---->|<------ Fast Commit 2 ------>|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code
 * follows certain rules. The guiding principle that the commit path follows
 * while committing is that it stores the result of a particular operation
 * instead of storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file
 * system state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when the recovery code runs, it needs to "enforce" this state on the
 * file system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how
 * fast commits make it idempotent. Consider the following sequence of
 * operations:
 *
 *	rm A;    mv B A;    read A
 *	 (x)      (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is, then the replay is not idempotent. Let's say
 * that while in replay, we crash at (z). During the second replay, file A
 * (which was actually created as a result of the "mv B A" operation) would
 * get deleted. Thus, the file named A would be absent when we try to read A.
 * So, this sequence of operations is not idempotent. However, as mentioned
 * above, instead of storing the procedure, fast commits store the outcome of
 * each procedure. Thus the fast commit log for the above procedure would be
 * as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *	[Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 *	    (w)              (x)                (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the
 * second replay, we will remove file A (inode 11). But we will create it back
 * and make it point to inode 11. We won't find B, so we'll just skip that
 * step. At this point, the refcount for inode 11 is not reliable, but that
 * gets fixed by the replay of the last inode 11 tag. Crashes at points (w),
 * (x) and (y) get handled similarly. Thus, by converting a non-idempotent
 * procedure into a series of idempotent outcomes, fast commits ensure
 * idempotence during the replay.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found).
 *    In order to deal with that, the fast commit replay code should ensure
 *    that the "FC_REPLAY" superblock state is persisted before starting the
 *    replay, so that after the crash, fast commit recovery code can look at
 *    that flag and perform fast commit recovery even if that area is
 *    invalidated by later full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during a fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that, we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
			   EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
			   EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Inform Ext4's fast commit sub-system about the start of an inode update.
 *
 * This function is called by the VFS callbacks before performing any inode
 * update. This function blocks if there's an ongoing fast commit on the inode
 * in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop an inode update and wake up waiting fast commits, if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}
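
/*
 * Illustrative usage of the bracketing above (a sketch of a typical update
 * path, not code from this file): callers wrap inode updates with
 * ext4_fc_start_update()/ext4_fc_stop_update() so that an ongoing fast
 * commit on the inode can drain them before committing:
 *
 *	ext4_fc_start_update(inode);
 *	handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
 *	// ... perform the update, calling the ext4_fc_track_*() helpers ...
 *	ext4_journal_stop(handle);
 *	ext4_fc_stop_update(inode);
 */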

/*
 * Remove the inode from the fast commit list. If the inode is being
 * committed, we wait until the inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}

	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them
	 * anyway.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	fc_dentry = list_first_entry(&ei->i_fc_dilist,
				     struct ext4_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	spin_unlock(&sbi->s_fc_lock);

	if (fc_dentry->fcd_name.name &&
	    fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
		kfree(fc_dentry->fcd_name.name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
}

/*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that until the recorded
 * transaction, a commit operation would result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;

	if (ext4_fc_disabled(sb))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		tid = sbi->s_journal->j_running_transaction ?
				sbi->s_journal->j_running_transaction->t_tid : 0;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_fc_ineligible_tid < tid)
		sbi->s_fc_ineligible_tid = tid;
	spin_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize the fast commit fields and then
 * call __fc_track_fn() with update = 0. If we have already been called after
 * a full commit, we pass update = 1. Based on that, the track function can
 * determine if it needs to track a field for the first time or if it needs
 * to just update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit
 * list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct inode *dir = dentry->d_parent->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	mutex_unlock(&ei->i_fc_lock);

	if (IS_ENCRYPTED(dir)) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
					NULL);
		mutex_lock(&ei->i_fc_lock);
		return -EOPNOTSUPP;
	}

	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dir->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
		       dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
		       dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;
	INIT_LIST_HEAD(&node->fcd_dilist);
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
	    sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
		list_add_tail(&node->fcd_list,
			      &sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);

	/*
	 * This helps us keep track of all the fc_dentry updates that are part
	 * of this ext4 inode. So in case the inode is getting unlinked before
	 * we even get a chance to fsync, we can remove all the fc_dentry
	 * references while evicting the inode in ext4_fc_del().
	 * Also, with this we don't need to loop over all the inodes in
	 * sbi->s_fc_q to get the corresponding inode in
	 * ext4_fc_commit_dentry_updates().
	 */
	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
		WARN_ON(!list_empty(&ei->i_fc_dilist));
		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
	}
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     (void *)&args, 0);
	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_unlink(handle, inode, dentry);
}

void __ext4_fc_track_link(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     (void *)&args, 0);
	trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_link(handle, inode, dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			    struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
				     (void *)&args, 0);
	trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
		return;
	}

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(handle, inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(handle, inode, start, end, ret);
}
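
/*
 * Worked example for the merge arithmetic in __track_range() (illustrative
 * numbers, not from a real trace): if the currently tracked range is
 * i_fc_lblk_start = 100, i_fc_lblk_len = 50 (blocks [100, 149]) and an
 * update arrives for blocks [120, 200], the merged range becomes
 * start = min(100, 120) = 100 and len = max(149, 200) - 100 + 1 = 101,
 * i.e. blocks [100, 200].
 */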

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	blk_opf_t write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE | write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int remaining;
	u8 *dst;

	/*
	 * If 'len' is too long to fit in any block alongside a PAD tlv, then
	 * we cannot fulfill the request.
	 */
	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
		return NULL;

	if (!sbi->s_fc_bh) {
		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
		if (ret)
			return NULL;
		sbi->s_fc_bh = bh;
	}
	dst = sbi->s_fc_bh->b_data + off;

	/*
	 * Allocate the bytes in the current block if we can do so while still
	 * leaving enough space for a PAD tlv.
	 */
	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
	if (len <= remaining) {
		sbi->s_fc_bytes += len;
		return dst;
	}

	/*
	 * Else, terminate the current block with a PAD tlv, then allocate a
	 * new block and allocate the bytes at the start of that new block.
	 */

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	tl.fc_len = cpu_to_le16(remaining);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
	*crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);

	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes += bsize - off + len;
	return sbi->s_fc_bh->b_data;
}
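
/*
 * Worked example for the padding above (illustrative numbers, assuming a
 * 4096-byte journal block and a 4-byte TLV header): with off = 4000, a
 * request for len = 200 does not fit, since remaining = 4096 - 4 - 4000 =
 * 92 < 200. The 92 leftover value bytes are consumed by a PAD tlv
 * {tag = EXT4_FC_TAG_PAD, len = 92}, the block is submitted, the 200 bytes
 * are allocated at the start of a fresh block, and s_fc_bytes advances by
 * (4096 - 4000) + 200 = 296.
 */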

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag
 * has a length that covers the remaining space on the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate the tail.
	 */
	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
	dst += sizeof(tail.fc_tid);
	crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
			  dst - (u8 *)sbi->s_fc_bh->b_data);
	tail.fc_crc = cpu_to_le32(crc);
	memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
	dst += sizeof(tail.fc_crc);
	memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length and value; updates the CRC. Returns true if the tlv was
 * added. Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			    u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);

	return true;
}

/* Same as above, but adds a dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb,
			EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	memcpy(dst, &fcd, sizeof(fcd));
	dst += sizeof(fcd);
	memcpy(dst, fc_dentry->fcd_name.name, dlen);

	return true;
}

/*
 * Writes the inode in the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		inode_len = EXT4_INODE_SIZE(inode->i_sb);
	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	ret = -ECANCELED;
	dst = ext4_fc_reserve_space(inode->i_sb,
			EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		goto err;

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	memcpy(dst, &fc_inode, sizeof(fc_inode));
	dst += sizeof(fc_inode);
	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
	ret = 0;
err:
	brelse(iloc.bh);
	return ret;
}

/*
 * Writes updated data ranges for the inode in question. Updates the CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	ext4_debug("will try writing %d to %d for inode %ld\n",
		   cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					     sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					     sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}


/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(journal, ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}
		/*
		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
		 * corresponding inode pointer.
		 */
		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
		ei = list_first_entry(&fc_dentry->fcd_dilist,
				      struct ext4_inode_info, i_fc_dilist);
		inode = &ei->vfs_inode;
		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks, tid_t commit_tid)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	ext4_debug("Fast commit ended with status = %d for tid %u",
		   status, commit_tid);
	if (status == EXT4_FC_STATUS_OK) {
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
	} else if (status == EXT4_FC_STATUS_FAILED ||
		   status == EXT4_FC_STATUS_INELIGIBLE) {
		if (status == EXT4_FC_STATUS_FAILED)
			stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
	} else {
		stats->fc_skipped_commits++;
	}
	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}
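
/*
 * Worked example for the commit time running average above (illustrative
 * numbers): with s_fc_avg_commit_time = 4000 ns and a new commit_time of
 * 8000 ns, the average becomes (8000 + 4000 * 3) / 4 = 5000 ns; history is
 * weighted 3:1 against the newest sample, so one slow commit does not skew
 * the average.
 */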

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to a full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

	trace_ext4_fc_commit_start(sb, commit_tid);

	start_time = ktime_get();

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
		    commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				     commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				     commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing the journal barrier via jbd2_fc_begin_commit(),
	 * check if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * Weight the average time higher than the new commit time so we
	 * don't react too strongly to vast changes in the commit time.
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;

fallback:
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
	return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (iter->i_sync_tid <= tid)
			ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		list_del_init(&fc_dentry->fcd_dilist);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
		    fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
			 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
			 &sbi->s_fc_q[FC_Q_MAIN]);

	if (tid >= sbi->s_fc_ineligible_tid) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

/* Same as struct ext4_fc_tl, but uses native endianness fields */
struct ext4_fc_tl_mem {
	u16 fc_tag;
	u16 fc_len;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
}

static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_tl tl_disk;

	memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
	tl->fc_len = le16_to_cpu(tl_disk.fc_len);
	tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb,
				 struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
			       EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		ext4_debug("Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(old_parent, &entry, inode, NULL);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
					struct dentry_info_args *darg,
					struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		ext4_debug("Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		ext4_debug("Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since the data blocks
	 * for the dir in question got persisted before we crashed OR we
	 * replayed this tag and crashed before the entire replay could
	 * complete.
	 */
	if (ret && ret != -EEXIST) {
		ext4_debug("Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb,
			       struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			     darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to setup
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb,
				struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = tl->fc_tag;
	struct ext4_extent_header *eh;
	size_t off_gen = offsetof(struct ext4_inode, i_generation);

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
	       inode_len - off_gen);
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				/ sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
		       sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed, but until then, we recalculate
	 * the number of blocks of the inode.
	 */
	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means that
 * the inode for which we are trying to create a dentry here should already
 * have been replayed before we start here.
 */
static int ext4_fc_replay_create(struct super_block *sb,
				 struct ext4_fc_tl_mem *tl, u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			     darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that
		 * the dot and dot dot dirents are setup properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			ext4_debug("Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
			   ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when making new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		fc_regions = krealloc(state->fc_regions,
				      sizeof(struct ext4_fc_alloc_region) *
				      (state->fc_regions_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				      GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = fc_regions;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}

/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
				    struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_add_range fc_add_ex;
	struct ext4_extent newex, *ex;
	struct inode *inode;
	ext4_lblk_t start, cur;
	int remaining, len;
	ext4_fsblk_t start_pblk;
	struct ext4_map_blocks map;
	struct ext4_ext_path *path = NULL;
	int ret;

	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
			     le32_to_cpu(fc_add_ex.fc_ino),
			     le32_to_cpu(ex->ee_block),
			     ext4_ext_get_actual_len(ex));

	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	start = le32_to_cpu(ex->ee_block);
	start_pblk = ext4_ext_pblock(ex);
	len = ext4_ext_get_actual_len(ex);

	cur = start;
	remaining = len;
	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
		   start, start_pblk, len, ext4_ext_is_unwritten(ex),
		   inode->i_ino);

	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;
		map.m_pblk = 0;
		ret = ext4_map_blocks(NULL, inode, &map, 0);

		if (ret < 0)
			goto out;

		if (ret == 0) {
			/* Range is not mapped */
			path = ext4_find_extent(inode, cur, NULL, 0);
			if (IS_ERR(path))
				goto out;
			memset(&newex, 0, sizeof(newex));
			newex.ee_block = cpu_to_le32(cur);
			ext4_ext_store_pblock(
				&newex, start_pblk + cur - start);
			newex.ee_len = cpu_to_le16(map.m_len);
			if (ext4_ext_is_unwritten(ex))
				ext4_ext_mark_unwritten(&newex);
			down_write(&EXT4_I(inode)->i_data_sem);
			ret = ext4_ext_insert_extent(
				NULL, inode, &path, &newex, 0);
			up_write((&EXT4_I(inode)->i_data_sem));
			ext4_free_ext_path(path);
			if (ret)
				goto out;
			goto next;
		}

		if (start_pblk + cur - start != map.m_pblk) {
			/*
			 * Logical to physical mapping changed.
			 * This can happen if this range was removed and then
			 * reallocated to map to new physical blocks during a
			 * fast commit.
			 */
			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
					ext4_ext_is_unwritten(ex),
					start_pblk + cur - start);
			if (ret)
				goto out;
			/*
			 * Mark the old blocks as free since they aren't used
			 * anymore. We maintain an array of all the modified
			 * inodes. In case these blocks are still used at either
			 * a different logical range in the same inode or in
			 * some different inode, we will mark them as allocated
			 * at the end of the FC replay using our array of
			 * modified inodes.
			 */
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
			goto next;
		}

		/* Range is mapped and needs a state change */
		ext4_debug("Converting from %ld to %d %lld",
			   map.m_flags & EXT4_MAP_UNWRITTEN,
			   ext4_ext_is_unwritten(ex), map.m_pblk);
		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
				ext4_ext_is_unwritten(ex), map.m_pblk);
		if (ret)
			goto out;
		/*
		 * We may have split the extent tree while toggling the state.
		 * Try to shrink the extent tree now.
		 */
		ext4_ext_replay_shrink_inode(inode, start + len);
next:
		cur += map.m_len;
		remaining -= map.m_len;
	}
	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
				     sb->s_blocksize_bits);
out:
	iput(inode);
	return 0;
}

/* Replay DEL_RANGE tag */
static int
ext4_fc_replay_del_range(struct super_block *sb,
			 struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode;
	struct ext4_fc_del_range lrange;
	struct ext4_map_blocks map;
	ext4_lblk_t cur, remaining;
	int ret;

	memcpy(&lrange, val, sizeof(lrange));
	cur = le32_to_cpu(lrange.fc_lblk);
	remaining = le32_to_cpu(lrange.fc_len);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
			     le32_to_cpu(lrange.fc_ino), cur, remaining);

	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
		return 0;
	}

	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret)
		goto out;

	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
		   inode->i_ino, le32_to_cpu(lrange.fc_lblk),
		   le32_to_cpu(lrange.fc_len));
	while (remaining > 0) {
		map.m_lblk = cur;
		map.m_len = remaining;

		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			remaining -= ret;
			cur += ret;
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
		} else {
			remaining -= map.m_len;
			cur += map.m_len;
		}
	}

	down_write(&EXT4_I(inode)->i_data_sem);
	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
				    le32_to_cpu(lrange.fc_lblk) +
				    le32_to_cpu(lrange.fc_len) - 1);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (ret)
		goto out;
	ext4_ext_replay_shrink_inode(inode,
				     i_size_read(inode) >> sb->s_blocksize_bits);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return 0;
}

static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;


static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
	struct ext4_fc_replay_state *state;
	struct inode *inode;
	struct ext4_ext_path *path = NULL;
	struct ext4_map_blocks map;
	int i, ret, j;
	ext4_lblk_t cur, end;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++) {
		inode = ext4_iget(sb, state->fc_modified_inodes[i],
				  EXT4_IGET_NORMAL);
		if (IS_ERR(inode)) {
			ext4_debug("Inode %d not found.",
				   state->fc_modified_inodes[i]);
			continue;
		}
		cur = 0;
		end = EXT_MAX_BLOCKS;
		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
			iput(inode);
			continue;
		}
		while (cur < end) {
			map.m_lblk = cur;
			map.m_len = end - cur;

			ret = ext4_map_blocks(NULL, inode, &map, 0);
			if (ret < 0)
				break;

			if (ret > 0) {
				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
				if (!IS_ERR(path)) {
					for (j = 0; j < path->p_depth; j++)
						ext4_mb_mark_bb(inode->i_sb,
							path[j].p_block, 1, 1);
					ext4_free_ext_path(path);
				}
				cur += ret;
				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
						map.m_len, 1);
			} else {
				cur = cur + (map.m_len ? map.m_len : 1);
			}
		}
		iput(inode);
	}
}

/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to see if it is okay to use a block.
 */
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
{
	int i;
	struct ext4_fc_replay_state *state;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_regions_valid; i++) {
		if (state->fc_regions[i].ino == 0 ||
		    state->fc_regions[i].len == 0)
			continue;
		if (in_range(blk, state->fc_regions[i].pblk,
			     state->fc_regions[i].len))
			return true;
	}
	return false;
}

/* Cleanup function called after replay */
void ext4_fc_replay_cleanup(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
	kfree(sbi->s_fc_replay_state.fc_regions);
	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}

static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
				      int tag, int len)
{
	switch (tag) {
	case EXT4_FC_TAG_ADD_RANGE:
		return len == sizeof(struct ext4_fc_add_range);
	case EXT4_FC_TAG_DEL_RANGE:
		return len == sizeof(struct ext4_fc_del_range);
	case EXT4_FC_TAG_CREAT:
	case EXT4_FC_TAG_LINK:
	case EXT4_FC_TAG_UNLINK:
		len -= sizeof(struct ext4_fc_dentry_info);
		return len >= 1 && len <= EXT4_NAME_LEN;
	case EXT4_FC_TAG_INODE:
		len -= sizeof(struct ext4_fc_inode);
		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
			len <= sbi->s_inode_size;
	case EXT4_FC_TAG_PAD:
		return true; /* padding can have any length */
	case EXT4_FC_TAG_TAIL:
		return len >= sizeof(struct ext4_fc_tail);
	case EXT4_FC_TAG_HEAD:
		return len == sizeof(struct ext4_fc_head);
	}
	return false;
}
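
/*
 * For orientation, a minimal sketch of the on-disk tag layout that the
 * length checks above validate (field widths follow struct ext4_fc_tl;
 * the value type shown is just one possibility):
 *
 *	<- EXT4_FC_TAG_BASE_LEN -><-------- fc_len bytes --------->
 *	+------------+------------+--------------------------------+
 *	| fc_tag le16| fc_len le16| value, e.g. ext4_fc_add_range  |
 *	+------------+------------+--------------------------------+
 *
 * fc_len counts only the value, not the tag header, which is why the scan
 * and replay loops below advance by EXT4_FC_TAG_BASE_LEN + tl.fc_len. For
 * the dentry tags (CREAT/LINK/UNLINK) the value is a struct
 * ext4_fc_dentry_info followed by the name, hence the 1..EXT4_NAME_LEN
 * bounds on the remainder.
 */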

/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
static int ext4_fc_replay_scan(journal_t *journal,
			       struct buffer_head *bh, int off,
			       tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_replay_state *state;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_add_range ext;
	struct ext4_fc_tl_mem tl;
	struct ext4_fc_tail tail;
	__u8 *start, *end, *cur, *val;
	struct ext4_fc_head head;
	struct ext4_extent *ex;

	state = &sbi->s_fc_replay_state;

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	if (state->fc_replay_expected_off == 0) {
		state->fc_cur_tag = 0;
		state->fc_replay_num_tags = 0;
		state->fc_crc = 0;
		state->fc_regions = NULL;
		state->fc_regions_valid = state->fc_regions_used =
			state->fc_regions_size = 0;
		/* Check if we can stop early */
		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
			!= EXT4_FC_TAG_HEAD)
			return 0;
	}

	if (off != state->fc_replay_expected_off) {
		ret = -EFSCORRUPTED;
		goto out_err;
	}

	state->fc_replay_expected_off++;
	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;
		if (tl.fc_len > end - val ||
		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
			goto out_err;
		}
		ext4_debug("Scan phase, tag:%s, blk %lld\n",
			   tag2str(tl.fc_tag), bh->b_blocknr);
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_ADD_RANGE:
			memcpy(&ext, val, sizeof(ext));
			ex = (struct ext4_extent *)&ext.fc_ex;
			ret = ext4_fc_record_regions(sb,
				le32_to_cpu(ext.fc_ino),
				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
				ext4_ext_get_actual_len(ex), 0);
			if (ret < 0)
				break;
			ret = JBD2_FC_REPLAY_CONTINUE;
			fallthrough;
		case EXT4_FC_TAG_DEL_RANGE:
		case EXT4_FC_TAG_LINK:
		case EXT4_FC_TAG_UNLINK:
		case EXT4_FC_TAG_CREAT:
		case EXT4_FC_TAG_INODE:
		case EXT4_FC_TAG_PAD:
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		case EXT4_FC_TAG_TAIL:
			state->fc_cur_tag++;
			memcpy(&tail, val, sizeof(tail));
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
						EXT4_FC_TAG_BASE_LEN +
						offsetof(struct ext4_fc_tail,
							 fc_crc));
			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
			    le32_to_cpu(tail.fc_crc) == state->fc_crc) {
				state->fc_replay_num_tags = state->fc_cur_tag;
				state->fc_regions_valid =
					state->fc_regions_used;
			} else {
				ret = state->fc_replay_num_tags ?
					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
			}
			state->fc_crc = 0;
			break;
		case EXT4_FC_TAG_HEAD:
			memcpy(&head, val, sizeof(head));
			if (le32_to_cpu(head.fc_features) &
			    ~EXT4_FC_SUPPORTED_FEATURES) {
				ret = -EOPNOTSUPP;
				break;
			}
			if (le32_to_cpu(head.fc_tid) != expected_tid) {
				ret = JBD2_FC_REPLAY_STOP;
				break;
			}
			state->fc_cur_tag++;
			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
			break;
		default:
			ret = state->fc_replay_num_tags ?
				JBD2_FC_REPLAY_STOP : -ECANCELED;
		}
		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
			break;
	}

out_err:
	trace_ext4_fc_replay_scan(sb, ret, off);
	return ret;
}
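
/*
 * Illustrative scan outcome (hypothetical contents): for a fast commit area
 * holding
 *
 *	[HEAD] [ADD_RANGE] [TAIL] [ADD_RANGE] <garbage>
 *
 * the first TAIL carries the expected TID and a matching CRC, so
 * fc_replay_num_tags is latched at 3 (HEAD + ADD_RANGE + TAIL) and
 * fc_regions_valid is set to the regions recorded so far. The trailing
 * ADD_RANGE is never confirmed by a valid TAIL, so the tags and regions
 * recorded after the last valid TAIL are simply not used during replay.
 */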

/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as above.
 */
static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
			  enum passtype pass, int off, tid_t expected_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl_mem tl;
	__u8 *start, *end, *cur, *val;
	int ret = JBD2_FC_REPLAY_CONTINUE;
	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
	struct ext4_fc_tail tail;

	if (pass == PASS_SCAN) {
		state->fc_current_pass = PASS_SCAN;
		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
	}

	if (state->fc_current_pass != pass) {
		state->fc_current_pass = pass;
		sbi->s_mount_state |= EXT4_FC_REPLAY;
	}
	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
		ext4_debug("Replay stops\n");
		ext4_fc_set_bitmaps_and_counters(sb);
		return 0;
	}

#ifdef CONFIG_EXT4_DEBUG
	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
		pr_warn("Dropping fc block %d because max_replay set\n", off);
		return JBD2_FC_REPLAY_STOP;
	}
#endif

	start = (u8 *)bh->b_data;
	end = start + journal->j_blocksize;

	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
		ext4_fc_get_tl(&tl, cur);
		val = cur + EXT4_FC_TAG_BASE_LEN;

		if (state->fc_replay_num_tags == 0) {
			ret = JBD2_FC_REPLAY_STOP;
			ext4_fc_set_bitmaps_and_counters(sb);
			break;
		}

		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
		state->fc_replay_num_tags--;
		switch (tl.fc_tag) {
		case EXT4_FC_TAG_LINK:
			ret = ext4_fc_replay_link(sb, &tl, val);
			break;
		case EXT4_FC_TAG_UNLINK:
			ret = ext4_fc_replay_unlink(sb, &tl, val);
			break;
		case EXT4_FC_TAG_ADD_RANGE:
			ret = ext4_fc_replay_add_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_CREAT:
			ret = ext4_fc_replay_create(sb, &tl, val);
			break;
		case EXT4_FC_TAG_DEL_RANGE:
			ret = ext4_fc_replay_del_range(sb, &tl, val);
			break;
		case EXT4_FC_TAG_INODE:
			ret = ext4_fc_replay_inode(sb, &tl, val);
			break;
		case EXT4_FC_TAG_PAD:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
					     tl.fc_len, 0);
			break;
		case EXT4_FC_TAG_TAIL:
			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
					     0, tl.fc_len, 0);
			memcpy(&tail, val, sizeof(tail));
			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
			break;
		case EXT4_FC_TAG_HEAD:
			break;
		default:
			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
			ret = -ECANCELED;
			break;
		}
		if (ret < 0)
			break;
		ret = JBD2_FC_REPLAY_CONTINUE;
	}
	return ret;
}
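
/*
 * Rough sketch of how jbd2 drives the function above during recovery
 * (simplified pseudo-code, not the actual jbd2 implementation):
 *
 *	for each fast commit block bh at offset off:	(PASS_SCAN)
 *		ext4_fc_replay(journal, bh, PASS_SCAN, off, tid);
 *	for each fast commit block bh at offset off:	(PASS_REPLAY)
 *		ext4_fc_replay(journal, bh, PASS_REPLAY, off, tid);
 *
 * jbd2 stops feeding blocks for a pass as soon as the callback returns
 * JBD2_FC_REPLAY_STOP or a negative error.
 */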

void ext4_fc_init(struct super_block *sb, journal_t *journal)
{
	/*
	 * We set the replay callback even if fast commit is disabled
	 * because we could still have fast commit blocks that need to
	 * be replayed even if fast commit has now been turned off.
	 */
	journal->j_fc_replay_callback = ext4_fc_replay;
	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char * const fc_ineligible_reasons[] = {
	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
	[EXT4_FC_REASON_RESIZE] = "Resize",
	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			   stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}
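
/*
 * For reference, the output produced by ext4_fc_info_show() above (exposed
 * through procfs as "fc_info") looks roughly like this; the numbers are
 * illustrative:
 *
 *	fc stats:
 *	120 commits
 *	3 ineligible
 *	240 numblks
 *	5000us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	1
 *	"Cross rename":	0
 *	...
 */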