1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * fs/ext4/fast_commit.c 5 * 6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> 7 * 8 * Ext4 fast commits routines. 9 */ 10 #include "ext4.h" 11 #include "ext4_jbd2.h" 12 #include "ext4_extents.h" 13 #include "mballoc.h" 14 15 #include <linux/lockdep.h> 16 #include <linux/wait_bit.h> 17 /* 18 * Ext4 Fast Commits 19 * ----------------- 20 * 21 * Ext4 fast commits implement fine grained journalling for Ext4. 22 * 23 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See 24 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by 25 * TLV during the recovery phase. For the scenarios for which we currently 26 * don't have replay code, fast commit falls back to full commits. 27 * Fast commits record delta in one of the following three categories. 28 * 29 * (A) Directory entry updates: 30 * 31 * - EXT4_FC_TAG_UNLINK - records directory entry unlink 32 * - EXT4_FC_TAG_LINK - records directory entry link 33 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation 34 * 35 * (B) File specific data range updates: 36 * 37 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode 38 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode 39 * 40 * (C) Inode metadata (mtime / ctime etc): 41 * 42 * - EXT4_FC_TAG_INODE - record the inode that should be replayed 43 * during recovery. Note that iblocks field is 44 * not replayed and instead derived during 45 * replay. 46 * Commit Operation 47 * ---------------- 48 * With fast commits, we maintain all the directory entry operations in the 49 * order in which they are issued in an in-memory queue. This queue is flushed 50 * to disk during the commit operation. We also maintain a list of inodes 51 * that need to be committed during a fast commit in another in memory queue of 52 * inodes. 
During the commit operation, we commit in the following order: 53 * 54 * [1] Prepare all the inodes to write out their data by setting 55 * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be 56 * deleted while it is being flushed. 57 * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" 58 * state. 59 * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that 60 * all the existing handles finish and no new handles can start. 61 * [4] Mark all the fast commit eligible inodes as undergoing fast commit 62 * by setting "EXT4_STATE_FC_COMMITTING" state. 63 * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows 64 * starting of new handles. If new handles try to start an update on 65 * any of the inodes that are being committed, ext4_fc_track_inode() 66 * will block until those inodes have finished the fast commit. 67 * [6] Commit all the directory entry updates in the fast commit space. 68 * [7] Commit all the changed inodes in the fast commit space and clear 69 * "EXT4_STATE_FC_COMMITTING" for these inodes. 70 * [8] Write tail tag (this tag ensures the atomicity, please read the following 71 * section for more details). 72 * 73 * All the inode updates must be enclosed within jbd2_journal_start() 74 * and jbd2_journal_stop() similar to JBD2 journaling. 75 * 76 * Fast Commit Ineligibility 77 * ------------------------- 78 * 79 * Not all operations are supported by fast commits today (e.g. extended 80 * attributes). Fast commit ineligibility is marked by calling 81 * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back 82 * to full commit. 83 * 84 * Atomicity of commits 85 * -------------------- 86 * In order to guarantee atomicity during the commit operation, fast commit 87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail 88 * tag contains CRC of the contents and TID of the transaction after which 89 * this fast commit should be applied. 
Recovery code replays fast commit 90 * logs only if there's at least 1 valid tail present. For every fast commit 91 * operation, there is 1 tail. This means, we may end up with multiple tails 92 * in the fast commit space. Here's an example: 93 * 94 * - Create a new file A and remove existing file B 95 * - fsync() 96 * - Append contents to file A 97 * - Truncate file A 98 * - fsync() 99 * 100 * The fast commit space at the end of above operations would look like this: 101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] 102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| 103 * 104 * Replay code should thus check for all the valid tails in the FC area. 105 * 106 * Fast Commit Replay Idempotence 107 * ------------------------------ 108 * 109 * Fast commit tags are idempotent in nature provided the recovery code follows 110 * certain rules. The guiding principle that the commit path follows while 111 * committing is that it stores the result of a particular operation instead of 112 * storing the procedure. 113 * 114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' 115 * was associated with inode 10. During fast commit, instead of storing this 116 * operation as a procedure "rename a to b", we store the resulting file system 117 * state as a "series" of outcomes: 118 * 119 * - Link dirent b to inode 10 120 * - Unlink dirent a 121 * - Inode <10> with valid refcount 122 * 123 * Now when recovery code runs, it needs to "enforce" this state on the file 124 * system. This is what guarantees idempotence of fast commit replay. 125 * 126 * Let's take an example of a procedure that is not idempotent and see how fast 127 * commits make it idempotent. Consider the following sequence of operations: 128 * 129 * rm A; mv B A; read A 130 * (x) (y) (z) 131 * 132 * (x), (y) and (z) are the points at which we can crash. If we store this 133 * sequence of operations as is then the replay is not idempotent. 
Let's say 134 * while in replay, we crash at (z). During the second replay, file A (which was 135 * actually created as a result of "mv B A" operation) would get deleted. Thus, 136 * file named A would be absent when we try to read A. So, this sequence of 137 * operations is not idempotent. However, as mentioned above, instead of storing 138 * the procedure fast commits store the outcome of each procedure. Thus the fast 139 * commit log for above procedure would be as follows: 140 * 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to 142 * inode 11 before the replay) 143 * 144 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] 145 * (w) (x) (y) (z) 146 * 147 * If we crash at (z), we will have file A linked to inode 11. During the second 148 * replay, we will remove file A (inode 11). But we will create it back and make 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled 152 * similarly. Thus, by converting a non-idempotent procedure into a series of 153 * idempotent outcomes, fast commits ensured idempotence during the replay. 154 * 155 * Locking 156 * ------- 157 * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit 158 * dentry queue. ei->i_fc_lock protects the fast commit related info in a given 159 * inode. Most of the code avoids acquiring both the locks, but if one must do 160 * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. 161 * 162 * TODOs 163 * ----- 164 * 165 * 0) Fast commit replay path hardening: Fast commit replay code should use 166 * journal handles to make sure all the updates it does during the replay 167 * path are atomic. 
With that if we crash during fast commit replay, after 168 * trying to do recovery again, we will find a file system where fast commit 169 * area is invalid (because new full commit would be found). In order to deal 170 * with that, fast commit replay code should ensure that the "FC_REPLAY" 171 * superblock state is persisted before starting the replay, so that after 172 * the crash, fast commit recovery code can look at that flag and perform 173 * fast commit recovery even if that area is invalidated by later full 174 * commits. 175 * 176 * 1) Handle more ineligible cases. 177 * 178 * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent 179 * status tree. This would get rid of the need to call ext4_fc_track_inode() 180 * before acquiring i_data_sem. To do that we would need to ensure that 181 * modified extents from the extent status tree are not evicted from memory. 182 */ 183 184 #include <trace/events/ext4.h> 185 static struct kmem_cache *ext4_fc_dentry_cachep; 186 187 static void ext4_end_buffer_io_sync(struct bio *bio) 188 { 189 struct buffer_head *bh; 190 bool uptodate = bio_endio_bh(bio, &bh); 191 192 BUFFER_TRACE(bh, ""); 193 if (uptodate) { 194 ext4_debug("%s: Block %lld up-to-date", 195 __func__, bh->b_blocknr); 196 set_buffer_uptodate(bh); 197 } else { 198 ext4_debug("%s: Block %lld not up-to-date", 199 __func__, bh->b_blocknr); 200 clear_buffer_uptodate(bh); 201 } 202 203 unlock_buffer(bh); 204 } 205 206 static inline void ext4_fc_reset_inode(struct inode *inode) 207 { 208 struct ext4_inode_info *ei = EXT4_I(inode); 209 210 ei->i_fc_lblk_start = 0; 211 ei->i_fc_lblk_len = 0; 212 } 213 214 void ext4_fc_init_inode(struct inode *inode) 215 { 216 struct ext4_inode_info *ei = EXT4_I(inode); 217 218 ext4_fc_reset_inode(inode); 219 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); 220 INIT_LIST_HEAD(&ei->i_fc_list); 221 INIT_LIST_HEAD(&ei->i_fc_dilist); 222 } 223 224 static bool ext4_fc_disabled(struct super_block *sb) 225 { 
226 return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 227 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); 228 } 229 230 static bool ext4_fc_eligible(struct super_block *sb) 231 { 232 return !ext4_fc_disabled(sb) && 233 !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)); 234 } 235 236 /* 237 * Remove inode from fast commit list. If the inode is being committed 238 * we wait until inode commit is done. 239 */ 240 void ext4_fc_del(struct inode *inode) 241 { 242 struct ext4_inode_info *ei = EXT4_I(inode); 243 struct ext4_fc_dentry_update *fc_dentry; 244 wait_queue_head_t *wq; 245 int alloc_ctx; 246 247 if (ext4_fc_disabled(inode->i_sb)) 248 return; 249 250 alloc_ctx = ext4_fc_lock(inode->i_sb); 251 if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { 252 ext4_fc_unlock(inode->i_sb, alloc_ctx); 253 return; 254 } 255 256 /* 257 * Since ext4_fc_del is called from ext4_evict_inode while having a 258 * handle open, there is no need for us to wait here even if a fast 259 * commit is going on. That is because, if this inode is being 260 * committed, ext4_mark_inode_dirty would have waited for inode commit 261 * operation to finish before we come here. So, by the time we come 262 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, 263 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode 264 * here. 265 * 266 * We may come here without any handles open in the "no_delete" case of 267 * ext4_evict_inode as well. However, if that happens, we first mark the 268 * file system as fast commit ineligible anyway. So, even in that case, 269 * it is okay to remove the inode from the fc list. 
270 */ 271 WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) 272 && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); 273 while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { 274 #if (BITS_PER_LONG < 64) 275 DEFINE_WAIT_BIT(wait, &ei->i_state_flags, 276 EXT4_STATE_FC_FLUSHING_DATA); 277 wq = bit_waitqueue(&ei->i_state_flags, 278 EXT4_STATE_FC_FLUSHING_DATA); 279 #else 280 DEFINE_WAIT_BIT(wait, &ei->i_flags, 281 EXT4_STATE_FC_FLUSHING_DATA); 282 wq = bit_waitqueue(&ei->i_flags, 283 EXT4_STATE_FC_FLUSHING_DATA); 284 #endif 285 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 286 if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { 287 ext4_fc_unlock(inode->i_sb, alloc_ctx); 288 schedule(); 289 alloc_ctx = ext4_fc_lock(inode->i_sb); 290 } 291 finish_wait(wq, &wait.wq_entry); 292 } 293 list_del_init(&ei->i_fc_list); 294 295 /* 296 * Since this inode is getting removed, let's also remove all FC 297 * dentry create references, since it is not needed to log it anyways. 298 */ 299 if (list_empty(&ei->i_fc_dilist)) { 300 ext4_fc_unlock(inode->i_sb, alloc_ctx); 301 return; 302 } 303 304 fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); 305 WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); 306 list_del_init(&fc_dentry->fcd_list); 307 list_del_init(&fc_dentry->fcd_dilist); 308 309 WARN_ON(!list_empty(&ei->i_fc_dilist)); 310 ext4_fc_unlock(inode->i_sb, alloc_ctx); 311 312 release_dentry_name_snapshot(&fc_dentry->fcd_name); 313 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 314 } 315 316 /* 317 * Mark file system as fast commit ineligible, and record latest 318 * ineligible transaction tid. This means until the recorded 319 * transaction, commit operation would result in a full jbd2 commit. 
320 */ 321 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) 322 { 323 struct ext4_sb_info *sbi = EXT4_SB(sb); 324 tid_t tid; 325 bool has_transaction = true; 326 bool is_ineligible; 327 int alloc_ctx; 328 329 if (ext4_fc_disabled(sb)) 330 return; 331 332 if (!IS_ERR_OR_NULL(handle)) 333 tid = handle->h_transaction->t_tid; 334 else { 335 read_lock(&sbi->s_journal->j_state_lock); 336 if (sbi->s_journal->j_running_transaction) 337 tid = sbi->s_journal->j_running_transaction->t_tid; 338 else 339 has_transaction = false; 340 read_unlock(&sbi->s_journal->j_state_lock); 341 } 342 alloc_ctx = ext4_fc_lock(sb); 343 is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 344 if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) 345 sbi->s_fc_ineligible_tid = tid; 346 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 347 ext4_fc_unlock(sb, alloc_ctx); 348 WARN_ON(reason >= EXT4_FC_REASON_MAX); 349 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; 350 } 351 352 /* 353 * Generic fast commit tracking function. If this is the first time this we are 354 * called after a full commit, we initialize fast commit fields and then call 355 * __fc_track_fn() with update = 0. If we have already been called after a full 356 * commit, we pass update = 1. Based on that, the track function can determine 357 * if it needs to track a field for the first time or if it needs to just 358 * update the previously tracked value. 359 * 360 * If enqueue is set, this function enqueues the inode in fast commit list. 
361 */ 362 static int ext4_fc_track_template( 363 handle_t *handle, struct inode *inode, 364 int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool), 365 void *args, int enqueue) 366 { 367 bool update = false; 368 struct ext4_inode_info *ei = EXT4_I(inode); 369 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 370 tid_t tid = 0; 371 int alloc_ctx; 372 int ret; 373 374 tid = handle->h_transaction->t_tid; 375 spin_lock(&ei->i_fc_lock); 376 if (tid == ei->i_sync_tid) { 377 update = true; 378 } else { 379 ext4_fc_reset_inode(inode); 380 ei->i_sync_tid = tid; 381 } 382 ret = __fc_track_fn(handle, inode, args, update); 383 spin_unlock(&ei->i_fc_lock); 384 if (!enqueue) 385 return ret; 386 387 alloc_ctx = ext4_fc_lock(inode->i_sb); 388 if (list_empty(&EXT4_I(inode)->i_fc_list)) 389 list_add_tail(&EXT4_I(inode)->i_fc_list, 390 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 391 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 392 &sbi->s_fc_q[FC_Q_STAGING] : 393 &sbi->s_fc_q[FC_Q_MAIN]); 394 ext4_fc_unlock(inode->i_sb, alloc_ctx); 395 396 return ret; 397 } 398 399 struct __track_dentry_update_args { 400 struct dentry *dentry; 401 int op; 402 }; 403 404 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. 
*/ 405 static int __track_dentry_update(handle_t *handle, struct inode *inode, 406 void *arg, bool update) 407 { 408 struct ext4_fc_dentry_update *node; 409 struct ext4_inode_info *ei = EXT4_I(inode); 410 struct __track_dentry_update_args *dentry_update = 411 (struct __track_dentry_update_args *)arg; 412 struct dentry *dentry = dentry_update->dentry; 413 struct inode *dir = dentry->d_parent->d_inode; 414 struct super_block *sb = inode->i_sb; 415 struct ext4_sb_info *sbi = EXT4_SB(sb); 416 int alloc_ctx; 417 418 spin_unlock(&ei->i_fc_lock); 419 420 if (IS_ENCRYPTED(dir)) { 421 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, 422 handle); 423 spin_lock(&ei->i_fc_lock); 424 return -EOPNOTSUPP; 425 } 426 427 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); 428 if (!node) { 429 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); 430 spin_lock(&ei->i_fc_lock); 431 return -ENOMEM; 432 } 433 434 node->fcd_op = dentry_update->op; 435 node->fcd_parent = dir->i_ino; 436 node->fcd_ino = inode->i_ino; 437 take_dentry_name_snapshot(&node->fcd_name, dentry); 438 INIT_LIST_HEAD(&node->fcd_dilist); 439 INIT_LIST_HEAD(&node->fcd_list); 440 alloc_ctx = ext4_fc_lock(sb); 441 if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || 442 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) 443 list_add_tail(&node->fcd_list, 444 &sbi->s_fc_dentry_q[FC_Q_STAGING]); 445 else 446 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); 447 448 /* 449 * This helps us keep a track of all fc_dentry updates which is part of 450 * this ext4 inode. So in case the inode is getting unlinked, before 451 * even we get a chance to fsync, we could remove all fc_dentry 452 * references while evicting the inode in ext4_fc_del(). 453 * Also with this, we don't need to loop over all the inodes in 454 * sbi->s_fc_q to get the corresponding inode in 455 * ext4_fc_commit_dentry_updates(). 
456 */ 457 if (dentry_update->op == EXT4_FC_TAG_CREAT) { 458 WARN_ON(!list_empty(&ei->i_fc_dilist)); 459 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); 460 } 461 ext4_fc_unlock(sb, alloc_ctx); 462 spin_lock(&ei->i_fc_lock); 463 464 return 0; 465 } 466 467 void __ext4_fc_track_unlink(handle_t *handle, 468 struct inode *inode, struct dentry *dentry) 469 { 470 struct __track_dentry_update_args args; 471 int ret; 472 473 args.dentry = dentry; 474 args.op = EXT4_FC_TAG_UNLINK; 475 476 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 477 (void *)&args, 0); 478 trace_ext4_fc_track_unlink(handle, inode, dentry, ret); 479 } 480 481 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) 482 { 483 struct inode *inode = d_inode(dentry); 484 485 if (ext4_fc_eligible(inode->i_sb)) 486 __ext4_fc_track_unlink(handle, inode, dentry); 487 } 488 489 void __ext4_fc_track_link(handle_t *handle, 490 struct inode *inode, struct dentry *dentry) 491 { 492 struct __track_dentry_update_args args; 493 int ret; 494 495 args.dentry = dentry; 496 args.op = EXT4_FC_TAG_LINK; 497 498 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 499 (void *)&args, 0); 500 trace_ext4_fc_track_link(handle, inode, dentry, ret); 501 } 502 503 void ext4_fc_track_link(handle_t *handle, struct inode *inode, 504 struct dentry *dentry) 505 { 506 if (ext4_fc_eligible(inode->i_sb)) 507 __ext4_fc_track_link(handle, inode, dentry); 508 } 509 510 void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 511 struct dentry *dentry) 512 { 513 struct __track_dentry_update_args args; 514 int ret; 515 516 args.dentry = dentry; 517 args.op = EXT4_FC_TAG_CREAT; 518 519 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 520 (void *)&args, 0); 521 trace_ext4_fc_track_create(handle, inode, dentry, ret); 522 } 523 524 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 525 { 526 struct inode *inode = d_inode(dentry); 527 528 if 
(ext4_fc_eligible(inode->i_sb)) 529 __ext4_fc_track_create(handle, inode, dentry); 530 } 531 532 /* __track_fn for inode tracking */ 533 static int __track_inode(handle_t *handle, struct inode *inode, void *arg, 534 bool update) 535 { 536 if (update) 537 return -EEXIST; 538 539 EXT4_I(inode)->i_fc_lblk_len = 0; 540 541 return 0; 542 } 543 544 void ext4_fc_track_inode(handle_t *handle, struct inode *inode) 545 { 546 struct ext4_inode_info *ei = EXT4_I(inode); 547 wait_queue_head_t *wq; 548 int ret; 549 550 if (S_ISDIR(inode->i_mode)) 551 return; 552 553 if (ext4_should_journal_data(inode)) { 554 ext4_fc_mark_ineligible(inode->i_sb, 555 EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); 556 return; 557 } 558 559 if (!ext4_fc_eligible(inode->i_sb)) 560 return; 561 562 /* 563 * If we come here, we may sleep while waiting for the inode to 564 * commit. We shouldn't be holding i_data_sem when we go to sleep since 565 * the commit path needs to grab the lock while committing the inode. 566 */ 567 lockdep_assert_not_held(&ei->i_data_sem); 568 569 while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { 570 #if (BITS_PER_LONG < 64) 571 DEFINE_WAIT_BIT(wait, &ei->i_state_flags, 572 EXT4_STATE_FC_COMMITTING); 573 wq = bit_waitqueue(&ei->i_state_flags, 574 EXT4_STATE_FC_COMMITTING); 575 #else 576 DEFINE_WAIT_BIT(wait, &ei->i_flags, 577 EXT4_STATE_FC_COMMITTING); 578 wq = bit_waitqueue(&ei->i_flags, 579 EXT4_STATE_FC_COMMITTING); 580 #endif 581 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 582 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 583 schedule(); 584 finish_wait(wq, &wait.wq_entry); 585 } 586 587 /* 588 * From this point on, this inode will not be committed either 589 * by fast or full commit as long as the handle is open. 
590 */ 591 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); 592 trace_ext4_fc_track_inode(handle, inode, ret); 593 } 594 595 struct __track_range_args { 596 ext4_lblk_t start, end; 597 }; 598 599 /* __track_fn for tracking data updates */ 600 static int __track_range(handle_t *handle, struct inode *inode, void *arg, 601 bool update) 602 { 603 struct ext4_inode_info *ei = EXT4_I(inode); 604 ext4_lblk_t oldstart; 605 struct __track_range_args *__arg = 606 (struct __track_range_args *)arg; 607 608 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { 609 ext4_debug("Special inode %llu being modified\n", inode->i_ino); 610 return -ECANCELED; 611 } 612 613 oldstart = ei->i_fc_lblk_start; 614 615 if (update && ei->i_fc_lblk_len > 0) { 616 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); 617 ei->i_fc_lblk_len = 618 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - 619 ei->i_fc_lblk_start + 1; 620 } else { 621 ei->i_fc_lblk_start = __arg->start; 622 ei->i_fc_lblk_len = __arg->end - __arg->start + 1; 623 } 624 625 return 0; 626 } 627 628 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, 629 ext4_lblk_t end) 630 { 631 struct __track_range_args args; 632 int ret; 633 634 if (S_ISDIR(inode->i_mode)) 635 return; 636 637 if (!ext4_fc_eligible(inode->i_sb)) 638 return; 639 640 if (ext4_has_inline_data(inode)) { 641 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, 642 handle); 643 return; 644 } 645 646 args.start = start; 647 args.end = end; 648 649 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); 650 651 trace_ext4_fc_track_range(handle, inode, start, end, ret); 652 } 653 654 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) 655 { 656 blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS; 657 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; 658 659 /* Add REQ_FUA | REQ_PREFLUSH only its tail */ 660 if (test_opt(sb, BARRIER) && is_tail) 661 write_flags |= REQ_FUA | REQ_PREFLUSH; 
662 lock_buffer(bh); 663 set_buffer_dirty(bh); 664 set_buffer_uptodate(bh); 665 bh_submit(bh, REQ_OP_WRITE | write_flags, ext4_end_buffer_io_sync); 666 EXT4_SB(sb)->s_fc_bh = NULL; 667 } 668 669 /* Ext4 commit path routines */ 670 671 /* 672 * Allocate len bytes on a fast commit buffer. 673 * 674 * During the commit time this function is used to manage fast commit 675 * block space. We don't split a fast commit log onto different 676 * blocks. So this function makes sure that if there's not enough space 677 * on the current block, the remaining space in the current block is 678 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, 679 * new block is from jbd2 and CRC is updated to reflect the padding 680 * we added. 681 */ 682 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) 683 { 684 struct ext4_fc_tl tl; 685 struct ext4_sb_info *sbi = EXT4_SB(sb); 686 struct buffer_head *bh; 687 int bsize = sbi->s_journal->j_blocksize; 688 int ret, off = sbi->s_fc_bytes % bsize; 689 int remaining; 690 u8 *dst; 691 692 /* 693 * If 'len' is too long to fit in any block alongside a PAD tlv, then we 694 * cannot fulfill the request. 695 */ 696 if (len > bsize - EXT4_FC_TAG_BASE_LEN) 697 return NULL; 698 699 if (!sbi->s_fc_bh) { 700 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 701 if (ret) 702 return NULL; 703 sbi->s_fc_bh = bh; 704 } 705 dst = sbi->s_fc_bh->b_data + off; 706 707 /* 708 * Allocate the bytes in the current block if we can do so while still 709 * leaving enough space for a PAD tlv. 710 */ 711 remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; 712 if (len <= remaining) { 713 sbi->s_fc_bytes += len; 714 return dst; 715 } 716 717 /* 718 * Else, terminate the current block with a PAD tlv, then allocate a new 719 * block and allocate the bytes at the start of that new block. 
720 */ 721 722 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); 723 tl.fc_len = cpu_to_le16(remaining); 724 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 725 memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); 726 *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); 727 728 ext4_fc_submit_bh(sb, false); 729 730 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 731 if (ret) 732 return NULL; 733 sbi->s_fc_bh = bh; 734 sbi->s_fc_bytes += bsize - off + len; 735 return sbi->s_fc_bh->b_data; 736 } 737 738 /* 739 * Complete a fast commit by writing tail tag. 740 * 741 * Writing tail tag marks the end of a fast commit. In order to guarantee 742 * atomicity, after writing tail tag, even if there's space remaining 743 * in the block, next commit shouldn't use it. That's why tail tag 744 * has the length as that of the remaining space on the block. 745 */ 746 static int ext4_fc_write_tail(struct super_block *sb, u32 crc) 747 { 748 struct ext4_sb_info *sbi = EXT4_SB(sb); 749 struct ext4_fc_tl tl; 750 struct ext4_fc_tail tail; 751 int off, bsize = sbi->s_journal->j_blocksize; 752 u8 *dst; 753 754 /* 755 * ext4_fc_reserve_space takes care of allocating an extra block if 756 * there's no enough space on this block for accommodating this tail. 
757 */ 758 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); 759 if (!dst) 760 return -ENOSPC; 761 762 off = sbi->s_fc_bytes % bsize; 763 764 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); 765 tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); 766 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); 767 768 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 769 dst += EXT4_FC_TAG_BASE_LEN; 770 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); 771 memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); 772 dst += sizeof(tail.fc_tid); 773 crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, 774 dst - (u8 *)sbi->s_fc_bh->b_data); 775 tail.fc_crc = cpu_to_le32(crc); 776 memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); 777 dst += sizeof(tail.fc_crc); 778 memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ 779 780 ext4_fc_submit_bh(sb, true); 781 782 return 0; 783 } 784 785 /* 786 * Adds tag, length, value and updates CRC. Returns true if tlv was added. 787 * Returns false if there's not enough space. 788 */ 789 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 790 u32 *crc) 791 { 792 struct ext4_fc_tl tl; 793 u8 *dst; 794 795 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); 796 if (!dst) 797 return false; 798 799 tl.fc_tag = cpu_to_le16(tag); 800 tl.fc_len = cpu_to_le16(len); 801 802 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 803 memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); 804 805 return true; 806 } 807 808 /* Same as above, but adds dentry tlv. 
*/ 809 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, 810 struct ext4_fc_dentry_update *fc_dentry) 811 { 812 struct ext4_fc_dentry_info fcd; 813 struct ext4_fc_tl tl; 814 int dlen = fc_dentry->fcd_name.name.len; 815 u8 *dst = ext4_fc_reserve_space(sb, 816 EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); 817 818 if (!dst) 819 return false; 820 821 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); 822 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); 823 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); 824 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); 825 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 826 dst += EXT4_FC_TAG_BASE_LEN; 827 memcpy(dst, &fcd, sizeof(fcd)); 828 dst += sizeof(fcd); 829 memcpy(dst, fc_dentry->fcd_name.name.name, dlen); 830 831 return true; 832 } 833 834 /* 835 * Writes inode in the fast commit space under TLV with tag @tag. 836 * Returns 0 on success, error on failure. 837 */ 838 static int ext4_fc_write_inode(struct inode *inode, u32 *crc) 839 { 840 struct ext4_inode_info *ei = EXT4_I(inode); 841 int inode_len = EXT4_GOOD_OLD_INODE_SIZE; 842 int ret; 843 struct ext4_iloc iloc; 844 struct ext4_fc_inode fc_inode; 845 struct ext4_fc_tl tl; 846 u8 *dst; 847 848 ret = ext4_get_inode_loc(inode, &iloc); 849 if (ret) 850 return ret; 851 852 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 853 inode_len = EXT4_INODE_SIZE(inode->i_sb); 854 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 855 inode_len += ei->i_extra_isize; 856 857 fc_inode.fc_ino = cpu_to_le32(inode->i_ino); 858 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); 859 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); 860 861 ret = -ECANCELED; 862 dst = ext4_fc_reserve_space(inode->i_sb, 863 EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); 864 if (!dst) 865 goto err; 866 867 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); 868 dst += EXT4_FC_TAG_BASE_LEN; 869 memcpy(dst, &fc_inode, sizeof(fc_inode)); 870 dst += sizeof(fc_inode); 871 
memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); 872 ret = 0; 873 err: 874 brelse(iloc.bh); 875 return ret; 876 } 877 878 /* 879 * Writes updated data ranges for the inode in question. Updates CRC. 880 * Returns 0 on success, error otherwise. 881 */ 882 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 883 { 884 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 885 struct ext4_inode_info *ei = EXT4_I(inode); 886 struct ext4_map_blocks map; 887 struct ext4_fc_add_range fc_ext; 888 struct ext4_fc_del_range lrange; 889 struct ext4_extent *ex; 890 int ret; 891 892 spin_lock(&ei->i_fc_lock); 893 if (ei->i_fc_lblk_len == 0) { 894 spin_unlock(&ei->i_fc_lock); 895 return 0; 896 } 897 old_blk_size = ei->i_fc_lblk_start; 898 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 899 ei->i_fc_lblk_len = 0; 900 spin_unlock(&ei->i_fc_lock); 901 902 cur_lblk_off = old_blk_size; 903 ext4_debug("will try writing %d to %d for inode %llu\n", 904 cur_lblk_off, new_blk_size, inode->i_ino); 905 906 while (cur_lblk_off <= new_blk_size) { 907 map.m_lblk = cur_lblk_off; 908 map.m_len = new_blk_size - cur_lblk_off + 1; 909 ret = ext4_map_blocks(NULL, inode, &map, 910 EXT4_GET_BLOCKS_IO_SUBMIT | 911 EXT4_EX_NOCACHE); 912 if (ret < 0) 913 return -ECANCELED; 914 915 if (map.m_len == 0) { 916 cur_lblk_off++; 917 continue; 918 } 919 920 if (ret == 0) { 921 lrange.fc_ino = cpu_to_le32(inode->i_ino); 922 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 923 lrange.fc_len = cpu_to_le32(map.m_len); 924 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 925 sizeof(lrange), (u8 *)&lrange, crc)) 926 return -ENOSPC; 927 } else { 928 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? 
929 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; 930 931 /* Limit the number of blocks in one extent */ 932 map.m_len = min(max, map.m_len); 933 934 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 935 ex = (struct ext4_extent *)&fc_ext.fc_ex; 936 ex->ee_block = cpu_to_le32(map.m_lblk); 937 ex->ee_len = cpu_to_le16(map.m_len); 938 ext4_ext_store_pblock(ex, map.m_pblk); 939 if (map.m_flags & EXT4_MAP_UNWRITTEN) 940 ext4_ext_mark_unwritten(ex); 941 else 942 ext4_ext_mark_initialized(ex); 943 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 944 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 945 return -ENOSPC; 946 } 947 948 cur_lblk_off += map.m_len; 949 } 950 951 return 0; 952 } 953 954 955 /* Flushes data of all the inodes in the commit queue. */ 956 static int ext4_fc_flush_data(journal_t *journal) 957 { 958 struct super_block *sb = journal->j_private; 959 struct ext4_sb_info *sbi = EXT4_SB(sb); 960 struct ext4_inode_info *ei; 961 int ret = 0; 962 963 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 964 ret = jbd2_submit_inode_data(journal, READ_ONCE(ei->jinode)); 965 if (ret) 966 return ret; 967 } 968 969 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 970 ret = jbd2_wait_inode_data(journal, READ_ONCE(ei->jinode)); 971 if (ret) 972 return ret; 973 } 974 975 return 0; 976 } 977 978 /* Commit all the directory entry updates */ 979 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) 980 { 981 struct super_block *sb = journal->j_private; 982 struct ext4_sb_info *sbi = EXT4_SB(sb); 983 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; 984 struct inode *inode; 985 struct ext4_inode_info *ei; 986 int ret; 987 988 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) 989 return 0; 990 list_for_each_entry_safe(fc_dentry, fc_dentry_n, 991 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { 992 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { 993 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) 994 return -ENOSPC; 995 continue; 996 } 997 /* 998 * With 
fcd_dilist we need not loop in sbi->s_fc_q to get the 999 * corresponding inode. Also, the corresponding inode could have been 1000 * deleted, in which case, we don't need to do anything. 1001 */ 1002 if (list_empty(&fc_dentry->fcd_dilist)) 1003 continue; 1004 ei = list_first_entry(&fc_dentry->fcd_dilist, 1005 struct ext4_inode_info, i_fc_dilist); 1006 inode = &ei->vfs_inode; 1007 WARN_ON(inode->i_ino != fc_dentry->fcd_ino); 1008 1009 /* 1010 * We first write the inode and then the create dirent. This 1011 * allows the recovery code to create an unnamed inode first 1012 * and then link it to a directory entry. This allows us 1013 * to use namei.c routines almost as is and simplifies 1014 * the recovery code. 1015 */ 1016 ret = ext4_fc_write_inode(inode, crc); 1017 if (ret) 1018 return ret; 1019 ret = ext4_fc_write_inode_data(inode, crc); 1020 if (ret) 1021 return ret; 1022 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) 1023 return -ENOSPC; 1024 } 1025 return 0; 1026 } 1027 1028 static int ext4_fc_perform_commit(journal_t *journal) 1029 { 1030 struct super_block *sb = journal->j_private; 1031 struct ext4_sb_info *sbi = EXT4_SB(sb); 1032 struct ext4_inode_info *iter; 1033 struct ext4_fc_head head; 1034 struct inode *inode; 1035 struct blk_plug plug; 1036 int ret = 0; 1037 u32 crc = 0; 1038 int alloc_ctx; 1039 1040 /* 1041 * Step 1: Mark all inodes on s_fc_q[MAIN] with 1042 * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being 1043 * freed until the data flush is over. 1044 */ 1045 alloc_ctx = ext4_fc_lock(sb); 1046 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1047 ext4_set_inode_state(&iter->vfs_inode, 1048 EXT4_STATE_FC_FLUSHING_DATA); 1049 } 1050 ext4_fc_unlock(sb, alloc_ctx); 1051 1052 /* Step 2: Flush data for all the eligible inodes. */ 1053 ret = ext4_fc_flush_data(journal); 1054 1055 /* 1056 * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning 1057 * any error from step 2. 
This ensures that waiters waiting on 1058 * EXT4_STATE_FC_FLUSHING_DATA can resume. 1059 */ 1060 alloc_ctx = ext4_fc_lock(sb); 1061 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1062 ext4_clear_inode_state(&iter->vfs_inode, 1063 EXT4_STATE_FC_FLUSHING_DATA); 1064 #if (BITS_PER_LONG < 64) 1065 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); 1066 #else 1067 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); 1068 #endif 1069 } 1070 1071 /* 1072 * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before 1073 * the waiter checks the bit. Pairs with implicit barrier in 1074 * prepare_to_wait() in ext4_fc_del(). 1075 */ 1076 smp_mb(); 1077 ext4_fc_unlock(sb, alloc_ctx); 1078 1079 /* 1080 * If we encountered error in Step 2, return it now after clearing 1081 * EXT4_STATE_FC_FLUSHING_DATA bit. 1082 */ 1083 if (ret) 1084 return ret; 1085 1086 1087 /* Step 4: Mark all inodes as being committed. */ 1088 jbd2_journal_lock_updates(journal); 1089 /* 1090 * The journal is now locked. No more handles can start and all the 1091 * previous handles are now drained. We now mark the inodes on the 1092 * commit queue as being committed. 1093 */ 1094 alloc_ctx = ext4_fc_lock(sb); 1095 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1096 ext4_set_inode_state(&iter->vfs_inode, 1097 EXT4_STATE_FC_COMMITTING); 1098 } 1099 ext4_fc_unlock(sb, alloc_ctx); 1100 jbd2_journal_unlock_updates(journal); 1101 1102 /* 1103 * Step 5: If file system device is different from journal device, 1104 * issue a cache flush before we start writing fast commit blocks. 1105 */ 1106 if (journal->j_fs_dev != journal->j_dev) 1107 blkdev_issue_flush(journal->j_fs_dev); 1108 1109 blk_start_plug(&plug); 1110 alloc_ctx = ext4_fc_lock(sb); 1111 /* Step 6: Write fast commit blocks to disk. */ 1112 if (sbi->s_fc_bytes == 0) { 1113 /* 1114 * Step 6.1: Add a head tag only if this is the first fast 1115 * commit in this TID. 
1116 */ 1117 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1118 head.fc_tid = cpu_to_le32( 1119 sbi->s_journal->j_running_transaction->t_tid); 1120 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1121 (u8 *)&head, &crc)) { 1122 ret = -ENOSPC; 1123 goto out; 1124 } 1125 } 1126 1127 /* Step 6.2: Now write all the dentry updates. */ 1128 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1129 if (ret) 1130 goto out; 1131 1132 /* Step 6.3: Now write all the changed inodes to disk. */ 1133 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 1134 inode = &iter->vfs_inode; 1135 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1136 continue; 1137 1138 ret = ext4_fc_write_inode_data(inode, &crc); 1139 if (ret) 1140 goto out; 1141 ret = ext4_fc_write_inode(inode, &crc); 1142 if (ret) 1143 goto out; 1144 } 1145 /* Step 6.4: Finally write tail tag to conclude this fast commit. */ 1146 ret = ext4_fc_write_tail(sb, crc); 1147 1148 out: 1149 ext4_fc_unlock(sb, alloc_ctx); 1150 blk_finish_plug(&plug); 1151 return ret; 1152 } 1153 1154 static void ext4_fc_update_stats(struct super_block *sb, int status, 1155 u64 commit_time, int nblks, tid_t commit_tid) 1156 { 1157 struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; 1158 1159 ext4_debug("Fast commit ended with status = %d for tid %u", 1160 status, commit_tid); 1161 if (status == EXT4_FC_STATUS_OK) { 1162 stats->fc_num_commits++; 1163 stats->fc_numblks += nblks; 1164 if (likely(stats->s_fc_avg_commit_time)) 1165 stats->s_fc_avg_commit_time = 1166 (commit_time + 1167 stats->s_fc_avg_commit_time * 3) / 4; 1168 else 1169 stats->s_fc_avg_commit_time = commit_time; 1170 } else if (status == EXT4_FC_STATUS_FAILED || 1171 status == EXT4_FC_STATUS_INELIGIBLE) { 1172 if (status == EXT4_FC_STATUS_FAILED) 1173 stats->fc_failed_commits++; 1174 stats->fc_ineligible_commits++; 1175 } else { 1176 stats->fc_skipped_commits++; 1177 } 1178 trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); 
1179 } 1180 1181 /* 1182 * The main commit entry point. Performs a fast commit for transaction 1183 * commit_tid if needed. If it's not possible to perform a fast commit 1184 * due to various reasons, we fall back to full commit. Returns 0 1185 * on success, error otherwise. 1186 */ 1187 int ext4_fc_commit(journal_t *journal, tid_t commit_tid) 1188 { 1189 struct super_block *sb = journal->j_private; 1190 struct ext4_sb_info *sbi = EXT4_SB(sb); 1191 int nblks = 0, ret, bsize = journal->j_blocksize; 1192 int subtid = atomic_read(&sbi->s_fc_subtid); 1193 int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; 1194 ktime_t start_time, commit_time; 1195 int old_ioprio, journal_ioprio; 1196 1197 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 1198 return jbd2_complete_transaction(journal, commit_tid); 1199 1200 trace_ext4_fc_commit_start(sb, commit_tid); 1201 1202 start_time = ktime_get(); 1203 old_ioprio = get_current_ioprio(); 1204 1205 restart_fc: 1206 ret = jbd2_fc_begin_commit(journal, commit_tid); 1207 if (ret == -EALREADY) { 1208 /* There was an ongoing commit, check if we need to restart */ 1209 if (atomic_read(&sbi->s_fc_subtid) <= subtid && 1210 tid_gt(commit_tid, journal->j_commit_sequence)) 1211 goto restart_fc; 1212 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, 1213 commit_tid); 1214 return 0; 1215 } else if (ret) { 1216 /* 1217 * Commit couldn't start. Just update stats and perform a 1218 * full commit. 1219 */ 1220 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, 1221 commit_tid); 1222 return jbd2_complete_transaction(journal, commit_tid); 1223 } 1224 1225 /* 1226 * After establishing journal barrier via jbd2_fc_begin_commit(), check 1227 * if we are fast commit ineligible. 1228 */ 1229 if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { 1230 status = EXT4_FC_STATUS_INELIGIBLE; 1231 goto fallback; 1232 } 1233 1234 /* 1235 * Now that we know that this thread is going to do a fast commit, 1236 * elevate the priority to match that of the journal thread. 
1237 */ 1238 if (journal->j_task->io_context) 1239 journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; 1240 else 1241 journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; 1242 set_task_ioprio(current, journal_ioprio); 1243 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; 1244 ret = ext4_fc_perform_commit(journal); 1245 if (ret < 0) { 1246 status = EXT4_FC_STATUS_FAILED; 1247 goto fallback; 1248 } 1249 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; 1250 ret = jbd2_fc_wait_bufs(journal, nblks); 1251 if (ret < 0) { 1252 status = EXT4_FC_STATUS_FAILED; 1253 goto fallback; 1254 } 1255 atomic_inc(&sbi->s_fc_subtid); 1256 ret = jbd2_fc_end_commit(journal); 1257 set_task_ioprio(current, old_ioprio); 1258 /* 1259 * weight the commit time higher than the average time so we 1260 * don't react too strongly to vast changes in the commit time 1261 */ 1262 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1263 ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); 1264 return ret; 1265 1266 fallback: 1267 set_task_ioprio(current, old_ioprio); 1268 ret = jbd2_fc_end_commit_fallback(journal); 1269 ext4_fc_update_stats(sb, status, 0, 0, commit_tid); 1270 return ret; 1271 } 1272 1273 /* 1274 * Fast commit cleanup routine. This is called after every fast commit and 1275 * full commit. full is true if we are called after a full commit. 
1276 */ 1277 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) 1278 { 1279 struct super_block *sb = journal->j_private; 1280 struct ext4_sb_info *sbi = EXT4_SB(sb); 1281 struct ext4_inode_info *ei; 1282 struct ext4_fc_dentry_update *fc_dentry; 1283 int alloc_ctx; 1284 1285 if (full && sbi->s_fc_bh) 1286 sbi->s_fc_bh = NULL; 1287 1288 trace_ext4_fc_cleanup(journal, full, tid); 1289 jbd2_fc_release_bufs(journal); 1290 1291 alloc_ctx = ext4_fc_lock(sb); 1292 while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { 1293 ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], 1294 struct ext4_inode_info, 1295 i_fc_list); 1296 list_del_init(&ei->i_fc_list); 1297 ext4_clear_inode_state(&ei->vfs_inode, 1298 EXT4_STATE_FC_COMMITTING); 1299 if (tid_geq(tid, ei->i_sync_tid)) { 1300 ext4_fc_reset_inode(&ei->vfs_inode); 1301 } else if (full) { 1302 /* 1303 * We are called after a full commit, inode has been 1304 * modified while the commit was running. Re-enqueue 1305 * the inode into STAGING, which will then be splice 1306 * back into MAIN. This cannot happen during 1307 * fastcommit because the journal is locked all the 1308 * time in that case (and tid doesn't increase so 1309 * tid check above isn't reliable). 1310 */ 1311 list_add_tail(&ei->i_fc_list, 1312 &sbi->s_fc_q[FC_Q_STAGING]); 1313 } 1314 /* 1315 * Make sure clearing of EXT4_STATE_FC_COMMITTING is 1316 * visible before we send the wakeup. Pairs with implicit 1317 * barrier in prepare_to_wait() in ext4_fc_track_inode(). 
1318 */ 1319 smp_mb(); 1320 #if (BITS_PER_LONG < 64) 1321 wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); 1322 #else 1323 wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); 1324 #endif 1325 } 1326 1327 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1328 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1329 struct ext4_fc_dentry_update, 1330 fcd_list); 1331 list_del_init(&fc_dentry->fcd_list); 1332 list_del_init(&fc_dentry->fcd_dilist); 1333 1334 release_dentry_name_snapshot(&fc_dentry->fcd_name); 1335 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1336 } 1337 1338 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1339 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1340 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1341 &sbi->s_fc_q[FC_Q_MAIN]); 1342 1343 if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { 1344 sbi->s_fc_ineligible_tid = 0; 1345 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1346 } 1347 1348 if (full) 1349 sbi->s_fc_bytes = 0; 1350 ext4_fc_unlock(sb, alloc_ctx); 1351 trace_ext4_fc_stats(sb); 1352 } 1353 1354 /* Ext4 Replay Path Routines */ 1355 1356 /* Helper struct for dentry replay routines */ 1357 struct dentry_info_args { 1358 int parent_ino, dname_len, ino, inode_len; 1359 char *dname; 1360 }; 1361 1362 /* Same as struct ext4_fc_tl, but uses native endianness fields */ 1363 struct ext4_fc_tl_mem { 1364 u16 fc_tag; 1365 u16 fc_len; 1366 }; 1367 1368 static inline void tl_to_darg(struct dentry_info_args *darg, 1369 struct ext4_fc_tl_mem *tl, u8 *val) 1370 { 1371 struct ext4_fc_dentry_info fcd; 1372 1373 memcpy(&fcd, val, sizeof(fcd)); 1374 1375 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); 1376 darg->ino = le32_to_cpu(fcd.fc_ino); 1377 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); 1378 darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); 1379 } 1380 1381 static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) 1382 { 1383 struct ext4_fc_tl tl_disk; 1384 1385 
memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); 1386 tl->fc_len = le16_to_cpu(tl_disk.fc_len); 1387 tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); 1388 } 1389 1390 /* Unlink replay function */ 1391 static int ext4_fc_replay_unlink(struct super_block *sb, 1392 struct ext4_fc_tl_mem *tl, u8 *val) 1393 { 1394 struct inode *inode, *old_parent; 1395 struct qstr entry; 1396 struct dentry_info_args darg; 1397 int ret = 0; 1398 1399 tl_to_darg(&darg, tl, val); 1400 1401 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, 1402 darg.parent_ino, darg.dname_len); 1403 1404 entry.name = darg.dname; 1405 entry.len = darg.dname_len; 1406 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1407 1408 if (IS_ERR(inode)) { 1409 ext4_debug("Inode %d not found", darg.ino); 1410 return 0; 1411 } 1412 1413 old_parent = ext4_iget(sb, darg.parent_ino, 1414 EXT4_IGET_NORMAL); 1415 if (IS_ERR(old_parent)) { 1416 ext4_debug("Dir with inode %d not found", darg.parent_ino); 1417 iput(inode); 1418 return 0; 1419 } 1420 1421 ret = __ext4_unlink(old_parent, &entry, inode, NULL); 1422 /* -ENOENT ok coz it might not exist anymore. */ 1423 if (ret == -ENOENT) 1424 ret = 0; 1425 iput(old_parent); 1426 iput(inode); 1427 return ret; 1428 } 1429 1430 static int ext4_fc_replay_link_internal(struct super_block *sb, 1431 struct dentry_info_args *darg, 1432 struct inode *inode) 1433 { 1434 struct inode *dir = NULL; 1435 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1436 int ret = 0; 1437 1438 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); 1439 if (IS_ERR(dir)) { 1440 ext4_debug("Dir with inode %d not found.", darg->parent_ino); 1441 dir = NULL; 1442 goto out; 1443 } 1444 1445 ret = __ext4_link(dir, inode, &qstr_dname, NULL); 1446 /* 1447 * It's possible that link already existed since data blocks 1448 * for the dir in question got persisted before we crashed OR 1449 * we replayed this tag and crashed before the entire replay 1450 * could complete. 
1451 */ 1452 if (ret && ret != -EEXIST) { 1453 ext4_debug("Failed to link\n"); 1454 goto out; 1455 } 1456 1457 ret = 0; 1458 out: 1459 if (dir) 1460 iput(dir); 1461 1462 return ret; 1463 } 1464 1465 /* Link replay function */ 1466 static int ext4_fc_replay_link(struct super_block *sb, 1467 struct ext4_fc_tl_mem *tl, u8 *val) 1468 { 1469 struct inode *inode; 1470 struct dentry_info_args darg; 1471 int ret = 0; 1472 1473 tl_to_darg(&darg, tl, val); 1474 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, 1475 darg.parent_ino, darg.dname_len); 1476 1477 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1478 if (IS_ERR(inode)) { 1479 ext4_debug("Inode not found."); 1480 return 0; 1481 } 1482 1483 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1484 iput(inode); 1485 return ret; 1486 } 1487 1488 /* 1489 * Record all the modified inodes during replay. We use this later to setup 1490 * block bitmaps correctly. 1491 */ 1492 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1493 { 1494 struct ext4_fc_replay_state *state; 1495 int i; 1496 1497 state = &EXT4_SB(sb)->s_fc_replay_state; 1498 for (i = 0; i < state->fc_modified_inodes_used; i++) 1499 if (state->fc_modified_inodes[i] == ino) 1500 return 0; 1501 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1502 int *fc_modified_inodes; 1503 1504 fc_modified_inodes = krealloc(state->fc_modified_inodes, 1505 sizeof(int) * (state->fc_modified_inodes_size + 1506 EXT4_FC_REPLAY_REALLOC_INCREMENT), 1507 GFP_KERNEL); 1508 if (!fc_modified_inodes) 1509 return -ENOMEM; 1510 state->fc_modified_inodes = fc_modified_inodes; 1511 state->fc_modified_inodes_size += 1512 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1513 } 1514 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1515 return 0; 1516 } 1517 1518 /* 1519 * Inode replay function 1520 */ 1521 static int ext4_fc_replay_inode(struct super_block *sb, 1522 struct ext4_fc_tl_mem *tl, u8 *val) 1523 { 1524 struct ext4_fc_inode 
fc_inode; 1525 struct ext4_inode *raw_inode; 1526 struct ext4_inode *raw_fc_inode; 1527 struct inode *inode = NULL; 1528 struct ext4_iloc iloc; 1529 int inode_len, ino, ret, tag = tl->fc_tag; 1530 struct ext4_extent_header *eh; 1531 size_t off_gen = offsetof(struct ext4_inode, i_generation); 1532 1533 memcpy(&fc_inode, val, sizeof(fc_inode)); 1534 1535 ino = le32_to_cpu(fc_inode.fc_ino); 1536 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1537 1538 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1539 if (!IS_ERR(inode)) { 1540 ext4_ext_clear_bb(inode); 1541 iput(inode); 1542 } 1543 inode = NULL; 1544 1545 ret = ext4_fc_record_modified_inode(sb, ino); 1546 if (ret) 1547 goto out; 1548 1549 raw_fc_inode = (struct ext4_inode *) 1550 (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); 1551 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1552 if (ret) 1553 goto out; 1554 1555 inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); 1556 raw_inode = ext4_raw_inode(&iloc); 1557 1558 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1559 memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, 1560 inode_len - off_gen); 1561 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1562 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1563 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1564 memset(eh, 0, sizeof(*eh)); 1565 eh->eh_magic = EXT4_EXT_MAGIC; 1566 eh->eh_max = cpu_to_le16( 1567 (sizeof(raw_inode->i_block) - 1568 sizeof(struct ext4_extent_header)) 1569 / sizeof(struct ext4_extent)); 1570 } 1571 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1572 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1573 sizeof(raw_inode->i_block)); 1574 } 1575 1576 /* Immediately update the inode on disk. 
*/ 1577 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1578 if (ret) 1579 goto out_brelse; 1580 ret = sync_dirty_buffer(iloc.bh); 1581 if (ret) 1582 goto out_brelse; 1583 ret = ext4_mark_inode_used(sb, ino); 1584 if (ret) 1585 goto out_brelse; 1586 1587 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 1588 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1589 if (IS_ERR(inode)) { 1590 ext4_debug("Inode not found."); 1591 inode = NULL; 1592 ret = -EFSCORRUPTED; 1593 goto out_brelse; 1594 } 1595 1596 /* 1597 * Our allocator could have made different decisions than before 1598 * crashing. This should be fixed but until then, we calculate 1599 * the number of blocks the inode. 1600 */ 1601 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) 1602 ext4_ext_replay_set_iblocks(inode); 1603 1604 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); 1605 ext4_reset_inode_seed(inode); 1606 1607 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); 1608 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1609 sync_dirty_buffer(iloc.bh); 1610 out_brelse: 1611 brelse(iloc.bh); 1612 out: 1613 iput(inode); 1614 if (!ret) 1615 blkdev_issue_flush(sb->s_bdev); 1616 1617 return ret; 1618 } 1619 1620 /* 1621 * Dentry create replay function. 1622 * 1623 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the 1624 * inode for which we are trying to create a dentry here, should already have 1625 * been replayed before we start here. 
1626 */ 1627 static int ext4_fc_replay_create(struct super_block *sb, 1628 struct ext4_fc_tl_mem *tl, u8 *val) 1629 { 1630 int ret = 0; 1631 struct inode *inode = NULL; 1632 struct inode *dir = NULL; 1633 struct dentry_info_args darg; 1634 1635 tl_to_darg(&darg, tl, val); 1636 1637 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, 1638 darg.parent_ino, darg.dname_len); 1639 1640 /* This takes care of update group descriptor and other metadata */ 1641 ret = ext4_mark_inode_used(sb, darg.ino); 1642 if (ret) 1643 goto out; 1644 1645 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1646 if (IS_ERR(inode)) { 1647 ext4_debug("inode %d not found.", darg.ino); 1648 inode = NULL; 1649 ret = -EINVAL; 1650 goto out; 1651 } 1652 1653 if (S_ISDIR(inode->i_mode)) { 1654 /* 1655 * If we are creating a directory, we need to make sure that the 1656 * dot and dot dot dirents are setup properly. 1657 */ 1658 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); 1659 if (IS_ERR(dir)) { 1660 ext4_debug("Dir %d not found.", darg.ino); 1661 goto out; 1662 } 1663 ret = ext4_init_new_dir(NULL, dir, inode); 1664 iput(dir); 1665 if (ret) { 1666 ret = 0; 1667 goto out; 1668 } 1669 } 1670 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1671 if (ret) 1672 goto out; 1673 set_nlink(inode, 1); 1674 ext4_mark_inode_dirty(NULL, inode); 1675 out: 1676 iput(inode); 1677 return ret; 1678 } 1679 1680 /* 1681 * Record physical disk regions which are in use as per fast commit area, 1682 * and used by inodes during replay phase. Our simple replay phase 1683 * allocator excludes these regions from allocation. 
1684 */ 1685 int ext4_fc_record_regions(struct super_block *sb, int ino, 1686 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) 1687 { 1688 struct ext4_fc_replay_state *state; 1689 struct ext4_fc_alloc_region *region; 1690 1691 state = &EXT4_SB(sb)->s_fc_replay_state; 1692 /* 1693 * during replay phase, the fc_regions_valid may not same as 1694 * fc_regions_used, update it when do new additions. 1695 */ 1696 if (replay && state->fc_regions_used != state->fc_regions_valid) 1697 state->fc_regions_used = state->fc_regions_valid; 1698 if (state->fc_regions_used == state->fc_regions_size) { 1699 struct ext4_fc_alloc_region *fc_regions; 1700 1701 fc_regions = krealloc(state->fc_regions, 1702 sizeof(struct ext4_fc_alloc_region) * 1703 (state->fc_regions_size + 1704 EXT4_FC_REPLAY_REALLOC_INCREMENT), 1705 GFP_KERNEL); 1706 if (!fc_regions) 1707 return -ENOMEM; 1708 state->fc_regions_size += 1709 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1710 state->fc_regions = fc_regions; 1711 } 1712 region = &state->fc_regions[state->fc_regions_used++]; 1713 region->ino = ino; 1714 region->lblk = lblk; 1715 region->pblk = pblk; 1716 region->len = len; 1717 1718 if (replay) 1719 state->fc_regions_valid++; 1720 1721 return 0; 1722 } 1723 1724 /* Replay add range tag */ 1725 static int ext4_fc_replay_add_range(struct super_block *sb, u8 *val) 1726 { 1727 struct ext4_fc_add_range fc_add_ex; 1728 struct ext4_extent newex, *ex; 1729 struct inode *inode; 1730 ext4_lblk_t start, cur; 1731 int remaining, len; 1732 ext4_fsblk_t start_pblk; 1733 struct ext4_map_blocks map; 1734 struct ext4_ext_path *path = NULL; 1735 int ret; 1736 1737 memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); 1738 ex = (struct ext4_extent *)&fc_add_ex.fc_ex; 1739 1740 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1741 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), 1742 ext4_ext_get_actual_len(ex)); 1743 1744 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); 1745 if (IS_ERR(inode)) { 1746 
ext4_debug("Inode not found."); 1747 return 0; 1748 } 1749 1750 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1751 if (ret) 1752 goto out; 1753 1754 start = le32_to_cpu(ex->ee_block); 1755 start_pblk = ext4_ext_pblock(ex); 1756 len = ext4_ext_get_actual_len(ex); 1757 1758 cur = start; 1759 remaining = len; 1760 ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %llu\n", 1761 start, start_pblk, len, ext4_ext_is_unwritten(ex), 1762 inode->i_ino); 1763 1764 while (remaining > 0) { 1765 map.m_lblk = cur; 1766 map.m_len = remaining; 1767 map.m_pblk = 0; 1768 ret = ext4_map_blocks(NULL, inode, &map, 0); 1769 1770 if (ret < 0) 1771 goto out; 1772 1773 if (ret == 0) { 1774 /* Range is not mapped */ 1775 path = ext4_find_extent(inode, cur, path, 0); 1776 if (IS_ERR(path)) 1777 goto out; 1778 memset(&newex, 0, sizeof(newex)); 1779 newex.ee_block = cpu_to_le32(cur); 1780 ext4_ext_store_pblock( 1781 &newex, start_pblk + cur - start); 1782 newex.ee_len = cpu_to_le16(map.m_len); 1783 if (ext4_ext_is_unwritten(ex)) 1784 ext4_ext_mark_unwritten(&newex); 1785 down_write(&EXT4_I(inode)->i_data_sem); 1786 path = ext4_ext_insert_extent(NULL, inode, 1787 path, &newex, 0); 1788 up_write((&EXT4_I(inode)->i_data_sem)); 1789 if (IS_ERR(path)) 1790 goto out; 1791 goto next; 1792 } 1793 1794 if (start_pblk + cur - start != map.m_pblk) { 1795 /* 1796 * Logical to physical mapping changed. This can happen 1797 * if this range was removed and then reallocated to 1798 * map to new physical blocks during a fast commit. 1799 */ 1800 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1801 ext4_ext_is_unwritten(ex), 1802 start_pblk + cur - start); 1803 if (ret) 1804 goto out; 1805 /* 1806 * Mark the old blocks as free since they aren't used 1807 * anymore. We maintain an array of all the modified 1808 * inodes. 
In case these blocks are still used at either 1809 * a different logical range in the same inode or in 1810 * some different inode, we will mark them as allocated 1811 * at the end of the FC replay using our array of 1812 * modified inodes. 1813 */ 1814 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); 1815 goto next; 1816 } 1817 1818 /* Range is mapped and needs a state change */ 1819 ext4_debug("Converting from %ld to %d %lld", 1820 map.m_flags & EXT4_MAP_UNWRITTEN, 1821 ext4_ext_is_unwritten(ex), map.m_pblk); 1822 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1823 ext4_ext_is_unwritten(ex), map.m_pblk); 1824 if (ret) 1825 goto out; 1826 /* 1827 * We may have split the extent tree while toggling the state. 1828 * Try to shrink the extent tree now. 1829 */ 1830 ext4_ext_replay_shrink_inode(inode, start + len); 1831 next: 1832 cur += map.m_len; 1833 remaining -= map.m_len; 1834 } 1835 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 1836 sb->s_blocksize_bits); 1837 out: 1838 ext4_free_ext_path(path); 1839 iput(inode); 1840 return 0; 1841 } 1842 1843 /* Replay DEL_RANGE tag */ 1844 static int 1845 ext4_fc_replay_del_range(struct super_block *sb, u8 *val) 1846 { 1847 struct inode *inode; 1848 struct ext4_fc_del_range lrange; 1849 struct ext4_map_blocks map; 1850 ext4_lblk_t cur, remaining; 1851 int ret; 1852 1853 memcpy(&lrange, val, sizeof(lrange)); 1854 cur = le32_to_cpu(lrange.fc_lblk); 1855 remaining = le32_to_cpu(lrange.fc_len); 1856 1857 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, 1858 le32_to_cpu(lrange.fc_ino), cur, remaining); 1859 1860 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); 1861 if (IS_ERR(inode)) { 1862 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); 1863 return 0; 1864 } 1865 1866 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1867 if (ret) 1868 goto out; 1869 1870 ext4_debug("DEL_RANGE, inode %llu, lblk %d, len %d\n", 1871 inode->i_ino, le32_to_cpu(lrange.fc_lblk), 1872 
le32_to_cpu(lrange.fc_len)); 1873 while (remaining > 0) { 1874 map.m_lblk = cur; 1875 map.m_len = remaining; 1876 1877 ret = ext4_map_blocks(NULL, inode, &map, 0); 1878 if (ret < 0) 1879 goto out; 1880 if (ret > 0) { 1881 remaining -= ret; 1882 cur += ret; 1883 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); 1884 } else { 1885 remaining -= map.m_len; 1886 cur += map.m_len; 1887 } 1888 } 1889 1890 down_write(&EXT4_I(inode)->i_data_sem); 1891 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), 1892 le32_to_cpu(lrange.fc_lblk) + 1893 le32_to_cpu(lrange.fc_len) - 1); 1894 up_write(&EXT4_I(inode)->i_data_sem); 1895 if (ret) 1896 goto out; 1897 ext4_ext_replay_shrink_inode(inode, 1898 i_size_read(inode) >> sb->s_blocksize_bits); 1899 ext4_mark_inode_dirty(NULL, inode); 1900 out: 1901 iput(inode); 1902 return 0; 1903 } 1904 1905 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 1906 { 1907 struct ext4_fc_replay_state *state; 1908 struct inode *inode; 1909 struct ext4_ext_path *path = NULL; 1910 struct ext4_map_blocks map; 1911 int i, ret, j; 1912 ext4_lblk_t cur, end; 1913 1914 state = &EXT4_SB(sb)->s_fc_replay_state; 1915 for (i = 0; i < state->fc_modified_inodes_used; i++) { 1916 inode = ext4_iget(sb, state->fc_modified_inodes[i], 1917 EXT4_IGET_NORMAL); 1918 if (IS_ERR(inode)) { 1919 ext4_debug("Inode %d not found.", 1920 state->fc_modified_inodes[i]); 1921 continue; 1922 } 1923 cur = 0; 1924 end = EXT_MAX_BLOCKS; 1925 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { 1926 iput(inode); 1927 continue; 1928 } 1929 while (cur < end) { 1930 map.m_lblk = cur; 1931 map.m_len = end - cur; 1932 1933 ret = ext4_map_blocks(NULL, inode, &map, 0); 1934 if (ret < 0) 1935 break; 1936 1937 if (ret > 0) { 1938 path = ext4_find_extent(inode, map.m_lblk, path, 0); 1939 if (!IS_ERR(path)) { 1940 for (j = 0; j < path->p_depth; j++) 1941 ext4_mb_mark_bb(inode->i_sb, 1942 path[j].p_block, 1, true); 1943 } else { 1944 path = NULL; 1945 } 
1946 cur += ret; 1947 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, 1948 map.m_len, true); 1949 } else { 1950 cur = cur + (map.m_len ? map.m_len : 1); 1951 } 1952 } 1953 iput(inode); 1954 } 1955 1956 ext4_free_ext_path(path); 1957 } 1958 1959 /* 1960 * Check if block is in excluded regions for block allocation. The simple 1961 * allocator that runs during replay phase is calls this function to see 1962 * if it is okay to use a block. 1963 */ 1964 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) 1965 { 1966 int i; 1967 struct ext4_fc_replay_state *state; 1968 1969 state = &EXT4_SB(sb)->s_fc_replay_state; 1970 for (i = 0; i < state->fc_regions_valid; i++) { 1971 if (state->fc_regions[i].ino == 0 || 1972 state->fc_regions[i].len == 0) 1973 continue; 1974 if (in_range(blk, state->fc_regions[i].pblk, 1975 state->fc_regions[i].len)) 1976 return true; 1977 } 1978 return false; 1979 } 1980 1981 /* Cleanup function called after replay */ 1982 void ext4_fc_replay_cleanup(struct super_block *sb) 1983 { 1984 struct ext4_sb_info *sbi = EXT4_SB(sb); 1985 1986 sbi->s_mount_state &= ~EXT4_FC_REPLAY; 1987 kfree(sbi->s_fc_replay_state.fc_regions); 1988 kfree(sbi->s_fc_replay_state.fc_modified_inodes); 1989 } 1990 1991 static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, 1992 int tag, int len) 1993 { 1994 switch (tag) { 1995 case EXT4_FC_TAG_ADD_RANGE: 1996 return len == sizeof(struct ext4_fc_add_range); 1997 case EXT4_FC_TAG_DEL_RANGE: 1998 return len == sizeof(struct ext4_fc_del_range); 1999 case EXT4_FC_TAG_CREAT: 2000 case EXT4_FC_TAG_LINK: 2001 case EXT4_FC_TAG_UNLINK: 2002 len -= sizeof(struct ext4_fc_dentry_info); 2003 return len >= 1 && len <= EXT4_NAME_LEN; 2004 case EXT4_FC_TAG_INODE: 2005 len -= sizeof(struct ext4_fc_inode); 2006 return len >= EXT4_GOOD_OLD_INODE_SIZE && 2007 len <= sbi->s_inode_size; 2008 case EXT4_FC_TAG_PAD: 2009 return true; /* padding can have any length */ 2010 case EXT4_FC_TAG_TAIL: 2011 return len >= sizeof(struct 
ext4_fc_tail); 2012 case EXT4_FC_TAG_HEAD: 2013 return len == sizeof(struct ext4_fc_head); 2014 } 2015 return false; 2016 } 2017 2018 /* 2019 * Recovery Scan phase handler 2020 * 2021 * This function is called during the scan phase and is responsible 2022 * for doing following things: 2023 * - Make sure the fast commit area has valid tags for replay 2024 * - Count number of tags that need to be replayed by the replay handler 2025 * - Verify CRC 2026 * - Create a list of excluded blocks for allocation during replay phase 2027 * 2028 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is 2029 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP 2030 * to indicate that scan has finished and JBD2 can now start replay phase. 2031 * It returns a negative error to indicate that there was an error. At the end 2032 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set 2033 * to indicate the number of tags that need to replayed during the replay phase. 
2034 */ 2035 static int ext4_fc_replay_scan(journal_t *journal, 2036 struct buffer_head *bh, int off, 2037 tid_t expected_tid) 2038 { 2039 struct super_block *sb = journal->j_private; 2040 struct ext4_sb_info *sbi = EXT4_SB(sb); 2041 struct ext4_fc_replay_state *state; 2042 int ret = JBD2_FC_REPLAY_CONTINUE; 2043 struct ext4_fc_add_range ext; 2044 struct ext4_fc_tl_mem tl; 2045 struct ext4_fc_tail tail; 2046 __u8 *start, *end, *cur, *val; 2047 struct ext4_fc_head head; 2048 struct ext4_extent *ex; 2049 2050 state = &sbi->s_fc_replay_state; 2051 2052 start = (u8 *)bh->b_data; 2053 end = start + journal->j_blocksize; 2054 2055 if (state->fc_replay_expected_off == 0) { 2056 state->fc_cur_tag = 0; 2057 state->fc_replay_num_tags = 0; 2058 state->fc_crc = 0; 2059 state->fc_regions = NULL; 2060 state->fc_regions_valid = state->fc_regions_used = 2061 state->fc_regions_size = 0; 2062 /* Check if we can stop early */ 2063 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) 2064 != EXT4_FC_TAG_HEAD) 2065 return 0; 2066 } 2067 2068 if (off != state->fc_replay_expected_off) { 2069 ret = -EFSCORRUPTED; 2070 goto out_err; 2071 } 2072 2073 state->fc_replay_expected_off++; 2074 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; 2075 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { 2076 ext4_fc_get_tl(&tl, cur); 2077 val = cur + EXT4_FC_TAG_BASE_LEN; 2078 if (tl.fc_len > end - val || 2079 !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { 2080 ret = state->fc_replay_num_tags ? 
2081 JBD2_FC_REPLAY_STOP : -ECANCELED; 2082 goto out_err; 2083 } 2084 ext4_debug("Scan phase, tag:%s, blk %lld\n", 2085 tag2str(tl.fc_tag), bh->b_blocknr); 2086 switch (tl.fc_tag) { 2087 case EXT4_FC_TAG_ADD_RANGE: 2088 memcpy(&ext, val, sizeof(ext)); 2089 ex = (struct ext4_extent *)&ext.fc_ex; 2090 ret = ext4_fc_record_regions(sb, 2091 le32_to_cpu(ext.fc_ino), 2092 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), 2093 ext4_ext_get_actual_len(ex), 0); 2094 if (ret < 0) 2095 break; 2096 ret = JBD2_FC_REPLAY_CONTINUE; 2097 fallthrough; 2098 case EXT4_FC_TAG_DEL_RANGE: 2099 case EXT4_FC_TAG_LINK: 2100 case EXT4_FC_TAG_UNLINK: 2101 case EXT4_FC_TAG_CREAT: 2102 case EXT4_FC_TAG_INODE: 2103 case EXT4_FC_TAG_PAD: 2104 state->fc_cur_tag++; 2105 state->fc_crc = ext4_chksum(state->fc_crc, cur, 2106 EXT4_FC_TAG_BASE_LEN + tl.fc_len); 2107 break; 2108 case EXT4_FC_TAG_TAIL: 2109 state->fc_cur_tag++; 2110 memcpy(&tail, val, sizeof(tail)); 2111 state->fc_crc = ext4_chksum(state->fc_crc, cur, 2112 EXT4_FC_TAG_BASE_LEN + 2113 offsetof(struct ext4_fc_tail, 2114 fc_crc)); 2115 if (le32_to_cpu(tail.fc_tid) == expected_tid && 2116 le32_to_cpu(tail.fc_crc) == state->fc_crc) { 2117 state->fc_replay_num_tags = state->fc_cur_tag; 2118 state->fc_regions_valid = 2119 state->fc_regions_used; 2120 } else { 2121 ret = state->fc_replay_num_tags ? 2122 JBD2_FC_REPLAY_STOP : -EFSBADCRC; 2123 } 2124 state->fc_crc = 0; 2125 break; 2126 case EXT4_FC_TAG_HEAD: 2127 memcpy(&head, val, sizeof(head)); 2128 if (le32_to_cpu(head.fc_features) & 2129 ~EXT4_FC_SUPPORTED_FEATURES) { 2130 ret = -EOPNOTSUPP; 2131 break; 2132 } 2133 if (le32_to_cpu(head.fc_tid) != expected_tid) { 2134 ret = JBD2_FC_REPLAY_STOP; 2135 break; 2136 } 2137 state->fc_cur_tag++; 2138 state->fc_crc = ext4_chksum(state->fc_crc, cur, 2139 EXT4_FC_TAG_BASE_LEN + tl.fc_len); 2140 break; 2141 default: 2142 ret = state->fc_replay_num_tags ? 
2143 JBD2_FC_REPLAY_STOP : -ECANCELED; 2144 } 2145 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) 2146 break; 2147 } 2148 2149 out_err: 2150 trace_ext4_fc_replay_scan(sb, ret, off); 2151 return ret; 2152 } 2153 2154 /* 2155 * Main recovery path entry point. 2156 * The meaning of return codes is similar as above. 2157 */ 2158 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, 2159 enum passtype pass, int off, tid_t expected_tid) 2160 { 2161 struct super_block *sb = journal->j_private; 2162 struct ext4_sb_info *sbi = EXT4_SB(sb); 2163 struct ext4_fc_tl_mem tl; 2164 __u8 *start, *end, *cur, *val; 2165 int ret = JBD2_FC_REPLAY_CONTINUE; 2166 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; 2167 struct ext4_fc_tail tail; 2168 2169 if (pass == PASS_SCAN) { 2170 state->fc_current_pass = PASS_SCAN; 2171 return ext4_fc_replay_scan(journal, bh, off, expected_tid); 2172 } 2173 2174 if (state->fc_current_pass != pass) { 2175 state->fc_current_pass = pass; 2176 sbi->s_mount_state |= EXT4_FC_REPLAY; 2177 } 2178 if (!sbi->s_fc_replay_state.fc_replay_num_tags) { 2179 ext4_debug("Replay stops\n"); 2180 ext4_fc_set_bitmaps_and_counters(sb); 2181 return 0; 2182 } 2183 2184 #ifdef CONFIG_EXT4_DEBUG 2185 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { 2186 pr_warn("Dropping fc block %d because max_replay set\n", off); 2187 return JBD2_FC_REPLAY_STOP; 2188 } 2189 #endif 2190 2191 start = (u8 *)bh->b_data; 2192 end = start + journal->j_blocksize; 2193 2194 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; 2195 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { 2196 ext4_fc_get_tl(&tl, cur); 2197 val = cur + EXT4_FC_TAG_BASE_LEN; 2198 2199 if (state->fc_replay_num_tags == 0) { 2200 ret = JBD2_FC_REPLAY_STOP; 2201 ext4_fc_set_bitmaps_and_counters(sb); 2202 break; 2203 } 2204 2205 ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); 2206 state->fc_replay_num_tags--; 2207 switch (tl.fc_tag) { 2208 case EXT4_FC_TAG_LINK: 2209 ret = 
ext4_fc_replay_link(sb, &tl, val); 2210 break; 2211 case EXT4_FC_TAG_UNLINK: 2212 ret = ext4_fc_replay_unlink(sb, &tl, val); 2213 break; 2214 case EXT4_FC_TAG_ADD_RANGE: 2215 ret = ext4_fc_replay_add_range(sb, val); 2216 break; 2217 case EXT4_FC_TAG_CREAT: 2218 ret = ext4_fc_replay_create(sb, &tl, val); 2219 break; 2220 case EXT4_FC_TAG_DEL_RANGE: 2221 ret = ext4_fc_replay_del_range(sb, val); 2222 break; 2223 case EXT4_FC_TAG_INODE: 2224 ret = ext4_fc_replay_inode(sb, &tl, val); 2225 break; 2226 case EXT4_FC_TAG_PAD: 2227 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, 2228 tl.fc_len, 0); 2229 break; 2230 case EXT4_FC_TAG_TAIL: 2231 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 2232 0, tl.fc_len, 0); 2233 memcpy(&tail, val, sizeof(tail)); 2234 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); 2235 break; 2236 case EXT4_FC_TAG_HEAD: 2237 break; 2238 default: 2239 trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); 2240 ret = -ECANCELED; 2241 break; 2242 } 2243 if (ret < 0) 2244 break; 2245 ret = JBD2_FC_REPLAY_CONTINUE; 2246 } 2247 return ret; 2248 } 2249 2250 void ext4_fc_init(struct super_block *sb, journal_t *journal) 2251 { 2252 /* 2253 * We set replay callback even if fast commit disabled because we may 2254 * could still have fast commit blocks that need to be replayed even if 2255 * fast commit has now been turned off. 
2256 */ 2257 journal->j_fc_replay_callback = ext4_fc_replay; 2258 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 2259 return; 2260 journal->j_fc_cleanup_callback = ext4_fc_cleanup; 2261 } 2262 2263 static const char * const fc_ineligible_reasons[] = { 2264 [EXT4_FC_REASON_XATTR] = "Extended attributes changed", 2265 [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", 2266 [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", 2267 [EXT4_FC_REASON_NOMEM] = "Insufficient memory", 2268 [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", 2269 [EXT4_FC_REASON_RESIZE] = "Resize", 2270 [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", 2271 [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", 2272 [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", 2273 [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", 2274 [EXT4_FC_REASON_MIGRATE] = "Inode format migration", 2275 [EXT4_FC_REASON_VERITY] = "fs-verity enable", 2276 [EXT4_FC_REASON_MOVE_EXT] = "Move extents", 2277 }; 2278 2279 int ext4_fc_info_show(struct seq_file *seq, void *v) 2280 { 2281 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); 2282 struct ext4_fc_stats *stats = &sbi->s_fc_stats; 2283 int i; 2284 2285 if (v != SEQ_START_TOKEN) 2286 return 0; 2287 2288 seq_printf(seq, 2289 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", 2290 stats->fc_num_commits, stats->fc_ineligible_commits, 2291 stats->fc_numblks, 2292 div_u64(stats->s_fc_avg_commit_time, 1000)); 2293 seq_puts(seq, "Ineligible reasons:\n"); 2294 for (i = 0; i < EXT4_FC_REASON_MAX; i++) 2295 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], 2296 stats->fc_ineligible_reason_count[i]); 2297 2298 return 0; 2299 } 2300 2301 int __init ext4_fc_init_dentry_cache(void) 2302 { 2303 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, 2304 SLAB_RECLAIM_ACCOUNT); 2305 2306 if (ext4_fc_dentry_cachep == NULL) 2307 return -ENOMEM; 2308 2309 return 0; 2310 } 2311 2312 void ext4_fc_destroy_dentry_cache(void) 
2313 { 2314 kmem_cache_destroy(ext4_fc_dentry_cachep); 2315 } 2316