1 // SPDX-License-Identifier: GPL-2.0 2 3 /* 4 * fs/ext4/fast_commit.c 5 * 6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> 7 * 8 * Ext4 fast commits routines. 9 */ 10 #include "ext4.h" 11 #include "ext4_jbd2.h" 12 #include "ext4_extents.h" 13 #include "mballoc.h" 14 15 /* 16 * Ext4 Fast Commits 17 * ----------------- 18 * 19 * Ext4 fast commits implement fine grained journalling for Ext4. 20 * 21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See 22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by 23 * TLV during the recovery phase. For the scenarios for which we currently 24 * don't have replay code, fast commit falls back to full commits. 25 * Fast commits record delta in one of the following three categories. 26 * 27 * (A) Directory entry updates: 28 * 29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink 30 * - EXT4_FC_TAG_LINK - records directory entry link 31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation 32 * 33 * (B) File specific data range updates: 34 * 35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode 36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode 37 * 38 * (C) Inode metadata (mtime / ctime etc): 39 * 40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed 41 * during recovery. Note that iblocks field is 42 * not replayed and instead derived during 43 * replay. 44 * Commit Operation 45 * ---------------- 46 * With fast commits, we maintain all the directory entry operations in the 47 * order in which they are issued in an in-memory queue. This queue is flushed 48 * to disk during the commit operation. We also maintain a list of inodes 49 * that need to be committed during a fast commit in another in memory queue of 50 * inodes. 
During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 * [7] Wait for [4], [5] and [6] to complete.
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
 *
 * Fast Commit Ineligibility
 * -------------------------
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling one of the
 * two following functions:
 *
 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation fall
 *   back to full commit. This is useful in case of transient errors.
 *
 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
 *   the fast commits happening between ext4_fc_start_ineligible() and
 *   ext4_fc_stop_ineligible() and one fast commit after the call to
 *   ext4_fc_stop_ineligible() fall back to full commits. It is important to
 *   make one more fast commit fall back to full commit after the stop call so
 *   that it is guaranteed that the fast commit ineligible operation contained
 *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
 *   followed by at least 1 full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete.
Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent.
Consider following sequence of operations: 128 * 129 * rm A; mv B A; read A 130 * (x) (y) (z) 131 * 132 * (x), (y) and (z) are the points at which we can crash. If we store this 133 * sequence of operations as is then the replay is not idempotent. Let's say 134 * while in replay, we crash at (z). During the second replay, file A (which was 135 * actually created as a result of "mv B A" operation) would get deleted. Thus, 136 * file named A would be absent when we try to read A. So, this sequence of 137 * operations is not idempotent. However, as mentioned above, instead of storing 138 * the procedure fast commits store the outcome of each procedure. Thus the fast 139 * commit log for above procedure would be as follows: 140 * 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to 142 * inode 11 before the replay) 143 * 144 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] 145 * (w) (x) (y) (z) 146 * 147 * If we crash at (z), we will have file A linked to inode 11. During the second 148 * replay, we will remove file A (inode 11). But we will create it back and make 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled 152 * similarly. Thus, by converting a non-idempotent procedure into a series of 153 * idempotent outcomes, fast commits ensured idempotence during the replay. 154 * 155 * TODOs 156 * ----- 157 * 158 * 0) Fast commit replay path hardening: Fast commit replay code should use 159 * journal handles to make sure all the updates it does during the replay 160 * path are atomic. With that if we crash during fast commit replay, after 161 * trying to do recovery again, we will find a file system where fast commit 162 * area is invalid (because new full commit would be found). 
In order to deal 163 * with that, fast commit replay code should ensure that the "FC_REPLAY" 164 * superblock state is persisted before starting the replay, so that after 165 * the crash, fast commit recovery code can look at that flag and perform 166 * fast commit recovery even if that area is invalidated by later full 167 * commits. 168 * 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit 170 * eligible update must be protected within ext4_fc_start_update() and 171 * ext4_fc_stop_update(). These routines are called at much higher 172 * routines. This can be made more fine grained by combining with 173 * ext4_journal_start(). 174 * 175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() 176 * 177 * 3) Handle more ineligible cases. 178 */ 179 180 #include <trace/events/ext4.h> 181 static struct kmem_cache *ext4_fc_dentry_cachep; 182 183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 184 { 185 BUFFER_TRACE(bh, ""); 186 if (uptodate) { 187 ext4_debug("%s: Block %lld up-to-date", 188 __func__, bh->b_blocknr); 189 set_buffer_uptodate(bh); 190 } else { 191 ext4_debug("%s: Block %lld not up-to-date", 192 __func__, bh->b_blocknr); 193 clear_buffer_uptodate(bh); 194 } 195 196 unlock_buffer(bh); 197 } 198 199 static inline void ext4_fc_reset_inode(struct inode *inode) 200 { 201 struct ext4_inode_info *ei = EXT4_I(inode); 202 203 ei->i_fc_lblk_start = 0; 204 ei->i_fc_lblk_len = 0; 205 } 206 207 void ext4_fc_init_inode(struct inode *inode) 208 { 209 struct ext4_inode_info *ei = EXT4_I(inode); 210 211 ext4_fc_reset_inode(inode); 212 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); 213 INIT_LIST_HEAD(&ei->i_fc_list); 214 init_waitqueue_head(&ei->i_fc_wait); 215 atomic_set(&ei->i_fc_updates, 0); 216 } 217 218 /* This function must be called with sbi->s_fc_lock held. 
*/ 219 static void ext4_fc_wait_committing_inode(struct inode *inode) 220 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) 221 { 222 wait_queue_head_t *wq; 223 struct ext4_inode_info *ei = EXT4_I(inode); 224 225 #if (BITS_PER_LONG < 64) 226 DEFINE_WAIT_BIT(wait, &ei->i_state_flags, 227 EXT4_STATE_FC_COMMITTING); 228 wq = bit_waitqueue(&ei->i_state_flags, 229 EXT4_STATE_FC_COMMITTING); 230 #else 231 DEFINE_WAIT_BIT(wait, &ei->i_flags, 232 EXT4_STATE_FC_COMMITTING); 233 wq = bit_waitqueue(&ei->i_flags, 234 EXT4_STATE_FC_COMMITTING); 235 #endif 236 lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); 237 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 238 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 239 schedule(); 240 finish_wait(wq, &wait.wq_entry); 241 } 242 243 /* 244 * Inform Ext4's fast about start of an inode update 245 * 246 * This function is called by the high level call VFS callbacks before 247 * performing any inode update. This function blocks if there's an ongoing 248 * fast commit on the inode in question. 249 */ 250 void ext4_fc_start_update(struct inode *inode) 251 { 252 struct ext4_inode_info *ei = EXT4_I(inode); 253 254 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 255 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) 256 return; 257 258 restart: 259 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); 260 if (list_empty(&ei->i_fc_list)) 261 goto out; 262 263 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { 264 ext4_fc_wait_committing_inode(inode); 265 goto restart; 266 } 267 out: 268 atomic_inc(&ei->i_fc_updates); 269 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 270 } 271 272 /* 273 * Stop inode update and wake up waiting fast commits if any. 
274 */ 275 void ext4_fc_stop_update(struct inode *inode) 276 { 277 struct ext4_inode_info *ei = EXT4_I(inode); 278 279 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 280 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) 281 return; 282 283 if (atomic_dec_and_test(&ei->i_fc_updates)) 284 wake_up_all(&ei->i_fc_wait); 285 } 286 287 /* 288 * Remove inode from fast commit list. If the inode is being committed 289 * we wait until inode commit is done. 290 */ 291 void ext4_fc_del(struct inode *inode) 292 { 293 struct ext4_inode_info *ei = EXT4_I(inode); 294 295 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 296 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)) 297 return; 298 299 restart: 300 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); 301 if (list_empty(&ei->i_fc_list)) { 302 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 303 return; 304 } 305 306 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { 307 ext4_fc_wait_committing_inode(inode); 308 goto restart; 309 } 310 list_del_init(&ei->i_fc_list); 311 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); 312 } 313 314 /* 315 * Mark file system as fast commit ineligible. This means that next commit 316 * operation would result in a full jbd2 commit. 317 */ 318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason) 319 { 320 struct ext4_sb_info *sbi = EXT4_SB(sb); 321 322 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 323 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) 324 return; 325 326 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 327 WARN_ON(reason >= EXT4_FC_REASON_MAX); 328 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; 329 } 330 331 /* 332 * Start a fast commit ineligible update. Any commits that happen while 333 * such an operation is in progress fall back to full commits. 
334 */ 335 void ext4_fc_start_ineligible(struct super_block *sb, int reason) 336 { 337 struct ext4_sb_info *sbi = EXT4_SB(sb); 338 339 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 340 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) 341 return; 342 343 WARN_ON(reason >= EXT4_FC_REASON_MAX); 344 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; 345 atomic_inc(&sbi->s_fc_ineligible_updates); 346 } 347 348 /* 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here 350 * to ensure that after stopping the ineligible update, at least one full 351 * commit takes place. 352 */ 353 void ext4_fc_stop_ineligible(struct super_block *sb) 354 { 355 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 356 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) 357 return; 358 359 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 360 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); 361 } 362 363 static inline int ext4_fc_is_ineligible(struct super_block *sb) 364 { 365 return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) || 366 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates)); 367 } 368 369 /* 370 * Generic fast commit tracking function. If this is the first time this we are 371 * called after a full commit, we initialize fast commit fields and then call 372 * __fc_track_fn() with update = 0. If we have already been called after a full 373 * commit, we pass update = 1. Based on that, the track function can determine 374 * if it needs to track a field for the first time or if it needs to just 375 * update the previously tracked value. 376 * 377 * If enqueue is set, this function enqueues the inode in fast commit list. 
378 */ 379 static int ext4_fc_track_template( 380 handle_t *handle, struct inode *inode, 381 int (*__fc_track_fn)(struct inode *, void *, bool), 382 void *args, int enqueue) 383 { 384 bool update = false; 385 struct ext4_inode_info *ei = EXT4_I(inode); 386 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 387 tid_t tid = 0; 388 int ret; 389 390 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) || 391 (sbi->s_mount_state & EXT4_FC_REPLAY)) 392 return -EOPNOTSUPP; 393 394 if (ext4_fc_is_ineligible(inode->i_sb)) 395 return -EINVAL; 396 397 tid = handle->h_transaction->t_tid; 398 mutex_lock(&ei->i_fc_lock); 399 if (tid == ei->i_sync_tid) { 400 update = true; 401 } else { 402 ext4_fc_reset_inode(inode); 403 ei->i_sync_tid = tid; 404 } 405 ret = __fc_track_fn(inode, args, update); 406 mutex_unlock(&ei->i_fc_lock); 407 408 if (!enqueue) 409 return ret; 410 411 spin_lock(&sbi->s_fc_lock); 412 if (list_empty(&EXT4_I(inode)->i_fc_list)) 413 list_add_tail(&EXT4_I(inode)->i_fc_list, 414 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ? 415 &sbi->s_fc_q[FC_Q_STAGING] : 416 &sbi->s_fc_q[FC_Q_MAIN]); 417 spin_unlock(&sbi->s_fc_lock); 418 419 return ret; 420 } 421 422 struct __track_dentry_update_args { 423 struct dentry *dentry; 424 int op; 425 }; 426 427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. 
*/ 428 static int __track_dentry_update(struct inode *inode, void *arg, bool update) 429 { 430 struct ext4_fc_dentry_update *node; 431 struct ext4_inode_info *ei = EXT4_I(inode); 432 struct __track_dentry_update_args *dentry_update = 433 (struct __track_dentry_update_args *)arg; 434 struct dentry *dentry = dentry_update->dentry; 435 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 436 437 mutex_unlock(&ei->i_fc_lock); 438 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); 439 if (!node) { 440 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); 441 mutex_lock(&ei->i_fc_lock); 442 return -ENOMEM; 443 } 444 445 node->fcd_op = dentry_update->op; 446 node->fcd_parent = dentry->d_parent->d_inode->i_ino; 447 node->fcd_ino = inode->i_ino; 448 if (dentry->d_name.len > DNAME_INLINE_LEN) { 449 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); 450 if (!node->fcd_name.name) { 451 kmem_cache_free(ext4_fc_dentry_cachep, node); 452 ext4_fc_mark_ineligible(inode->i_sb, 453 EXT4_FC_REASON_NOMEM); 454 mutex_lock(&ei->i_fc_lock); 455 return -ENOMEM; 456 } 457 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, 458 dentry->d_name.len); 459 } else { 460 memcpy(node->fcd_iname, dentry->d_name.name, 461 dentry->d_name.len); 462 node->fcd_name.name = node->fcd_iname; 463 } 464 node->fcd_name.len = dentry->d_name.len; 465 466 spin_lock(&sbi->s_fc_lock); 467 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) 468 list_add_tail(&node->fcd_list, 469 &sbi->s_fc_dentry_q[FC_Q_STAGING]); 470 else 471 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); 472 spin_unlock(&sbi->s_fc_lock); 473 mutex_lock(&ei->i_fc_lock); 474 475 return 0; 476 } 477 478 void __ext4_fc_track_unlink(handle_t *handle, 479 struct inode *inode, struct dentry *dentry) 480 { 481 struct __track_dentry_update_args args; 482 int ret; 483 484 args.dentry = dentry; 485 args.op = EXT4_FC_TAG_UNLINK; 486 487 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 488 
(void *)&args, 0); 489 trace_ext4_fc_track_unlink(inode, dentry, ret); 490 } 491 492 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) 493 { 494 __ext4_fc_track_unlink(handle, d_inode(dentry), dentry); 495 } 496 497 void __ext4_fc_track_link(handle_t *handle, 498 struct inode *inode, struct dentry *dentry) 499 { 500 struct __track_dentry_update_args args; 501 int ret; 502 503 args.dentry = dentry; 504 args.op = EXT4_FC_TAG_LINK; 505 506 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 507 (void *)&args, 0); 508 trace_ext4_fc_track_link(inode, dentry, ret); 509 } 510 511 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) 512 { 513 __ext4_fc_track_link(handle, d_inode(dentry), dentry); 514 } 515 516 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) 517 { 518 struct __track_dentry_update_args args; 519 struct inode *inode = d_inode(dentry); 520 int ret; 521 522 args.dentry = dentry; 523 args.op = EXT4_FC_TAG_CREAT; 524 525 ret = ext4_fc_track_template(handle, inode, __track_dentry_update, 526 (void *)&args, 0); 527 trace_ext4_fc_track_create(inode, dentry, ret); 528 } 529 530 /* __track_fn for inode tracking */ 531 static int __track_inode(struct inode *inode, void *arg, bool update) 532 { 533 if (update) 534 return -EEXIST; 535 536 EXT4_I(inode)->i_fc_lblk_len = 0; 537 538 return 0; 539 } 540 541 void ext4_fc_track_inode(handle_t *handle, struct inode *inode) 542 { 543 int ret; 544 545 if (S_ISDIR(inode->i_mode)) 546 return; 547 548 if (ext4_should_journal_data(inode)) { 549 ext4_fc_mark_ineligible(inode->i_sb, 550 EXT4_FC_REASON_INODE_JOURNAL_DATA); 551 return; 552 } 553 554 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); 555 trace_ext4_fc_track_inode(inode, ret); 556 } 557 558 struct __track_range_args { 559 ext4_lblk_t start, end; 560 }; 561 562 /* __track_fn for tracking data updates */ 563 static int __track_range(struct inode *inode, void *arg, bool update) 564 { 565 struct 
ext4_inode_info *ei = EXT4_I(inode); 566 ext4_lblk_t oldstart; 567 struct __track_range_args *__arg = 568 (struct __track_range_args *)arg; 569 570 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { 571 ext4_debug("Special inode %ld being modified\n", inode->i_ino); 572 return -ECANCELED; 573 } 574 575 oldstart = ei->i_fc_lblk_start; 576 577 if (update && ei->i_fc_lblk_len > 0) { 578 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); 579 ei->i_fc_lblk_len = 580 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - 581 ei->i_fc_lblk_start + 1; 582 } else { 583 ei->i_fc_lblk_start = __arg->start; 584 ei->i_fc_lblk_len = __arg->end - __arg->start + 1; 585 } 586 587 return 0; 588 } 589 590 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, 591 ext4_lblk_t end) 592 { 593 struct __track_range_args args; 594 int ret; 595 596 if (S_ISDIR(inode->i_mode)) 597 return; 598 599 args.start = start; 600 args.end = end; 601 602 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); 603 604 trace_ext4_fc_track_range(inode, start, end, ret); 605 } 606 607 static void ext4_fc_submit_bh(struct super_block *sb) 608 { 609 int write_flags = REQ_SYNC; 610 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; 611 612 /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */ 613 if (test_opt(sb, BARRIER)) 614 write_flags |= REQ_FUA | REQ_PREFLUSH; 615 lock_buffer(bh); 616 set_buffer_dirty(bh); 617 set_buffer_uptodate(bh); 618 bh->b_end_io = ext4_end_buffer_io_sync; 619 submit_bh(REQ_OP_WRITE, write_flags, bh); 620 EXT4_SB(sb)->s_fc_bh = NULL; 621 } 622 623 /* Ext4 commit path routines */ 624 625 /* memzero and update CRC */ 626 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len, 627 u32 *crc) 628 { 629 void *ret; 630 631 ret = memset(dst, 0, len); 632 if (crc) 633 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len); 634 return ret; 635 } 636 637 /* 638 * Allocate len bytes on a fast commit buffer. 
639 * 640 * During the commit time this function is used to manage fast commit 641 * block space. We don't split a fast commit log onto different 642 * blocks. So this function makes sure that if there's not enough space 643 * on the current block, the remaining space in the current block is 644 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, 645 * new block is from jbd2 and CRC is updated to reflect the padding 646 * we added. 647 */ 648 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) 649 { 650 struct ext4_fc_tl *tl; 651 struct ext4_sb_info *sbi = EXT4_SB(sb); 652 struct buffer_head *bh; 653 int bsize = sbi->s_journal->j_blocksize; 654 int ret, off = sbi->s_fc_bytes % bsize; 655 int pad_len; 656 657 /* 658 * After allocating len, we should have space at least for a 0 byte 659 * padding. 660 */ 661 if (len + sizeof(struct ext4_fc_tl) > bsize) 662 return NULL; 663 664 if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) { 665 /* 666 * Only allocate from current buffer if we have enough space for 667 * this request AND we have space to add a zero byte padding. 
668 */ 669 if (!sbi->s_fc_bh) { 670 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 671 if (ret) 672 return NULL; 673 sbi->s_fc_bh = bh; 674 } 675 sbi->s_fc_bytes += len; 676 return sbi->s_fc_bh->b_data + off; 677 } 678 /* Need to add PAD tag */ 679 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off); 680 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); 681 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl); 682 tl->fc_len = cpu_to_le16(pad_len); 683 if (crc) 684 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); 685 if (pad_len > 0) 686 ext4_fc_memzero(sb, tl + 1, pad_len, crc); 687 ext4_fc_submit_bh(sb); 688 689 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); 690 if (ret) 691 return NULL; 692 sbi->s_fc_bh = bh; 693 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len; 694 return sbi->s_fc_bh->b_data; 695 } 696 697 /* memcpy to fc reserved space and update CRC */ 698 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src, 699 int len, u32 *crc) 700 { 701 if (crc) 702 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len); 703 return memcpy(dst, src, len); 704 } 705 706 /* 707 * Complete a fast commit by writing tail tag. 708 * 709 * Writing tail tag marks the end of a fast commit. In order to guarantee 710 * atomicity, after writing tail tag, even if there's space remaining 711 * in the block, next commit shouldn't use it. That's why tail tag 712 * has the length as that of the remaining space on the block. 713 */ 714 static int ext4_fc_write_tail(struct super_block *sb, u32 crc) 715 { 716 struct ext4_sb_info *sbi = EXT4_SB(sb); 717 struct ext4_fc_tl tl; 718 struct ext4_fc_tail tail; 719 int off, bsize = sbi->s_journal->j_blocksize; 720 u8 *dst; 721 722 /* 723 * ext4_fc_reserve_space takes care of allocating an extra block if 724 * there's no enough space on this block for accommodating this tail. 
725 */ 726 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc); 727 if (!dst) 728 return -ENOSPC; 729 730 off = sbi->s_fc_bytes % bsize; 731 732 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); 733 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail)); 734 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); 735 736 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc); 737 dst += sizeof(tl); 738 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); 739 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc); 740 dst += sizeof(tail.fc_tid); 741 tail.fc_crc = cpu_to_le32(crc); 742 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); 743 744 ext4_fc_submit_bh(sb); 745 746 return 0; 747 } 748 749 /* 750 * Adds tag, length, value and updates CRC. Returns true if tlv was added. 751 * Returns false if there's not enough space. 752 */ 753 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, 754 u32 *crc) 755 { 756 struct ext4_fc_tl tl; 757 u8 *dst; 758 759 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc); 760 if (!dst) 761 return false; 762 763 tl.fc_tag = cpu_to_le16(tag); 764 tl.fc_len = cpu_to_le16(len); 765 766 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 767 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc); 768 769 return true; 770 } 771 772 /* Same as above, but adds dentry tlv. 
*/ 773 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag, 774 int parent_ino, int ino, int dlen, 775 const unsigned char *dname, 776 u32 *crc) 777 { 778 struct ext4_fc_dentry_info fcd; 779 struct ext4_fc_tl tl; 780 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, 781 crc); 782 783 if (!dst) 784 return false; 785 786 fcd.fc_parent_ino = cpu_to_le32(parent_ino); 787 fcd.fc_ino = cpu_to_le32(ino); 788 tl.fc_tag = cpu_to_le16(tag); 789 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); 790 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); 791 dst += sizeof(tl); 792 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); 793 dst += sizeof(fcd); 794 ext4_fc_memcpy(sb, dst, dname, dlen, crc); 795 dst += dlen; 796 797 return true; 798 } 799 800 /* 801 * Writes inode in the fast commit space under TLV with tag @tag. 802 * Returns 0 on success, error on failure. 803 */ 804 static int ext4_fc_write_inode(struct inode *inode, u32 *crc) 805 { 806 struct ext4_inode_info *ei = EXT4_I(inode); 807 int inode_len = EXT4_GOOD_OLD_INODE_SIZE; 808 int ret; 809 struct ext4_iloc iloc; 810 struct ext4_fc_inode fc_inode; 811 struct ext4_fc_tl tl; 812 u8 *dst; 813 814 ret = ext4_get_inode_loc(inode, &iloc); 815 if (ret) 816 return ret; 817 818 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 819 inode_len += ei->i_extra_isize; 820 821 fc_inode.fc_ino = cpu_to_le32(inode->i_ino); 822 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); 823 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); 824 825 dst = ext4_fc_reserve_space(inode->i_sb, 826 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc); 827 if (!dst) 828 return -ECANCELED; 829 830 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc)) 831 return -ECANCELED; 832 dst += sizeof(tl); 833 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc)) 834 return -ECANCELED; 835 dst += sizeof(fc_inode); 836 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc), 837 inode_len, 
crc)) 838 return -ECANCELED; 839 840 return 0; 841 } 842 843 /* 844 * Writes updated data ranges for the inode in question. Updates CRC. 845 * Returns 0 on success, error otherwise. 846 */ 847 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) 848 { 849 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; 850 struct ext4_inode_info *ei = EXT4_I(inode); 851 struct ext4_map_blocks map; 852 struct ext4_fc_add_range fc_ext; 853 struct ext4_fc_del_range lrange; 854 struct ext4_extent *ex; 855 int ret; 856 857 mutex_lock(&ei->i_fc_lock); 858 if (ei->i_fc_lblk_len == 0) { 859 mutex_unlock(&ei->i_fc_lock); 860 return 0; 861 } 862 old_blk_size = ei->i_fc_lblk_start; 863 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; 864 ei->i_fc_lblk_len = 0; 865 mutex_unlock(&ei->i_fc_lock); 866 867 cur_lblk_off = old_blk_size; 868 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", 869 __func__, cur_lblk_off, new_blk_size, inode->i_ino); 870 871 while (cur_lblk_off <= new_blk_size) { 872 map.m_lblk = cur_lblk_off; 873 map.m_len = new_blk_size - cur_lblk_off + 1; 874 ret = ext4_map_blocks(NULL, inode, &map, 0); 875 if (ret < 0) 876 return -ECANCELED; 877 878 if (map.m_len == 0) { 879 cur_lblk_off++; 880 continue; 881 } 882 883 if (ret == 0) { 884 lrange.fc_ino = cpu_to_le32(inode->i_ino); 885 lrange.fc_lblk = cpu_to_le32(map.m_lblk); 886 lrange.fc_len = cpu_to_le32(map.m_len); 887 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, 888 sizeof(lrange), (u8 *)&lrange, crc)) 889 return -ENOSPC; 890 } else { 891 fc_ext.fc_ino = cpu_to_le32(inode->i_ino); 892 ex = (struct ext4_extent *)&fc_ext.fc_ex; 893 ex->ee_block = cpu_to_le32(map.m_lblk); 894 ex->ee_len = cpu_to_le16(map.m_len); 895 ext4_ext_store_pblock(ex, map.m_pblk); 896 if (map.m_flags & EXT4_MAP_UNWRITTEN) 897 ext4_ext_mark_unwritten(ex); 898 else 899 ext4_ext_mark_initialized(ex); 900 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, 901 sizeof(fc_ext), (u8 *)&fc_ext, crc)) 902 
return -ENOSPC; 903 } 904 905 cur_lblk_off += map.m_len; 906 } 907 908 return 0; 909 } 910 911 912 /* Submit data for all the fast commit inodes */ 913 static int ext4_fc_submit_inode_data_all(journal_t *journal) 914 { 915 struct super_block *sb = (struct super_block *)(journal->j_private); 916 struct ext4_sb_info *sbi = EXT4_SB(sb); 917 struct ext4_inode_info *ei; 918 struct list_head *pos; 919 int ret = 0; 920 921 spin_lock(&sbi->s_fc_lock); 922 ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); 923 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { 924 ei = list_entry(pos, struct ext4_inode_info, i_fc_list); 925 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); 926 while (atomic_read(&ei->i_fc_updates)) { 927 DEFINE_WAIT(wait); 928 929 prepare_to_wait(&ei->i_fc_wait, &wait, 930 TASK_UNINTERRUPTIBLE); 931 if (atomic_read(&ei->i_fc_updates)) { 932 spin_unlock(&sbi->s_fc_lock); 933 schedule(); 934 spin_lock(&sbi->s_fc_lock); 935 } 936 finish_wait(&ei->i_fc_wait, &wait); 937 } 938 spin_unlock(&sbi->s_fc_lock); 939 ret = jbd2_submit_inode_data(ei->jinode); 940 if (ret) 941 return ret; 942 spin_lock(&sbi->s_fc_lock); 943 } 944 spin_unlock(&sbi->s_fc_lock); 945 946 return ret; 947 } 948 949 /* Wait for completion of data for all the fast commit inodes */ 950 static int ext4_fc_wait_inode_data_all(journal_t *journal) 951 { 952 struct super_block *sb = (struct super_block *)(journal->j_private); 953 struct ext4_sb_info *sbi = EXT4_SB(sb); 954 struct ext4_inode_info *pos, *n; 955 int ret = 0; 956 957 spin_lock(&sbi->s_fc_lock); 958 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { 959 if (!ext4_test_inode_state(&pos->vfs_inode, 960 EXT4_STATE_FC_COMMITTING)) 961 continue; 962 spin_unlock(&sbi->s_fc_lock); 963 964 ret = jbd2_wait_inode_data(journal, pos->jinode); 965 if (ret) 966 return ret; 967 spin_lock(&sbi->s_fc_lock); 968 } 969 spin_unlock(&sbi->s_fc_lock); 970 971 return 0; 972 } 973 974 /* Commit all the directory entry updates */ 975 
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) 976 __acquires(&sbi->s_fc_lock) 977 __releases(&sbi->s_fc_lock) 978 { 979 struct super_block *sb = (struct super_block *)(journal->j_private); 980 struct ext4_sb_info *sbi = EXT4_SB(sb); 981 struct ext4_fc_dentry_update *fc_dentry; 982 struct inode *inode; 983 struct list_head *pos, *n, *fcd_pos, *fcd_n; 984 struct ext4_inode_info *ei; 985 int ret; 986 987 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) 988 return 0; 989 list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) { 990 fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update, 991 fcd_list); 992 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { 993 spin_unlock(&sbi->s_fc_lock); 994 if (!ext4_fc_add_dentry_tlv( 995 sb, fc_dentry->fcd_op, 996 fc_dentry->fcd_parent, fc_dentry->fcd_ino, 997 fc_dentry->fcd_name.len, 998 fc_dentry->fcd_name.name, crc)) { 999 ret = -ENOSPC; 1000 goto lock_and_exit; 1001 } 1002 spin_lock(&sbi->s_fc_lock); 1003 continue; 1004 } 1005 1006 inode = NULL; 1007 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { 1008 ei = list_entry(pos, struct ext4_inode_info, i_fc_list); 1009 if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { 1010 inode = &ei->vfs_inode; 1011 break; 1012 } 1013 } 1014 /* 1015 * If we don't find inode in our list, then it was deleted, 1016 * in which case, we don't need to record it's create tag. 1017 */ 1018 if (!inode) 1019 continue; 1020 spin_unlock(&sbi->s_fc_lock); 1021 1022 /* 1023 * We first write the inode and then the create dirent. This 1024 * allows the recovery code to create an unnamed inode first 1025 * and then link it to a directory entry. This allows us 1026 * to use namei.c routines almost as is and simplifies 1027 * the recovery code. 
1028 */ 1029 ret = ext4_fc_write_inode(inode, crc); 1030 if (ret) 1031 goto lock_and_exit; 1032 1033 ret = ext4_fc_write_inode_data(inode, crc); 1034 if (ret) 1035 goto lock_and_exit; 1036 1037 if (!ext4_fc_add_dentry_tlv( 1038 sb, fc_dentry->fcd_op, 1039 fc_dentry->fcd_parent, fc_dentry->fcd_ino, 1040 fc_dentry->fcd_name.len, 1041 fc_dentry->fcd_name.name, crc)) { 1042 ret = -ENOSPC; 1043 goto lock_and_exit; 1044 } 1045 1046 spin_lock(&sbi->s_fc_lock); 1047 } 1048 return 0; 1049 lock_and_exit: 1050 spin_lock(&sbi->s_fc_lock); 1051 return ret; 1052 } 1053 1054 static int ext4_fc_perform_commit(journal_t *journal) 1055 { 1056 struct super_block *sb = (struct super_block *)(journal->j_private); 1057 struct ext4_sb_info *sbi = EXT4_SB(sb); 1058 struct ext4_inode_info *iter; 1059 struct ext4_fc_head head; 1060 struct list_head *pos; 1061 struct inode *inode; 1062 struct blk_plug plug; 1063 int ret = 0; 1064 u32 crc = 0; 1065 1066 ret = ext4_fc_submit_inode_data_all(journal); 1067 if (ret) 1068 return ret; 1069 1070 ret = ext4_fc_wait_inode_data_all(journal); 1071 if (ret) 1072 return ret; 1073 1074 /* 1075 * If file system device is different from journal device, issue a cache 1076 * flush before we start writing fast commit blocks. 1077 */ 1078 if (journal->j_fs_dev != journal->j_dev) 1079 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); 1080 1081 blk_start_plug(&plug); 1082 if (sbi->s_fc_bytes == 0) { 1083 /* 1084 * Add a head tag only if this is the first fast commit 1085 * in this TID. 
1086 */ 1087 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); 1088 head.fc_tid = cpu_to_le32( 1089 sbi->s_journal->j_running_transaction->t_tid); 1090 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), 1091 (u8 *)&head, &crc)) 1092 goto out; 1093 } 1094 1095 spin_lock(&sbi->s_fc_lock); 1096 ret = ext4_fc_commit_dentry_updates(journal, &crc); 1097 if (ret) { 1098 spin_unlock(&sbi->s_fc_lock); 1099 goto out; 1100 } 1101 1102 list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { 1103 iter = list_entry(pos, struct ext4_inode_info, i_fc_list); 1104 inode = &iter->vfs_inode; 1105 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) 1106 continue; 1107 1108 spin_unlock(&sbi->s_fc_lock); 1109 ret = ext4_fc_write_inode_data(inode, &crc); 1110 if (ret) 1111 goto out; 1112 ret = ext4_fc_write_inode(inode, &crc); 1113 if (ret) 1114 goto out; 1115 spin_lock(&sbi->s_fc_lock); 1116 } 1117 spin_unlock(&sbi->s_fc_lock); 1118 1119 ret = ext4_fc_write_tail(sb, crc); 1120 1121 out: 1122 blk_finish_plug(&plug); 1123 return ret; 1124 } 1125 1126 /* 1127 * The main commit entry point. Performs a fast commit for transaction 1128 * commit_tid if needed. If it's not possible to perform a fast commit 1129 * due to various reasons, we fall back to full commit. Returns 0 1130 * on success, error otherwise. 
1131 */ 1132 int ext4_fc_commit(journal_t *journal, tid_t commit_tid) 1133 { 1134 struct super_block *sb = (struct super_block *)(journal->j_private); 1135 struct ext4_sb_info *sbi = EXT4_SB(sb); 1136 int nblks = 0, ret, bsize = journal->j_blocksize; 1137 int subtid = atomic_read(&sbi->s_fc_subtid); 1138 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0; 1139 ktime_t start_time, commit_time; 1140 1141 trace_ext4_fc_commit_start(sb); 1142 1143 start_time = ktime_get(); 1144 1145 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 1146 (ext4_fc_is_ineligible(sb))) { 1147 reason = EXT4_FC_REASON_INELIGIBLE; 1148 goto out; 1149 } 1150 1151 restart_fc: 1152 ret = jbd2_fc_begin_commit(journal, commit_tid); 1153 if (ret == -EALREADY) { 1154 /* There was an ongoing commit, check if we need to restart */ 1155 if (atomic_read(&sbi->s_fc_subtid) <= subtid && 1156 commit_tid > journal->j_commit_sequence) 1157 goto restart_fc; 1158 reason = EXT4_FC_REASON_ALREADY_COMMITTED; 1159 goto out; 1160 } else if (ret) { 1161 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; 1162 reason = EXT4_FC_REASON_FC_START_FAILED; 1163 goto out; 1164 } 1165 1166 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; 1167 ret = ext4_fc_perform_commit(journal); 1168 if (ret < 0) { 1169 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; 1170 reason = EXT4_FC_REASON_FC_FAILED; 1171 goto out; 1172 } 1173 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; 1174 ret = jbd2_fc_wait_bufs(journal, nblks); 1175 if (ret < 0) { 1176 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; 1177 reason = EXT4_FC_REASON_FC_FAILED; 1178 goto out; 1179 } 1180 atomic_inc(&sbi->s_fc_subtid); 1181 jbd2_fc_end_commit(journal); 1182 out: 1183 /* Has any ineligible update happened since we started? 
*/ 1184 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) { 1185 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++; 1186 reason = EXT4_FC_REASON_INELIGIBLE; 1187 } 1188 1189 spin_lock(&sbi->s_fc_lock); 1190 if (reason != EXT4_FC_REASON_OK && 1191 reason != EXT4_FC_REASON_ALREADY_COMMITTED) { 1192 sbi->s_fc_stats.fc_ineligible_commits++; 1193 } else { 1194 sbi->s_fc_stats.fc_num_commits++; 1195 sbi->s_fc_stats.fc_numblks += nblks; 1196 } 1197 spin_unlock(&sbi->s_fc_lock); 1198 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0; 1199 trace_ext4_fc_commit_stop(sb, nblks, reason); 1200 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1201 /* 1202 * weight the commit time higher than the average time so we don't 1203 * react too strongly to vast changes in the commit time 1204 */ 1205 if (likely(sbi->s_fc_avg_commit_time)) 1206 sbi->s_fc_avg_commit_time = (commit_time + 1207 sbi->s_fc_avg_commit_time * 3) / 4; 1208 else 1209 sbi->s_fc_avg_commit_time = commit_time; 1210 jbd_debug(1, 1211 "Fast commit ended with blks = %d, reason = %d, subtid - %d", 1212 nblks, reason, subtid); 1213 if (reason == EXT4_FC_REASON_FC_FAILED) 1214 return jbd2_fc_end_commit_fallback(journal); 1215 if (reason == EXT4_FC_REASON_FC_START_FAILED || 1216 reason == EXT4_FC_REASON_INELIGIBLE) 1217 return jbd2_complete_transaction(journal, commit_tid); 1218 return 0; 1219 } 1220 1221 /* 1222 * Fast commit cleanup routine. This is called after every fast commit and 1223 * full commit. full is true if we are called after a full commit. 
1224 */ 1225 static void ext4_fc_cleanup(journal_t *journal, int full) 1226 { 1227 struct super_block *sb = journal->j_private; 1228 struct ext4_sb_info *sbi = EXT4_SB(sb); 1229 struct ext4_inode_info *iter; 1230 struct ext4_fc_dentry_update *fc_dentry; 1231 struct list_head *pos, *n; 1232 1233 if (full && sbi->s_fc_bh) 1234 sbi->s_fc_bh = NULL; 1235 1236 jbd2_fc_release_bufs(journal); 1237 1238 spin_lock(&sbi->s_fc_lock); 1239 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { 1240 iter = list_entry(pos, struct ext4_inode_info, i_fc_list); 1241 list_del_init(&iter->i_fc_list); 1242 ext4_clear_inode_state(&iter->vfs_inode, 1243 EXT4_STATE_FC_COMMITTING); 1244 ext4_fc_reset_inode(&iter->vfs_inode); 1245 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ 1246 smp_mb(); 1247 #if (BITS_PER_LONG < 64) 1248 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); 1249 #else 1250 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); 1251 #endif 1252 } 1253 1254 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { 1255 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], 1256 struct ext4_fc_dentry_update, 1257 fcd_list); 1258 list_del_init(&fc_dentry->fcd_list); 1259 spin_unlock(&sbi->s_fc_lock); 1260 1261 if (fc_dentry->fcd_name.name && 1262 fc_dentry->fcd_name.len > DNAME_INLINE_LEN) 1263 kfree(fc_dentry->fcd_name.name); 1264 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); 1265 spin_lock(&sbi->s_fc_lock); 1266 } 1267 1268 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], 1269 &sbi->s_fc_dentry_q[FC_Q_MAIN]); 1270 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], 1271 &sbi->s_fc_q[FC_Q_STAGING]); 1272 1273 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); 1274 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 1275 1276 if (full) 1277 sbi->s_fc_bytes = 0; 1278 spin_unlock(&sbi->s_fc_lock); 1279 trace_ext4_fc_stats(sb); 1280 } 1281 1282 /* Ext4 Replay Path Routines */ 1283 1284 /* Get length of a particular tlv */ 1285 static inline int 
ext4_fc_tag_len(struct ext4_fc_tl *tl) 1286 { 1287 return le16_to_cpu(tl->fc_len); 1288 } 1289 1290 /* Get a pointer to "value" of a tlv */ 1291 static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) 1292 { 1293 return (u8 *)tl + sizeof(*tl); 1294 } 1295 1296 /* Helper struct for dentry replay routines */ 1297 struct dentry_info_args { 1298 int parent_ino, dname_len, ino, inode_len; 1299 char *dname; 1300 }; 1301 1302 static inline void tl_to_darg(struct dentry_info_args *darg, 1303 struct ext4_fc_tl *tl) 1304 { 1305 struct ext4_fc_dentry_info *fcd; 1306 1307 fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl); 1308 1309 darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino); 1310 darg->ino = le32_to_cpu(fcd->fc_ino); 1311 darg->dname = fcd->fc_dname; 1312 darg->dname_len = ext4_fc_tag_len(tl) - 1313 sizeof(struct ext4_fc_dentry_info); 1314 } 1315 1316 /* Unlink replay function */ 1317 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl) 1318 { 1319 struct inode *inode, *old_parent; 1320 struct qstr entry; 1321 struct dentry_info_args darg; 1322 int ret = 0; 1323 1324 tl_to_darg(&darg, tl); 1325 1326 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, 1327 darg.parent_ino, darg.dname_len); 1328 1329 entry.name = darg.dname; 1330 entry.len = darg.dname_len; 1331 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1332 1333 if (IS_ERR_OR_NULL(inode)) { 1334 jbd_debug(1, "Inode %d not found", darg.ino); 1335 return 0; 1336 } 1337 1338 old_parent = ext4_iget(sb, darg.parent_ino, 1339 EXT4_IGET_NORMAL); 1340 if (IS_ERR_OR_NULL(old_parent)) { 1341 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); 1342 iput(inode); 1343 return 0; 1344 } 1345 1346 ret = __ext4_unlink(NULL, old_parent, &entry, inode); 1347 /* -ENOENT ok coz it might not exist anymore. 
*/ 1348 if (ret == -ENOENT) 1349 ret = 0; 1350 iput(old_parent); 1351 iput(inode); 1352 return ret; 1353 } 1354 1355 static int ext4_fc_replay_link_internal(struct super_block *sb, 1356 struct dentry_info_args *darg, 1357 struct inode *inode) 1358 { 1359 struct inode *dir = NULL; 1360 struct dentry *dentry_dir = NULL, *dentry_inode = NULL; 1361 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1362 int ret = 0; 1363 1364 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); 1365 if (IS_ERR(dir)) { 1366 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino); 1367 dir = NULL; 1368 goto out; 1369 } 1370 1371 dentry_dir = d_obtain_alias(dir); 1372 if (IS_ERR(dentry_dir)) { 1373 jbd_debug(1, "Failed to obtain dentry"); 1374 dentry_dir = NULL; 1375 goto out; 1376 } 1377 1378 dentry_inode = d_alloc(dentry_dir, &qstr_dname); 1379 if (!dentry_inode) { 1380 jbd_debug(1, "Inode dentry not created."); 1381 ret = -ENOMEM; 1382 goto out; 1383 } 1384 1385 ret = __ext4_link(dir, inode, dentry_inode); 1386 /* 1387 * It's possible that link already existed since data blocks 1388 * for the dir in question got persisted before we crashed OR 1389 * we replayed this tag and crashed before the entire replay 1390 * could complete. 
1391 */ 1392 if (ret && ret != -EEXIST) { 1393 jbd_debug(1, "Failed to link\n"); 1394 goto out; 1395 } 1396 1397 ret = 0; 1398 out: 1399 if (dentry_dir) { 1400 d_drop(dentry_dir); 1401 dput(dentry_dir); 1402 } else if (dir) { 1403 iput(dir); 1404 } 1405 if (dentry_inode) { 1406 d_drop(dentry_inode); 1407 dput(dentry_inode); 1408 } 1409 1410 return ret; 1411 } 1412 1413 /* Link replay function */ 1414 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl) 1415 { 1416 struct inode *inode; 1417 struct dentry_info_args darg; 1418 int ret = 0; 1419 1420 tl_to_darg(&darg, tl); 1421 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, 1422 darg.parent_ino, darg.dname_len); 1423 1424 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1425 if (IS_ERR_OR_NULL(inode)) { 1426 jbd_debug(1, "Inode not found."); 1427 return 0; 1428 } 1429 1430 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1431 iput(inode); 1432 return ret; 1433 } 1434 1435 /* 1436 * Record all the modified inodes during replay. We use this later to setup 1437 * block bitmaps correctly. 
1438 */ 1439 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) 1440 { 1441 struct ext4_fc_replay_state *state; 1442 int i; 1443 1444 state = &EXT4_SB(sb)->s_fc_replay_state; 1445 for (i = 0; i < state->fc_modified_inodes_used; i++) 1446 if (state->fc_modified_inodes[i] == ino) 1447 return 0; 1448 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { 1449 state->fc_modified_inodes_size += 1450 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1451 state->fc_modified_inodes = krealloc( 1452 state->fc_modified_inodes, sizeof(int) * 1453 state->fc_modified_inodes_size, 1454 GFP_KERNEL); 1455 if (!state->fc_modified_inodes) 1456 return -ENOMEM; 1457 } 1458 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; 1459 return 0; 1460 } 1461 1462 /* 1463 * Inode replay function 1464 */ 1465 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) 1466 { 1467 struct ext4_fc_inode *fc_inode; 1468 struct ext4_inode *raw_inode; 1469 struct ext4_inode *raw_fc_inode; 1470 struct inode *inode = NULL; 1471 struct ext4_iloc iloc; 1472 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag); 1473 struct ext4_extent_header *eh; 1474 1475 fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl); 1476 1477 ino = le32_to_cpu(fc_inode->fc_ino); 1478 trace_ext4_fc_replay(sb, tag, ino, 0, 0); 1479 1480 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1481 if (!IS_ERR_OR_NULL(inode)) { 1482 ext4_ext_clear_bb(inode); 1483 iput(inode); 1484 } 1485 1486 ext4_fc_record_modified_inode(sb, ino); 1487 1488 raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode; 1489 ret = ext4_get_fc_inode_loc(sb, ino, &iloc); 1490 if (ret) 1491 goto out; 1492 1493 inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode); 1494 raw_inode = ext4_raw_inode(&iloc); 1495 1496 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); 1497 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation, 1498 inode_len - offsetof(struct ext4_inode, 
i_generation)); 1499 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { 1500 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); 1501 if (eh->eh_magic != EXT4_EXT_MAGIC) { 1502 memset(eh, 0, sizeof(*eh)); 1503 eh->eh_magic = EXT4_EXT_MAGIC; 1504 eh->eh_max = cpu_to_le16( 1505 (sizeof(raw_inode->i_block) - 1506 sizeof(struct ext4_extent_header)) 1507 / sizeof(struct ext4_extent)); 1508 } 1509 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { 1510 memcpy(raw_inode->i_block, raw_fc_inode->i_block, 1511 sizeof(raw_inode->i_block)); 1512 } 1513 1514 /* Immediately update the inode on disk. */ 1515 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1516 if (ret) 1517 goto out; 1518 ret = sync_dirty_buffer(iloc.bh); 1519 if (ret) 1520 goto out; 1521 ret = ext4_mark_inode_used(sb, ino); 1522 if (ret) 1523 goto out; 1524 1525 /* Given that we just wrote the inode on disk, this SHOULD succeed. */ 1526 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); 1527 if (IS_ERR_OR_NULL(inode)) { 1528 jbd_debug(1, "Inode not found."); 1529 return -EFSCORRUPTED; 1530 } 1531 1532 /* 1533 * Our allocator could have made different decisions than before 1534 * crashing. This should be fixed but until then, we calculate 1535 * the number of blocks the inode. 1536 */ 1537 ext4_ext_replay_set_iblocks(inode); 1538 1539 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); 1540 ext4_reset_inode_seed(inode); 1541 1542 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); 1543 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); 1544 sync_dirty_buffer(iloc.bh); 1545 brelse(iloc.bh); 1546 out: 1547 iput(inode); 1548 if (!ret) 1549 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); 1550 1551 return 0; 1552 } 1553 1554 /* 1555 * Dentry create replay function. 1556 * 1557 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. 
Which means, the 1558 * inode for which we are trying to create a dentry here, should already have 1559 * been replayed before we start here. 1560 */ 1561 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl) 1562 { 1563 int ret = 0; 1564 struct inode *inode = NULL; 1565 struct inode *dir = NULL; 1566 struct dentry_info_args darg; 1567 1568 tl_to_darg(&darg, tl); 1569 1570 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, 1571 darg.parent_ino, darg.dname_len); 1572 1573 /* This takes care of update group descriptor and other metadata */ 1574 ret = ext4_mark_inode_used(sb, darg.ino); 1575 if (ret) 1576 goto out; 1577 1578 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); 1579 if (IS_ERR_OR_NULL(inode)) { 1580 jbd_debug(1, "inode %d not found.", darg.ino); 1581 inode = NULL; 1582 ret = -EINVAL; 1583 goto out; 1584 } 1585 1586 if (S_ISDIR(inode->i_mode)) { 1587 /* 1588 * If we are creating a directory, we need to make sure that the 1589 * dot and dot dot dirents are setup properly. 1590 */ 1591 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); 1592 if (IS_ERR_OR_NULL(dir)) { 1593 jbd_debug(1, "Dir %d not found.", darg.ino); 1594 goto out; 1595 } 1596 ret = ext4_init_new_dir(NULL, dir, inode); 1597 iput(dir); 1598 if (ret) { 1599 ret = 0; 1600 goto out; 1601 } 1602 } 1603 ret = ext4_fc_replay_link_internal(sb, &darg, inode); 1604 if (ret) 1605 goto out; 1606 set_nlink(inode, 1); 1607 ext4_mark_inode_dirty(NULL, inode); 1608 out: 1609 if (inode) 1610 iput(inode); 1611 return ret; 1612 } 1613 1614 /* 1615 * Record physical disk regions which are in use as per fast commit area. Our 1616 * simple replay phase allocator excludes these regions from allocation. 
1617 */ 1618 static int ext4_fc_record_regions(struct super_block *sb, int ino, 1619 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len) 1620 { 1621 struct ext4_fc_replay_state *state; 1622 struct ext4_fc_alloc_region *region; 1623 1624 state = &EXT4_SB(sb)->s_fc_replay_state; 1625 if (state->fc_regions_used == state->fc_regions_size) { 1626 state->fc_regions_size += 1627 EXT4_FC_REPLAY_REALLOC_INCREMENT; 1628 state->fc_regions = krealloc( 1629 state->fc_regions, 1630 state->fc_regions_size * 1631 sizeof(struct ext4_fc_alloc_region), 1632 GFP_KERNEL); 1633 if (!state->fc_regions) 1634 return -ENOMEM; 1635 } 1636 region = &state->fc_regions[state->fc_regions_used++]; 1637 region->ino = ino; 1638 region->lblk = lblk; 1639 region->pblk = pblk; 1640 region->len = len; 1641 1642 return 0; 1643 } 1644 1645 /* Replay add range tag */ 1646 static int ext4_fc_replay_add_range(struct super_block *sb, 1647 struct ext4_fc_tl *tl) 1648 { 1649 struct ext4_fc_add_range *fc_add_ex; 1650 struct ext4_extent newex, *ex; 1651 struct inode *inode; 1652 ext4_lblk_t start, cur; 1653 int remaining, len; 1654 ext4_fsblk_t start_pblk; 1655 struct ext4_map_blocks map; 1656 struct ext4_ext_path *path = NULL; 1657 int ret; 1658 1659 fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl); 1660 ex = (struct ext4_extent *)&fc_add_ex->fc_ex; 1661 1662 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, 1663 le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block), 1664 ext4_ext_get_actual_len(ex)); 1665 1666 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino), 1667 EXT4_IGET_NORMAL); 1668 if (IS_ERR_OR_NULL(inode)) { 1669 jbd_debug(1, "Inode not found."); 1670 return 0; 1671 } 1672 1673 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1674 1675 start = le32_to_cpu(ex->ee_block); 1676 start_pblk = ext4_ext_pblock(ex); 1677 len = ext4_ext_get_actual_len(ex); 1678 1679 cur = start; 1680 remaining = len; 1681 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", 1682 
start, start_pblk, len, ext4_ext_is_unwritten(ex), 1683 inode->i_ino); 1684 1685 while (remaining > 0) { 1686 map.m_lblk = cur; 1687 map.m_len = remaining; 1688 map.m_pblk = 0; 1689 ret = ext4_map_blocks(NULL, inode, &map, 0); 1690 1691 if (ret < 0) { 1692 iput(inode); 1693 return 0; 1694 } 1695 1696 if (ret == 0) { 1697 /* Range is not mapped */ 1698 path = ext4_find_extent(inode, cur, NULL, 0); 1699 if (IS_ERR(path)) { 1700 iput(inode); 1701 return 0; 1702 } 1703 memset(&newex, 0, sizeof(newex)); 1704 newex.ee_block = cpu_to_le32(cur); 1705 ext4_ext_store_pblock( 1706 &newex, start_pblk + cur - start); 1707 newex.ee_len = cpu_to_le16(map.m_len); 1708 if (ext4_ext_is_unwritten(ex)) 1709 ext4_ext_mark_unwritten(&newex); 1710 down_write(&EXT4_I(inode)->i_data_sem); 1711 ret = ext4_ext_insert_extent( 1712 NULL, inode, &path, &newex, 0); 1713 up_write((&EXT4_I(inode)->i_data_sem)); 1714 ext4_ext_drop_refs(path); 1715 kfree(path); 1716 if (ret) { 1717 iput(inode); 1718 return 0; 1719 } 1720 goto next; 1721 } 1722 1723 if (start_pblk + cur - start != map.m_pblk) { 1724 /* 1725 * Logical to physical mapping changed. This can happen 1726 * if this range was removed and then reallocated to 1727 * map to new physical blocks during a fast commit. 1728 */ 1729 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1730 ext4_ext_is_unwritten(ex), 1731 start_pblk + cur - start); 1732 if (ret) { 1733 iput(inode); 1734 return 0; 1735 } 1736 /* 1737 * Mark the old blocks as free since they aren't used 1738 * anymore. We maintain an array of all the modified 1739 * inodes. In case these blocks are still used at either 1740 * a different logical range in the same inode or in 1741 * some different inode, we will mark them as allocated 1742 * at the end of the FC replay using our array of 1743 * modified inodes. 
1744 */ 1745 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1746 goto next; 1747 } 1748 1749 /* Range is mapped and needs a state change */ 1750 jbd_debug(1, "Converting from %d to %d %lld", 1751 map.m_flags & EXT4_MAP_UNWRITTEN, 1752 ext4_ext_is_unwritten(ex), map.m_pblk); 1753 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, 1754 ext4_ext_is_unwritten(ex), map.m_pblk); 1755 if (ret) { 1756 iput(inode); 1757 return 0; 1758 } 1759 /* 1760 * We may have split the extent tree while toggling the state. 1761 * Try to shrink the extent tree now. 1762 */ 1763 ext4_ext_replay_shrink_inode(inode, start + len); 1764 next: 1765 cur += map.m_len; 1766 remaining -= map.m_len; 1767 } 1768 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> 1769 sb->s_blocksize_bits); 1770 iput(inode); 1771 return 0; 1772 } 1773 1774 /* Replay DEL_RANGE tag */ 1775 static int 1776 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl) 1777 { 1778 struct inode *inode; 1779 struct ext4_fc_del_range *lrange; 1780 struct ext4_map_blocks map; 1781 ext4_lblk_t cur, remaining; 1782 int ret; 1783 1784 lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl); 1785 cur = le32_to_cpu(lrange->fc_lblk); 1786 remaining = le32_to_cpu(lrange->fc_len); 1787 1788 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, 1789 le32_to_cpu(lrange->fc_ino), cur, remaining); 1790 1791 inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL); 1792 if (IS_ERR_OR_NULL(inode)) { 1793 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino)); 1794 return 0; 1795 } 1796 1797 ret = ext4_fc_record_modified_inode(sb, inode->i_ino); 1798 1799 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", 1800 inode->i_ino, le32_to_cpu(lrange->fc_lblk), 1801 le32_to_cpu(lrange->fc_len)); 1802 while (remaining > 0) { 1803 map.m_lblk = cur; 1804 map.m_len = remaining; 1805 1806 ret = ext4_map_blocks(NULL, inode, &map, 0); 1807 if (ret < 0) { 1808 iput(inode); 1809 return 0; 1810 } 1811 if 
(ret > 0) { 1812 remaining -= ret; 1813 cur += ret; 1814 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1815 } else { 1816 remaining -= map.m_len; 1817 cur += map.m_len; 1818 } 1819 } 1820 1821 ret = ext4_punch_hole(inode, 1822 le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits, 1823 le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits); 1824 if (ret) 1825 jbd_debug(1, "ext4_punch_hole returned %d", ret); 1826 ext4_ext_replay_shrink_inode(inode, 1827 i_size_read(inode) >> sb->s_blocksize_bits); 1828 ext4_mark_inode_dirty(NULL, inode); 1829 iput(inode); 1830 1831 return 0; 1832 } 1833 1834 static inline const char *tag2str(u16 tag) 1835 { 1836 switch (tag) { 1837 case EXT4_FC_TAG_LINK: 1838 return "TAG_ADD_ENTRY"; 1839 case EXT4_FC_TAG_UNLINK: 1840 return "TAG_DEL_ENTRY"; 1841 case EXT4_FC_TAG_ADD_RANGE: 1842 return "TAG_ADD_RANGE"; 1843 case EXT4_FC_TAG_CREAT: 1844 return "TAG_CREAT_DENTRY"; 1845 case EXT4_FC_TAG_DEL_RANGE: 1846 return "TAG_DEL_RANGE"; 1847 case EXT4_FC_TAG_INODE: 1848 return "TAG_INODE"; 1849 case EXT4_FC_TAG_PAD: 1850 return "TAG_PAD"; 1851 case EXT4_FC_TAG_TAIL: 1852 return "TAG_TAIL"; 1853 case EXT4_FC_TAG_HEAD: 1854 return "TAG_HEAD"; 1855 default: 1856 return "TAG_ERROR"; 1857 } 1858 } 1859 1860 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) 1861 { 1862 struct ext4_fc_replay_state *state; 1863 struct inode *inode; 1864 struct ext4_ext_path *path = NULL; 1865 struct ext4_map_blocks map; 1866 int i, ret, j; 1867 ext4_lblk_t cur, end; 1868 1869 state = &EXT4_SB(sb)->s_fc_replay_state; 1870 for (i = 0; i < state->fc_modified_inodes_used; i++) { 1871 inode = ext4_iget(sb, state->fc_modified_inodes[i], 1872 EXT4_IGET_NORMAL); 1873 if (IS_ERR_OR_NULL(inode)) { 1874 jbd_debug(1, "Inode %d not found.", 1875 state->fc_modified_inodes[i]); 1876 continue; 1877 } 1878 cur = 0; 1879 end = EXT_MAX_BLOCKS; 1880 while (cur < end) { 1881 map.m_lblk = cur; 1882 map.m_len = end - cur; 1883 1884 ret = ext4_map_blocks(NULL, inode, 
&map, 0); 1885 if (ret < 0) 1886 break; 1887 1888 if (ret > 0) { 1889 path = ext4_find_extent(inode, map.m_lblk, NULL, 0); 1890 if (!IS_ERR_OR_NULL(path)) { 1891 for (j = 0; j < path->p_depth; j++) 1892 ext4_mb_mark_bb(inode->i_sb, 1893 path[j].p_block, 1, 1); 1894 ext4_ext_drop_refs(path); 1895 kfree(path); 1896 } 1897 cur += ret; 1898 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, 1899 map.m_len, 1); 1900 } else { 1901 cur = cur + (map.m_len ? map.m_len : 1); 1902 } 1903 } 1904 iput(inode); 1905 } 1906 } 1907 1908 /* 1909 * Check if block is in excluded regions for block allocation. The simple 1910 * allocator that runs during replay phase is calls this function to see 1911 * if it is okay to use a block. 1912 */ 1913 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) 1914 { 1915 int i; 1916 struct ext4_fc_replay_state *state; 1917 1918 state = &EXT4_SB(sb)->s_fc_replay_state; 1919 for (i = 0; i < state->fc_regions_valid; i++) { 1920 if (state->fc_regions[i].ino == 0 || 1921 state->fc_regions[i].len == 0) 1922 continue; 1923 if (blk >= state->fc_regions[i].pblk && 1924 blk < state->fc_regions[i].pblk + state->fc_regions[i].len) 1925 return true; 1926 } 1927 return false; 1928 } 1929 1930 /* Cleanup function called after replay */ 1931 void ext4_fc_replay_cleanup(struct super_block *sb) 1932 { 1933 struct ext4_sb_info *sbi = EXT4_SB(sb); 1934 1935 sbi->s_mount_state &= ~EXT4_FC_REPLAY; 1936 kfree(sbi->s_fc_replay_state.fc_regions); 1937 kfree(sbi->s_fc_replay_state.fc_modified_inodes); 1938 } 1939 1940 /* 1941 * Recovery Scan phase handler 1942 * 1943 * This function is called during the scan phase and is responsible 1944 * for doing following things: 1945 * - Make sure the fast commit area has valid tags for replay 1946 * - Count number of tags that need to be replayed by the replay handler 1947 * - Verify CRC 1948 * - Create a list of excluded blocks for allocation during replay phase 1949 * 1950 * This function returns 
JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is 1951 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP 1952 * to indicate that scan has finished and JBD2 can now start replay phase. 1953 * It returns a negative error to indicate that there was an error. At the end 1954 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set 1955 * to indicate the number of tags that need to replayed during the replay phase. 1956 */ 1957 static int ext4_fc_replay_scan(journal_t *journal, 1958 struct buffer_head *bh, int off, 1959 tid_t expected_tid) 1960 { 1961 struct super_block *sb = journal->j_private; 1962 struct ext4_sb_info *sbi = EXT4_SB(sb); 1963 struct ext4_fc_replay_state *state; 1964 int ret = JBD2_FC_REPLAY_CONTINUE; 1965 struct ext4_fc_add_range *ext; 1966 struct ext4_fc_tl *tl; 1967 struct ext4_fc_tail *tail; 1968 __u8 *start, *end; 1969 struct ext4_fc_head *head; 1970 struct ext4_extent *ex; 1971 1972 state = &sbi->s_fc_replay_state; 1973 1974 start = (u8 *)bh->b_data; 1975 end = (__u8 *)bh->b_data + journal->j_blocksize - 1; 1976 1977 if (state->fc_replay_expected_off == 0) { 1978 state->fc_cur_tag = 0; 1979 state->fc_replay_num_tags = 0; 1980 state->fc_crc = 0; 1981 state->fc_regions = NULL; 1982 state->fc_regions_valid = state->fc_regions_used = 1983 state->fc_regions_size = 0; 1984 /* Check if we can stop early */ 1985 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) 1986 != EXT4_FC_TAG_HEAD) 1987 return 0; 1988 } 1989 1990 if (off != state->fc_replay_expected_off) { 1991 ret = -EFSCORRUPTED; 1992 goto out_err; 1993 } 1994 1995 state->fc_replay_expected_off++; 1996 fc_for_each_tl(start, end, tl) { 1997 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n", 1998 tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr); 1999 switch (le16_to_cpu(tl->fc_tag)) { 2000 case EXT4_FC_TAG_ADD_RANGE: 2001 ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl); 2002 ex = (struct ext4_extent *)&ext->fc_ex; 2003 ret = 
ext4_fc_record_regions(sb, 2004 le32_to_cpu(ext->fc_ino), 2005 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), 2006 ext4_ext_get_actual_len(ex)); 2007 if (ret < 0) 2008 break; 2009 ret = JBD2_FC_REPLAY_CONTINUE; 2010 fallthrough; 2011 case EXT4_FC_TAG_DEL_RANGE: 2012 case EXT4_FC_TAG_LINK: 2013 case EXT4_FC_TAG_UNLINK: 2014 case EXT4_FC_TAG_CREAT: 2015 case EXT4_FC_TAG_INODE: 2016 case EXT4_FC_TAG_PAD: 2017 state->fc_cur_tag++; 2018 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl, 2019 sizeof(*tl) + ext4_fc_tag_len(tl)); 2020 break; 2021 case EXT4_FC_TAG_TAIL: 2022 state->fc_cur_tag++; 2023 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl); 2024 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl, 2025 sizeof(*tl) + 2026 offsetof(struct ext4_fc_tail, 2027 fc_crc)); 2028 if (le32_to_cpu(tail->fc_tid) == expected_tid && 2029 le32_to_cpu(tail->fc_crc) == state->fc_crc) { 2030 state->fc_replay_num_tags = state->fc_cur_tag; 2031 state->fc_regions_valid = 2032 state->fc_regions_used; 2033 } else { 2034 ret = state->fc_replay_num_tags ? 2035 JBD2_FC_REPLAY_STOP : -EFSBADCRC; 2036 } 2037 state->fc_crc = 0; 2038 break; 2039 case EXT4_FC_TAG_HEAD: 2040 head = (struct ext4_fc_head *)ext4_fc_tag_val(tl); 2041 if (le32_to_cpu(head->fc_features) & 2042 ~EXT4_FC_SUPPORTED_FEATURES) { 2043 ret = -EOPNOTSUPP; 2044 break; 2045 } 2046 if (le32_to_cpu(head->fc_tid) != expected_tid) { 2047 ret = JBD2_FC_REPLAY_STOP; 2048 break; 2049 } 2050 state->fc_cur_tag++; 2051 state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl, 2052 sizeof(*tl) + ext4_fc_tag_len(tl)); 2053 break; 2054 default: 2055 ret = state->fc_replay_num_tags ? 2056 JBD2_FC_REPLAY_STOP : -ECANCELED; 2057 } 2058 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) 2059 break; 2060 } 2061 2062 out_err: 2063 trace_ext4_fc_replay_scan(sb, ret, off); 2064 return ret; 2065 } 2066 2067 /* 2068 * Main recovery path entry point. 2069 * The meaning of return codes is similar as above. 
2070 */ 2071 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, 2072 enum passtype pass, int off, tid_t expected_tid) 2073 { 2074 struct super_block *sb = journal->j_private; 2075 struct ext4_sb_info *sbi = EXT4_SB(sb); 2076 struct ext4_fc_tl *tl; 2077 __u8 *start, *end; 2078 int ret = JBD2_FC_REPLAY_CONTINUE; 2079 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; 2080 struct ext4_fc_tail *tail; 2081 2082 if (pass == PASS_SCAN) { 2083 state->fc_current_pass = PASS_SCAN; 2084 return ext4_fc_replay_scan(journal, bh, off, expected_tid); 2085 } 2086 2087 if (state->fc_current_pass != pass) { 2088 state->fc_current_pass = pass; 2089 sbi->s_mount_state |= EXT4_FC_REPLAY; 2090 } 2091 if (!sbi->s_fc_replay_state.fc_replay_num_tags) { 2092 jbd_debug(1, "Replay stops\n"); 2093 ext4_fc_set_bitmaps_and_counters(sb); 2094 return 0; 2095 } 2096 2097 #ifdef CONFIG_EXT4_DEBUG 2098 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { 2099 pr_warn("Dropping fc block %d because max_replay set\n", off); 2100 return JBD2_FC_REPLAY_STOP; 2101 } 2102 #endif 2103 2104 start = (u8 *)bh->b_data; 2105 end = (__u8 *)bh->b_data + journal->j_blocksize - 1; 2106 2107 fc_for_each_tl(start, end, tl) { 2108 if (state->fc_replay_num_tags == 0) { 2109 ret = JBD2_FC_REPLAY_STOP; 2110 ext4_fc_set_bitmaps_and_counters(sb); 2111 break; 2112 } 2113 jbd_debug(3, "Replay phase, tag:%s\n", 2114 tag2str(le16_to_cpu(tl->fc_tag))); 2115 state->fc_replay_num_tags--; 2116 switch (le16_to_cpu(tl->fc_tag)) { 2117 case EXT4_FC_TAG_LINK: 2118 ret = ext4_fc_replay_link(sb, tl); 2119 break; 2120 case EXT4_FC_TAG_UNLINK: 2121 ret = ext4_fc_replay_unlink(sb, tl); 2122 break; 2123 case EXT4_FC_TAG_ADD_RANGE: 2124 ret = ext4_fc_replay_add_range(sb, tl); 2125 break; 2126 case EXT4_FC_TAG_CREAT: 2127 ret = ext4_fc_replay_create(sb, tl); 2128 break; 2129 case EXT4_FC_TAG_DEL_RANGE: 2130 ret = ext4_fc_replay_del_range(sb, tl); 2131 break; 2132 case EXT4_FC_TAG_INODE: 2133 ret = 
ext4_fc_replay_inode(sb, tl); 2134 break; 2135 case EXT4_FC_TAG_PAD: 2136 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, 2137 ext4_fc_tag_len(tl), 0); 2138 break; 2139 case EXT4_FC_TAG_TAIL: 2140 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, 2141 ext4_fc_tag_len(tl), 0); 2142 tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl); 2143 WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid); 2144 break; 2145 case EXT4_FC_TAG_HEAD: 2146 break; 2147 default: 2148 trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0, 2149 ext4_fc_tag_len(tl), 0); 2150 ret = -ECANCELED; 2151 break; 2152 } 2153 if (ret < 0) 2154 break; 2155 ret = JBD2_FC_REPLAY_CONTINUE; 2156 } 2157 return ret; 2158 } 2159 2160 void ext4_fc_init(struct super_block *sb, journal_t *journal) 2161 { 2162 /* 2163 * We set replay callback even if fast commit disabled because we may 2164 * could still have fast commit blocks that need to be replayed even if 2165 * fast commit has now been turned off. 2166 */ 2167 journal->j_fc_replay_callback = ext4_fc_replay; 2168 if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) 2169 return; 2170 journal->j_fc_cleanup_callback = ext4_fc_cleanup; 2171 } 2172 2173 static const char *fc_ineligible_reasons[] = { 2174 "Extended attributes changed", 2175 "Cross rename", 2176 "Journal flag changed", 2177 "Insufficient memory", 2178 "Swap boot", 2179 "Resize", 2180 "Dir renamed", 2181 "Falloc range op", 2182 "Data journalling", 2183 "FC Commit Failed" 2184 }; 2185 2186 int ext4_fc_info_show(struct seq_file *seq, void *v) 2187 { 2188 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); 2189 struct ext4_fc_stats *stats = &sbi->s_fc_stats; 2190 int i; 2191 2192 if (v != SEQ_START_TOKEN) 2193 return 0; 2194 2195 seq_printf(seq, 2196 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", 2197 stats->fc_num_commits, stats->fc_ineligible_commits, 2198 stats->fc_numblks, 2199 div_u64(sbi->s_fc_avg_commit_time, 1000)); 2200 seq_puts(seq, "Ineligible reasons:\n"); 2201 
for (i = 0; i < EXT4_FC_REASON_MAX; i++) 2202 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], 2203 stats->fc_ineligible_reason_count[i]); 2204 2205 return 0; 2206 } 2207 2208 int __init ext4_fc_init_dentry_cache(void) 2209 { 2210 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, 2211 SLAB_RECLAIM_ACCOUNT); 2212 2213 if (ext4_fc_dentry_cachep == NULL) 2214 return -ENOMEM; 2215 2216 return 0; 2217 } 2218