1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * linux/fs/jbd2/commit.c 4 * 5 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 6 * 7 * Copyright 1998 Red Hat corp --- All Rights Reserved 8 * 9 * Journal commit routines for the generic filesystem journaling code; 10 * part of the ext2fs journaling system. 11 */ 12 13 #include <linux/time.h> 14 #include <linux/fs.h> 15 #include <linux/jbd2.h> 16 #include <linux/errno.h> 17 #include <linux/slab.h> 18 #include <linux/mm.h> 19 #include <linux/pagemap.h> 20 #include <linux/jiffies.h> 21 #include <linux/crc32.h> 22 #include <linux/writeback.h> 23 #include <linux/backing-dev.h> 24 #include <linux/bio.h> 25 #include <linux/blkdev.h> 26 #include <linux/bitops.h> 27 #include <trace/events/jbd2.h> 28 29 /* 30 * IO end handler for temporary buffer_heads handling writes to the journal. 31 */ 32 static void journal_end_buffer_io_sync(struct bio *bio) 33 { 34 struct buffer_head *bh; 35 bool uptodate = bio_endio_bh(bio, &bh); 36 struct buffer_head *orig_bh = bh->b_private; 37 38 BUFFER_TRACE(bh, ""); 39 if (uptodate) 40 set_buffer_uptodate(bh); 41 else 42 clear_buffer_uptodate(bh); 43 if (orig_bh) { 44 clear_and_wake_up_bit(BH_Shadow, &orig_bh->b_state); 45 } 46 unlock_buffer(bh); 47 } 48 49 /* 50 * When an ext4 file is truncated, it is possible that some pages are not 51 * successfully freed, because they are attached to a committing transaction. 52 * After the transaction commits, these pages are left on the LRU, with no 53 * ->mapping, and with attached buffers. These pages are trivially reclaimable 54 * by the VM, but their apparent absence upsets the VM accounting, and it makes 55 * the numbers in /proc/meminfo look odd. 56 * 57 * So here, we have a buffer which has just come off the forget list. Look to 58 * see if we can strip all buffers from the backing page. 59 * 60 * Called under j_list_lock. The caller provided us with a ref against the 61 * buffer, and we drop that here. 62 */ 63 static void release_buffer_page(struct buffer_head *bh) 64 { 65 struct folio *folio; 66 67 if (buffer_dirty(bh)) 68 goto nope; 69 if (atomic_read(&bh->b_count) != 1) 70 goto nope; 71 folio = bh->b_folio; 72 if (folio->mapping) 73 goto nope; 74 75 /* OK, it's a truncated page */ 76 if (!folio_trylock(folio)) 77 goto nope; 78 79 folio_get(folio); 80 __brelse(bh); 81 try_to_free_buffers(folio); 82 folio_unlock(folio); 83 folio_put(folio); 84 return; 85 86 nope: 87 __brelse(bh); 88 } 89 90 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) 91 { 92 struct commit_header *h; 93 __u32 csum; 94 95 if (!jbd2_journal_has_csum_v2or3(j)) 96 return; 97 98 h = (struct commit_header *)(bh->b_data); 99 h->h_chksum_type = 0; 100 h->h_chksum_size = 0; 101 h->h_chksum[0] = 0; 102 csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); 103 h->h_chksum[0] = cpu_to_be32(csum); 104 } 105 106 /* 107 * Done it all: now submit the commit record. We should have 108 * cleaned up our previous buffers by now, so if we are in abort 109 * mode we can now just skip the rest of the journal write 110 * entirely. 111 * 112 * Returns 1 if the journal needs to be aborted or 0 on success 113 */ 114 static int journal_submit_commit_record(journal_t *journal, 115 transaction_t *commit_transaction, 116 struct buffer_head **cbh, 117 __u32 crc32_sum) 118 { 119 struct commit_header *tmp; 120 struct buffer_head *bh; 121 struct timespec64 now; 122 blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS; 123 124 *cbh = NULL; 125 126 if (is_journal_aborted(journal)) 127 return 0; 128 129 bh = jbd2_journal_get_descriptor_buffer(commit_transaction, 130 JBD2_COMMIT_BLOCK); 131 if (!bh) 132 return 1; 133 134 tmp = (struct commit_header *)bh->b_data; 135 ktime_get_coarse_real_ts64(&now); 136 tmp->h_commit_sec = cpu_to_be64(now.tv_sec); 137 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); 138 139 if (jbd2_has_feature_checksum(journal)) { 140 tmp->h_chksum_type = JBD2_CRC32_CHKSUM; 141 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 142 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 143 } 144 jbd2_commit_block_csum_set(journal, bh); 145 146 BUFFER_TRACE(bh, "submit commit block"); 147 lock_buffer(bh); 148 clear_buffer_dirty(bh); 149 set_buffer_uptodate(bh); 150 151 if (journal->j_flags & JBD2_BARRIER && 152 !jbd2_has_feature_async_commit(journal)) 153 write_flags |= REQ_PREFLUSH | REQ_FUA; 154 155 bh_submit(bh, write_flags, journal_end_buffer_io_sync); 156 *cbh = bh; 157 return 0; 158 } 159 160 /* 161 * This function along with journal_submit_commit_record 162 * allows to write the commit record asynchronously. 163 */ 164 static int journal_wait_on_commit_record(journal_t *journal, 165 struct buffer_head *bh) 166 { 167 int ret = 0; 168 169 clear_buffer_dirty(bh); 170 wait_on_buffer(bh); 171 172 if (unlikely(!buffer_uptodate(bh))) 173 ret = -EIO; 174 put_bh(bh); /* One for getblk() */ 175 176 return ret; 177 } 178 179 /* Send all the data buffers related to an inode */ 180 int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) 181 { 182 unsigned long flags; 183 184 if (!jinode) 185 return 0; 186 187 flags = READ_ONCE(jinode->i_flags); 188 if (!(flags & JI_WRITE_DATA)) 189 return 0; 190 191 trace_jbd2_submit_inode_data(jinode->i_vfs_inode); 192 return journal->j_submit_inode_data_buffers(jinode); 193 194 } 195 EXPORT_SYMBOL(jbd2_submit_inode_data); 196 197 int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) 198 { 199 struct address_space *mapping; 200 struct inode *inode; 201 unsigned long flags; 202 loff_t start_byte, end_byte; 203 204 if (!jinode) 205 return 0; 206 207 flags = READ_ONCE(jinode->i_flags); 208 if (!(flags & JI_WAIT_DATA)) 209 return 0; 210 211 inode = jinode->i_vfs_inode; 212 if (!inode) 213 return 0; 214 215 mapping = inode->i_mapping; 216 if (!mapping) 217 return 0; 218 219 if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) 220 return 0; 221 return filemap_fdatawait_range_keep_errors( 222 mapping, start_byte, end_byte); 223 } 224 EXPORT_SYMBOL(jbd2_wait_inode_data); 225 226 /* 227 * Submit all the data buffers of inode associated with the transaction to 228 * disk. 229 * 230 * We are in a committing transaction. Therefore no new inode can be added to 231 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently 232 * operate on from being released while we write out pages. 233 */ 234 static int journal_submit_data_buffers(journal_t *journal, 235 transaction_t *commit_transaction) 236 { 237 struct jbd2_inode *jinode; 238 int err, ret = 0; 239 240 spin_lock(&journal->j_list_lock); 241 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 242 if (!(jinode->i_flags & JI_WRITE_DATA)) 243 continue; 244 WRITE_ONCE(jinode->i_flags, 245 jinode->i_flags | JI_COMMIT_RUNNING); 246 spin_unlock(&journal->j_list_lock); 247 /* submit the inode data buffers. */ 248 trace_jbd2_submit_inode_data(jinode->i_vfs_inode); 249 if (journal->j_submit_inode_data_buffers) { 250 err = journal->j_submit_inode_data_buffers(jinode); 251 if (!ret) 252 ret = err; 253 } 254 spin_lock(&journal->j_list_lock); 255 J_ASSERT(jinode->i_transaction == commit_transaction); 256 WRITE_ONCE(jinode->i_flags, 257 jinode->i_flags & ~JI_COMMIT_RUNNING); 258 smp_mb(); 259 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 260 } 261 spin_unlock(&journal->j_list_lock); 262 return ret; 263 } 264 265 int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) 266 { 267 struct address_space *mapping = jinode->i_vfs_inode->i_mapping; 268 loff_t start_byte, end_byte; 269 270 if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) 271 return 0; 272 273 return filemap_fdatawait_range_keep_errors(mapping, 274 start_byte, end_byte); 275 } 276 277 /* 278 * Wait for data submitted for writeout, refile inodes to proper 279 * transaction if needed. 280 * 281 */ 282 static int journal_finish_inode_data_buffers(journal_t *journal, 283 transaction_t *commit_transaction) 284 { 285 struct jbd2_inode *jinode, *next_i; 286 int err, ret = 0; 287 288 /* For locking, see the comment in journal_submit_data_buffers() */ 289 spin_lock(&journal->j_list_lock); 290 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 291 if (!(jinode->i_flags & JI_WAIT_DATA)) 292 continue; 293 WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING); 294 spin_unlock(&journal->j_list_lock); 295 /* wait for the inode data buffers writeout. */ 296 if (journal->j_finish_inode_data_buffers) { 297 err = journal->j_finish_inode_data_buffers(jinode); 298 if (!ret) 299 ret = err; 300 } 301 cond_resched(); 302 spin_lock(&journal->j_list_lock); 303 WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING); 304 smp_mb(); 305 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 306 } 307 308 /* Now refile inode to proper lists */ 309 list_for_each_entry_safe(jinode, next_i, 310 &commit_transaction->t_inode_list, i_list) { 311 list_del(&jinode->i_list); 312 if (jinode->i_next_transaction) { 313 jinode->i_transaction = jinode->i_next_transaction; 314 jinode->i_next_transaction = NULL; 315 list_add(&jinode->i_list, 316 &jinode->i_transaction->t_inode_list); 317 } else { 318 jinode->i_transaction = NULL; 319 WRITE_ONCE(jinode->i_dirty_start_page, 0); 320 WRITE_ONCE(jinode->i_dirty_end_page, 0); 321 } 322 } 323 spin_unlock(&journal->j_list_lock); 324 325 return ret; 326 } 327 328 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) 329 { 330 char *addr; 331 __u32 checksum; 332 333 addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); 334 checksum = crc32_be(crc32_sum, addr, bh->b_size); 335 kunmap_local(addr); 336 337 return checksum; 338 } 339 340 static void write_tag_block(journal_t *j, journal_block_tag_t *tag, 341 unsigned long long block) 342 { 343 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 344 if (jbd2_has_feature_64bit(j)) 345 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 346 } 347 348 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 349 struct buffer_head *bh, __u32 sequence) 350 { 351 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; 352 __u8 *addr; 353 __u32 csum32; 354 __be32 seq; 355 356 if (!jbd2_journal_has_csum_v2or3(j)) 357 return; 358 359 seq = cpu_to_be32(sequence); 360 addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); 361 csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); 362 csum32 = jbd2_chksum(csum32, addr, bh->b_size); 363 kunmap_local(addr); 364 365 if (jbd2_has_feature_csum3(j)) 366 tag3->t_checksum = cpu_to_be32(csum32); 367 else 368 tag->t_checksum = cpu_to_be16(csum32); 369 } 370 /* 371 * jbd2_journal_commit_transaction 372 * 373 * The primary function for committing a transaction to the log. This 374 * function is called by the journal thread to begin a complete commit. 375 */ 376 void jbd2_journal_commit_transaction(journal_t *journal) 377 { 378 struct transaction_stats_s stats; 379 transaction_t *commit_transaction; 380 struct journal_head *jh; 381 struct buffer_head *descriptor; 382 struct buffer_head **wbuf = journal->j_wbuf; 383 int bufs; 384 int escape; 385 int err; 386 unsigned long long blocknr; 387 ktime_t start_time; 388 u64 commit_time; 389 char *tagp = NULL; 390 journal_block_tag_t *tag = NULL; 391 int space_left = 0; 392 int first_tag = 0; 393 int tag_flag; 394 int i; 395 int tag_bytes = journal_tag_bytes(journal); 396 struct buffer_head *cbh = NULL; /* For transactional checksums */ 397 __u32 crc32_sum = ~0; 398 struct blk_plug plug; 399 /* Tail of the journal */ 400 unsigned long first_block; 401 tid_t first_tid; 402 int update_tail; 403 int csum_size = 0; 404 LIST_HEAD(io_bufs); 405 LIST_HEAD(log_bufs); 406 407 if (jbd2_journal_has_csum_v2or3(journal)) 408 csum_size = sizeof(struct jbd2_journal_block_tail); 409 410 /* 411 * First job: lock down the current transaction and wait for 412 * all outstanding updates to complete. 413 */ 414 415 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 416 if (journal->j_flags & JBD2_FLUSHED) { 417 jbd2_debug(3, "super block updated\n"); 418 mutex_lock_io(&journal->j_checkpoint_mutex); 419 /* 420 * We hold j_checkpoint_mutex so tail cannot change under us. 421 * We don't need any special data guarantees for writing sb 422 * since journal is empty and it is ok for write to be 423 * flushed only with transaction commit. 424 */ 425 jbd2_journal_update_sb_log_tail(journal, 426 journal->j_tail_sequence, 427 journal->j_tail, 0); 428 mutex_unlock(&journal->j_checkpoint_mutex); 429 } else { 430 jbd2_debug(3, "superblock not updated\n"); 431 } 432 433 J_ASSERT(journal->j_running_transaction != NULL); 434 J_ASSERT(journal->j_committing_transaction == NULL); 435 436 write_lock(&journal->j_state_lock); 437 journal->j_flags |= JBD2_FULL_COMMIT_ONGOING; 438 while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) { 439 DEFINE_WAIT(wait); 440 441 prepare_to_wait(&journal->j_fc_wait, &wait, 442 TASK_UNINTERRUPTIBLE); 443 write_unlock(&journal->j_state_lock); 444 schedule(); 445 write_lock(&journal->j_state_lock); 446 finish_wait(&journal->j_fc_wait, &wait); 447 /* 448 * TODO: by blocking fast commits here, we are increasing 449 * fsync() latency slightly. Strictly speaking, we don't need 450 * to block fast commits until the transaction enters T_FLUSH 451 * state. So an optimization is possible where we block new fast 452 * commits here and wait for existing ones to complete 453 * just before we enter T_FLUSH. That way, the existing fast 454 * commits and this full commit can proceed parallely. 455 */ 456 } 457 write_unlock(&journal->j_state_lock); 458 459 commit_transaction = journal->j_running_transaction; 460 461 trace_jbd2_start_commit(journal, commit_transaction); 462 jbd2_debug(1, "JBD2: starting commit of transaction %d\n", 463 commit_transaction->t_tid); 464 465 write_lock(&journal->j_state_lock); 466 journal->j_fc_off = 0; 467 J_ASSERT(commit_transaction->t_state == T_RUNNING); 468 commit_transaction->t_state = T_LOCKED; 469 470 trace_jbd2_commit_locking(journal, commit_transaction); 471 stats.run.rs_wait = commit_transaction->t_max_wait; 472 stats.run.rs_request_delay = 0; 473 stats.run.rs_locked = jiffies; 474 if (commit_transaction->t_requested) 475 stats.run.rs_request_delay = 476 jbd2_time_diff(commit_transaction->t_requested, 477 stats.run.rs_locked); 478 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 479 stats.run.rs_locked); 480 481 // waits for any t_updates to finish 482 jbd2_journal_wait_updates(journal); 483 484 commit_transaction->t_state = T_SWITCH; 485 486 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= 487 journal->j_max_transaction_buffers); 488 489 /* 490 * First thing we are allowed to do is to discard any remaining 491 * BJ_Reserved buffers. Note, it is _not_ permissible to assume 492 * that there are no such buffers: if a large filesystem 493 * operation like a truncate needs to split itself over multiple 494 * transactions, then it may try to do a jbd2_journal_restart() while 495 * there are still BJ_Reserved buffers outstanding. These must 496 * be released cleanly from the current transaction. 497 * 498 * In this case, the filesystem must still reserve write access 499 * again before modifying the buffer in the new transaction, but 500 * we do not require it to remember exactly which old buffers it 501 * has reserved. This is consistent with the existing behaviour 502 * that multiple jbd2_journal_get_write_access() calls to the same 503 * buffer are perfectly permissible. 504 * We use journal->j_state_lock here to serialize processing of 505 * t_reserved_list with eviction of buffers from journal_unmap_buffer(). 506 */ 507 while (commit_transaction->t_reserved_list) { 508 jh = commit_transaction->t_reserved_list; 509 JBUFFER_TRACE(jh, "reserved, unused: refile"); 510 /* 511 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may 512 * leave undo-committed data. 513 */ 514 if (jh->b_committed_data) { 515 spin_lock(&jh->b_state_lock); 516 kfree(jh->b_committed_data); 517 jh->b_committed_data = NULL; 518 spin_unlock(&jh->b_state_lock); 519 } 520 jbd2_journal_refile_buffer(journal, jh); 521 } 522 523 write_unlock(&journal->j_state_lock); 524 /* 525 * Now try to drop any written-back buffers from the journal's 526 * checkpoint lists. We do this *before* commit because it potentially 527 * frees some memory 528 */ 529 spin_lock(&journal->j_list_lock); 530 __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP); 531 spin_unlock(&journal->j_list_lock); 532 533 jbd2_debug(3, "JBD2: commit phase 1\n"); 534 535 /* 536 * Clear revoked flag to reflect there is no revoked buffers 537 * in the next transaction which is going to be started. 538 */ 539 jbd2_clear_buffer_revoked_flags(journal); 540 541 /* 542 * Switch to a new revoke table. 543 */ 544 jbd2_journal_switch_revoke_table(journal); 545 546 write_lock(&journal->j_state_lock); 547 /* 548 * Reserved credits cannot be claimed anymore, free them 549 */ 550 atomic_sub(atomic_read(&journal->j_reserved_credits), 551 &commit_transaction->t_outstanding_credits); 552 553 trace_jbd2_commit_flushing(journal, commit_transaction); 554 stats.run.rs_flushing = jiffies; 555 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, 556 stats.run.rs_flushing); 557 558 commit_transaction->t_state = T_FLUSH; 559 journal->j_committing_transaction = commit_transaction; 560 journal->j_running_transaction = NULL; 561 start_time = ktime_get(); 562 commit_transaction->t_log_start = journal->j_head; 563 wake_up_all(&journal->j_wait_transaction_locked); 564 write_unlock(&journal->j_state_lock); 565 566 jbd2_debug(3, "JBD2: commit phase 2a\n"); 567 568 /* 569 * Now start flushing things to disk, in the order they appear 570 * on the transaction lists. Data blocks go first. 571 */ 572 err = journal_submit_data_buffers(journal, commit_transaction); 573 if (err) 574 jbd2_journal_abort(journal, err); 575 576 blk_start_plug(&plug); 577 jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); 578 579 jbd2_debug(3, "JBD2: commit phase 2b\n"); 580 581 /* 582 * Way to go: we have now written out all of the data for a 583 * transaction! Now comes the tricky part: we need to write out 584 * metadata. Loop over the transaction's entire buffer list: 585 */ 586 write_lock(&journal->j_state_lock); 587 commit_transaction->t_state = T_COMMIT; 588 write_unlock(&journal->j_state_lock); 589 590 trace_jbd2_commit_logging(journal, commit_transaction); 591 stats.run.rs_logging = jiffies; 592 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, 593 stats.run.rs_logging); 594 stats.run.rs_blocks = commit_transaction->t_nr_buffers; 595 stats.run.rs_blocks_logged = 0; 596 597 J_ASSERT(commit_transaction->t_nr_buffers <= 598 atomic_read(&commit_transaction->t_outstanding_credits)); 599 600 bufs = 0; 601 descriptor = NULL; 602 while (commit_transaction->t_buffers) { 603 604 /* Find the next buffer to be journaled... */ 605 606 jh = commit_transaction->t_buffers; 607 608 /* If we're in abort mode, we just un-journal the buffer and 609 release it. */ 610 611 if (is_journal_aborted(journal)) { 612 clear_buffer_jbddirty(jh2bh(jh)); 613 JBUFFER_TRACE(jh, "journal is aborting: refile"); 614 jbd2_buffer_abort_trigger(jh, 615 jh->b_frozen_data ? 616 jh->b_frozen_triggers : 617 jh->b_triggers); 618 jbd2_journal_refile_buffer(journal, jh); 619 /* If that was the last one, we need to clean up 620 * any descriptor buffers which may have been 621 * already allocated, even if we are now 622 * aborting. */ 623 if (!commit_transaction->t_buffers) 624 goto start_journal_io; 625 continue; 626 } 627 628 /* Make sure we have a descriptor block in which to 629 record the metadata buffer. */ 630 631 if (!descriptor) { 632 J_ASSERT (bufs == 0); 633 634 jbd2_debug(4, "JBD2: get descriptor\n"); 635 636 descriptor = jbd2_journal_get_descriptor_buffer( 637 commit_transaction, 638 JBD2_DESCRIPTOR_BLOCK); 639 if (!descriptor) { 640 jbd2_journal_abort(journal, -EIO); 641 continue; 642 } 643 644 jbd2_debug(4, "JBD2: got buffer %llu (%p)\n", 645 (unsigned long long)descriptor->b_blocknr, 646 descriptor->b_data); 647 tagp = &descriptor->b_data[sizeof(journal_header_t)]; 648 space_left = descriptor->b_size - 649 sizeof(journal_header_t); 650 first_tag = 1; 651 set_buffer_jwrite(descriptor); 652 set_buffer_dirty(descriptor); 653 wbuf[bufs++] = descriptor; 654 655 /* Record it so that we can wait for IO 656 completion later */ 657 BUFFER_TRACE(descriptor, "ph3: file as descriptor"); 658 jbd2_file_log_bh(&log_bufs, descriptor); 659 } 660 661 /* Where is the buffer to be written? */ 662 663 err = jbd2_journal_next_log_block(journal, &blocknr); 664 /* If the block mapping failed, just abandon the buffer 665 and repeat this loop: we'll fall into the 666 refile-on-abort condition above. */ 667 if (err) { 668 jbd2_journal_abort(journal, err); 669 continue; 670 } 671 672 /* 673 * start_this_handle() uses t_outstanding_credits to determine 674 * the free space in the log. 675 */ 676 atomic_dec(&commit_transaction->t_outstanding_credits); 677 678 /* Bump b_count to prevent truncate from stumbling over 679 the shadowed buffer! @@@ This can go if we ever get 680 rid of the shadow pairing of buffers. */ 681 atomic_inc(&jh2bh(jh)->b_count); 682 683 /* 684 * Make a temporary IO buffer with which to write it out 685 * (this will requeue the metadata buffer to BJ_Shadow). 686 */ 687 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 688 JBUFFER_TRACE(jh, "ph3: write metadata"); 689 escape = jbd2_journal_write_metadata_buffer(commit_transaction, 690 jh, &wbuf[bufs], blocknr); 691 jbd2_file_log_bh(&io_bufs, wbuf[bufs]); 692 693 /* Record the new block's tag in the current descriptor 694 buffer */ 695 696 tag_flag = 0; 697 if (escape) 698 tag_flag |= JBD2_FLAG_ESCAPE; 699 if (!first_tag) 700 tag_flag |= JBD2_FLAG_SAME_UUID; 701 702 tag = (journal_block_tag_t *) tagp; 703 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); 704 tag->t_flags = cpu_to_be16(tag_flag); 705 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], 706 commit_transaction->t_tid); 707 tagp += tag_bytes; 708 space_left -= tag_bytes; 709 bufs++; 710 711 if (first_tag) { 712 memcpy (tagp, journal->j_uuid, 16); 713 tagp += 16; 714 space_left -= 16; 715 first_tag = 0; 716 } 717 718 /* If there's no more to do, or if the descriptor is full, 719 let the IO rip! */ 720 721 if (bufs == journal->j_wbufsize || 722 commit_transaction->t_buffers == NULL || 723 space_left < tag_bytes + 16 + csum_size) { 724 725 jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs); 726 727 /* Write an end-of-descriptor marker before 728 submitting the IOs. "tag" still points to 729 the last tag we set up. */ 730 731 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); 732 start_journal_io: 733 if (descriptor) 734 jbd2_descriptor_block_csum_set(journal, 735 descriptor); 736 737 for (i = 0; i < bufs; i++) { 738 struct buffer_head *bh = wbuf[i]; 739 740 /* 741 * Compute checksum. 742 */ 743 if (jbd2_has_feature_checksum(journal)) { 744 crc32_sum = 745 jbd2_checksum_data(crc32_sum, bh); 746 } 747 748 lock_buffer(bh); 749 clear_buffer_dirty(bh); 750 set_buffer_uptodate(bh); 751 bh_submit(bh, 752 REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS, 753 journal_end_buffer_io_sync); 754 } 755 cond_resched(); 756 757 /* Force a new descriptor to be generated next 758 time round the loop. */ 759 descriptor = NULL; 760 bufs = 0; 761 } 762 } 763 764 err = journal_finish_inode_data_buffers(journal, commit_transaction); 765 if (err) { 766 printk(KERN_WARNING 767 "JBD2: Detected IO errors %d while flushing file data on %s\n", 768 err, journal->j_devname); 769 err = 0; 770 } 771 772 /* 773 * Get current oldest transaction in the log before we issue flush 774 * to the filesystem device. After the flush we can be sure that 775 * blocks of all older transactions are checkpointed to persistent 776 * storage and we will be safe to update journal start in the 777 * superblock with the numbers we get here. 778 */ 779 update_tail = 780 jbd2_journal_get_log_tail(journal, &first_tid, &first_block); 781 782 write_lock(&journal->j_state_lock); 783 if (update_tail) { 784 long freed = first_block - journal->j_tail; 785 786 if (first_block < journal->j_tail) 787 freed += journal->j_last - journal->j_first; 788 /* Update tail only if we free significant amount of space */ 789 if (freed < journal->j_max_transaction_buffers) 790 update_tail = 0; 791 } 792 J_ASSERT(commit_transaction->t_state == T_COMMIT); 793 commit_transaction->t_state = T_COMMIT_DFLUSH; 794 write_unlock(&journal->j_state_lock); 795 796 /* 797 * If the journal is not located on the file system device, 798 * then we must flush the file system device before we issue 799 * the commit record and update the journal tail sequence. 800 */ 801 if ((commit_transaction->t_need_data_flush || update_tail) && 802 (journal->j_fs_dev != journal->j_dev) && 803 (journal->j_flags & JBD2_BARRIER)) 804 blkdev_issue_flush(journal->j_fs_dev); 805 806 /* Done it all: now write the commit record asynchronously. */ 807 if (jbd2_has_feature_async_commit(journal)) { 808 err = journal_submit_commit_record(journal, commit_transaction, 809 &cbh, crc32_sum); 810 if (err) 811 jbd2_journal_abort(journal, err); 812 } 813 814 blk_finish_plug(&plug); 815 816 /* Lo and behold: we have just managed to send a transaction to 817 the log. Before we can commit it, wait for the IO so far to 818 complete. Control buffers being written are on the 819 transaction's t_log_list queue, and metadata buffers are on 820 the io_bufs list. 821 822 Wait for the buffers in reverse order. That way we are 823 less likely to be woken up until all IOs have completed, and 824 so we incur less scheduling load. 825 */ 826 827 jbd2_debug(3, "JBD2: commit phase 3\n"); 828 829 while (!list_empty(&io_bufs)) { 830 struct buffer_head *bh = list_entry(io_bufs.prev, 831 struct buffer_head, 832 b_assoc_buffers); 833 834 wait_on_buffer(bh); 835 cond_resched(); 836 837 if (unlikely(!buffer_uptodate(bh))) 838 err = -EIO; 839 jbd2_unfile_log_bh(bh); 840 stats.run.rs_blocks_logged++; 841 842 /* 843 * The list contains temporary buffer heads created by 844 * jbd2_journal_write_metadata_buffer(). 845 */ 846 BUFFER_TRACE(bh, "dumping temporary bh"); 847 __brelse(bh); 848 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 849 free_buffer_head(bh); 850 851 /* We also have to refile the corresponding shadowed buffer */ 852 jh = commit_transaction->t_shadow_list->b_tprev; 853 bh = jh2bh(jh); 854 clear_buffer_jwrite(bh); 855 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 856 J_ASSERT_BH(bh, !buffer_shadow(bh)); 857 858 /* The metadata is now released for reuse, but we need 859 to remember it against this transaction so that when 860 we finally commit, we can do any checkpointing 861 required. */ 862 JBUFFER_TRACE(jh, "file as BJ_Forget"); 863 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 864 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 865 __brelse(bh); 866 } 867 868 J_ASSERT (commit_transaction->t_shadow_list == NULL); 869 870 jbd2_debug(3, "JBD2: commit phase 4\n"); 871 872 /* Here we wait for the revoke record and descriptor record buffers */ 873 while (!list_empty(&log_bufs)) { 874 struct buffer_head *bh; 875 876 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); 877 wait_on_buffer(bh); 878 cond_resched(); 879 880 if (unlikely(!buffer_uptodate(bh))) 881 err = -EIO; 882 883 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 884 clear_buffer_jwrite(bh); 885 jbd2_unfile_log_bh(bh); 886 stats.run.rs_blocks_logged++; 887 __brelse(bh); /* One for getblk */ 888 /* AKPM: bforget here */ 889 } 890 891 if (err) 892 jbd2_journal_abort(journal, err); 893 894 jbd2_debug(3, "JBD2: commit phase 5\n"); 895 write_lock(&journal->j_state_lock); 896 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); 897 commit_transaction->t_state = T_COMMIT_JFLUSH; 898 write_unlock(&journal->j_state_lock); 899 900 if (!jbd2_has_feature_async_commit(journal)) { 901 err = journal_submit_commit_record(journal, commit_transaction, 902 &cbh, crc32_sum); 903 if (err) 904 jbd2_journal_abort(journal, err); 905 } 906 if (cbh) 907 err = journal_wait_on_commit_record(journal, cbh); 908 stats.run.rs_blocks_logged++; 909 if (jbd2_has_feature_async_commit(journal) && 910 journal->j_flags & JBD2_BARRIER) { 911 blkdev_issue_flush(journal->j_dev); 912 } 913 914 if (err) 915 jbd2_journal_abort(journal, err); 916 917 WARN_ON_ONCE( 918 atomic_read(&commit_transaction->t_outstanding_credits) < 0); 919 920 /* 921 * Now disk caches for filesystem device are flushed so we are safe to 922 * erase checkpointed transactions from the log by updating journal 923 * superblock. 924 */ 925 if (update_tail) 926 jbd2_update_log_tail(journal, first_tid, first_block); 927 928 /* End of a transaction! Finally, we can do checkpoint 929 processing: any buffers committed as a result of this 930 transaction can be removed from any checkpoint list it was on 931 before. */ 932 933 jbd2_debug(3, "JBD2: commit phase 6\n"); 934 935 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 936 J_ASSERT(commit_transaction->t_buffers == NULL); 937 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 938 J_ASSERT(commit_transaction->t_shadow_list == NULL); 939 940 restart_loop: 941 /* 942 * As there are other places (journal_unmap_buffer()) adding buffers 943 * to this list we have to be careful and hold the j_list_lock. 944 */ 945 spin_lock(&journal->j_list_lock); 946 while (commit_transaction->t_forget) { 947 transaction_t *cp_transaction; 948 struct buffer_head *bh; 949 int try_to_free = 0; 950 bool drop_ref; 951 952 jh = commit_transaction->t_forget; 953 spin_unlock(&journal->j_list_lock); 954 bh = jh2bh(jh); 955 /* 956 * Get a reference so that bh cannot be freed before we are 957 * done with it. 958 */ 959 get_bh(bh); 960 spin_lock(&jh->b_state_lock); 961 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); 962 963 /* 964 * If there is undo-protected committed data against 965 * this buffer, then we can remove it now. If it is a 966 * buffer needing such protection, the old frozen_data 967 * field now points to a committed version of the 968 * buffer, so rotate that field to the new committed 969 * data. 970 * 971 * Otherwise, we can just throw away the frozen data now. 972 * 973 * We also know that the frozen data has already fired 974 * its triggers if they exist, so we can clear that too. 975 */ 976 if (jh->b_committed_data) { 977 kfree(jh->b_committed_data); 978 jh->b_committed_data = NULL; 979 if (jh->b_frozen_data) { 980 jh->b_committed_data = jh->b_frozen_data; 981 jh->b_frozen_data = NULL; 982 jh->b_frozen_triggers = NULL; 983 } 984 } else if (jh->b_frozen_data) { 985 kfree(jh->b_frozen_data); 986 jh->b_frozen_data = NULL; 987 jh->b_frozen_triggers = NULL; 988 } 989 990 spin_lock(&journal->j_list_lock); 991 cp_transaction = jh->b_cp_transaction; 992 if (cp_transaction) { 993 JBUFFER_TRACE(jh, "remove from old cp transaction"); 994 cp_transaction->t_chp_stats.cs_dropped++; 995 __jbd2_journal_remove_checkpoint(jh); 996 } 997 998 /* Only re-checkpoint the buffer_head if it is marked 999 * dirty. If the buffer was added to the BJ_Forget list 1000 * by jbd2_journal_forget, it may no longer be dirty and 1001 * there's no point in keeping a checkpoint record for 1002 * it. */ 1003 1004 /* 1005 * A buffer which has been freed while still being journaled 1006 * by a previous transaction, refile the buffer to BJ_Forget of 1007 * the running transaction. If the just committed transaction 1008 * contains "add to orphan" operation, we can completely 1009 * invalidate the buffer now. We are rather through in that 1010 * since the buffer may be still accessible when blocksize < 1011 * pagesize and it is attached to the last partial page. 1012 */ 1013 if (buffer_freed(bh) && !jh->b_next_transaction) { 1014 struct address_space *mapping; 1015 1016 clear_buffer_freed(bh); 1017 clear_buffer_jbddirty(bh); 1018 1019 /* 1020 * Block device buffers need to stay mapped all the 1021 * time, so it is enough to clear buffer_jbddirty and 1022 * buffer_freed bits. For the file mapping buffers (i.e. 1023 * journalled data) we need to unmap buffer and clear 1024 * more bits. We also need to be careful about the check 1025 * because the data page mapping can get cleared under 1026 * our hands. Note that if mapping == NULL, we don't 1027 * need to make buffer unmapped because the page is 1028 * already detached from the mapping and buffers cannot 1029 * get reused. 1030 */ 1031 mapping = READ_ONCE(bh->b_folio->mapping); 1032 if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { 1033 clear_buffer_mapped(bh); 1034 clear_buffer_new(bh); 1035 clear_buffer_req(bh); 1036 bh->b_bdev = NULL; 1037 } 1038 } 1039 1040 if (buffer_jbddirty(bh)) { 1041 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 1042 __jbd2_journal_insert_checkpoint(jh, commit_transaction); 1043 if (is_journal_aborted(journal)) 1044 clear_buffer_jbddirty(bh); 1045 } else { 1046 J_ASSERT_BH(bh, !buffer_dirty(bh)); 1047 /* 1048 * The buffer on BJ_Forget list and not jbddirty means 1049 * it has been freed by this transaction and hence it 1050 * could not have been reallocated until this 1051 * transaction has committed. *BUT* it could be 1052 * reallocated once we have written all the data to 1053 * disk and before we process the buffer on BJ_Forget 1054 * list. 1055 */ 1056 if (!jh->b_next_transaction) 1057 try_to_free = 1; 1058 } 1059 JBUFFER_TRACE(jh, "refile or unfile buffer"); 1060 drop_ref = __jbd2_journal_refile_buffer(jh); 1061 spin_unlock(&jh->b_state_lock); 1062 if (drop_ref) 1063 jbd2_journal_put_journal_head(jh); 1064 if (try_to_free) 1065 release_buffer_page(bh); /* Drops bh reference */ 1066 else 1067 __brelse(bh); 1068 cond_resched_lock(&journal->j_list_lock); 1069 } 1070 spin_unlock(&journal->j_list_lock); 1071 /* 1072 * This is a bit sleazy. We use j_list_lock to protect transition 1073 * of a transaction into T_FINISHED state and calling 1074 * __jbd2_journal_drop_transaction(). Otherwise we could race with 1075 * other checkpointing code processing the transaction... 1076 */ 1077 write_lock(&journal->j_state_lock); 1078 spin_lock(&journal->j_list_lock); 1079 /* 1080 * Now recheck if some buffers did not get attached to the transaction 1081 * while the lock was dropped... 1082 */ 1083 if (commit_transaction->t_forget) { 1084 spin_unlock(&journal->j_list_lock); 1085 write_unlock(&journal->j_state_lock); 1086 goto restart_loop; 1087 } 1088 1089 /* Add the transaction to the checkpoint list 1090 * __journal_remove_checkpoint() can not destroy transaction 1091 * under us because it is not marked as T_FINISHED yet */ 1092 if (journal->j_checkpoint_transactions == NULL) { 1093 journal->j_checkpoint_transactions = commit_transaction; 1094 commit_transaction->t_cpnext = commit_transaction; 1095 commit_transaction->t_cpprev = commit_transaction; 1096 } else { 1097 commit_transaction->t_cpnext = 1098 journal->j_checkpoint_transactions; 1099 commit_transaction->t_cpprev = 1100 commit_transaction->t_cpnext->t_cpprev; 1101 commit_transaction->t_cpnext->t_cpprev = 1102 commit_transaction; 1103 commit_transaction->t_cpprev->t_cpnext = 1104 commit_transaction; 1105 } 1106 spin_unlock(&journal->j_list_lock); 1107 1108 /* Done with this transaction! */ 1109 1110 jbd2_debug(3, "JBD2: commit phase 7\n"); 1111 1112 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); 1113 1114 commit_transaction->t_start = jiffies; 1115 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, 1116 commit_transaction->t_start); 1117 1118 /* 1119 * File the transaction statistics 1120 */ 1121 stats.ts_tid = commit_transaction->t_tid; 1122 stats.run.rs_handle_count = 1123 atomic_read(&commit_transaction->t_handle_count); 1124 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1125 commit_transaction->t_tid, &stats.run); 1126 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0; 1127 1128 commit_transaction->t_state = T_COMMIT_CALLBACK; 1129 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1130 WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid); 1131 journal->j_committing_transaction = NULL; 1132 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1133 1134 /* 1135 * weight the commit time higher than the average time so we don't 1136 * react too strongly to vast changes in the commit time 1137 */ 1138 if (likely(journal->j_average_commit_time)) 1139 journal->j_average_commit_time = (commit_time + 1140 journal->j_average_commit_time*3) / 4; 1141 else 1142 journal->j_average_commit_time = commit_time; 1143 1144 write_unlock(&journal->j_state_lock); 1145 1146 if (journal->j_commit_callback) 1147 journal->j_commit_callback(journal, commit_transaction); 1148 if (journal->j_fc_cleanup_callback) 1149 journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); 1150 1151 trace_jbd2_end_commit(journal, commit_transaction); 1152 jbd2_debug(1, "JBD2: commit %d complete, head %d\n", 1153 journal->j_commit_sequence, journal->j_tail_sequence); 1154 1155 write_lock(&journal->j_state_lock); 1156 journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING; 1157 journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; 1158 spin_lock(&journal->j_list_lock); 1159 commit_transaction->t_state = T_FINISHED; 1160 /* Check if the transaction can be dropped now that we are finished */ 1161 if (commit_transaction->t_checkpoint_list == NULL) { 1162 __jbd2_journal_drop_transaction(journal, commit_transaction); 1163 jbd2_journal_free_transaction(commit_transaction); 1164 } 1165 spin_unlock(&journal->j_list_lock); 1166 write_unlock(&journal->j_state_lock); 1167 wake_up(&journal->j_wait_done_commit); 1168 wake_up(&journal->j_fc_wait); 1169 1170 /* 1171 * Calculate overall stats 1172 */ 1173 spin_lock(&journal->j_history_lock); 1174 journal->j_stats.ts_tid++; 1175 journal->j_stats.ts_requested += stats.ts_requested; 1176 journal->j_stats.run.rs_wait += stats.run.rs_wait; 1177 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; 1178 journal->j_stats.run.rs_running += stats.run.rs_running; 1179 journal->j_stats.run.rs_locked += stats.run.rs_locked; 1180 journal->j_stats.run.rs_flushing += stats.run.rs_flushing; 1181 journal->j_stats.run.rs_logging += stats.run.rs_logging; 1182 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; 1183 journal->j_stats.run.rs_blocks += stats.run.rs_blocks; 1184 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; 1185 spin_unlock(&journal->j_history_lock); 1186 } 1187