// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 *
 * @bh:       buffer whose write has completed
 * @uptodate: non-zero if the IO succeeded
 *
 * Records the IO outcome in the buffer's uptodate flag.  If bh->b_private
 * points at another buffer_head, BH_Shadow is cleared on that buffer and
 * anyone waiting on the bit is woken.  Finally @bh itself is unlocked.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	if (orig_bh) {
		/*
		 * Clear the bit, then issue a barrier before the wake-up.
		 * NOTE(review): presumably so a woken waiter is guaranteed
		 * to observe BH_Shadow cleared -- confirm against the
		 * wake_up_bit() requirements.
		 */
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers. These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list. Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under j_list_lock. The caller provided us with a ref against the
 * buffer, and we drop that here.
62 */ 63 static void release_buffer_page(struct buffer_head *bh) 64 { 65 struct folio *folio; 66 67 if (buffer_dirty(bh)) 68 goto nope; 69 if (atomic_read(&bh->b_count) != 1) 70 goto nope; 71 folio = bh->b_folio; 72 if (folio->mapping) 73 goto nope; 74 75 /* OK, it's a truncated page */ 76 if (!folio_trylock(folio)) 77 goto nope; 78 79 folio_get(folio); 80 __brelse(bh); 81 try_to_free_buffers(folio); 82 folio_unlock(folio); 83 folio_put(folio); 84 return; 85 86 nope: 87 __brelse(bh); 88 } 89 90 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) 91 { 92 struct commit_header *h; 93 __u32 csum; 94 95 if (!jbd2_journal_has_csum_v2or3(j)) 96 return; 97 98 h = (struct commit_header *)(bh->b_data); 99 h->h_chksum_type = 0; 100 h->h_chksum_size = 0; 101 h->h_chksum[0] = 0; 102 csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); 103 h->h_chksum[0] = cpu_to_be32(csum); 104 } 105 106 /* 107 * Done it all: now submit the commit record. We should have 108 * cleaned up our previous buffers by now, so if we are in abort 109 * mode we can now just skip the rest of the journal write 110 * entirely. 
111 * 112 * Returns 1 if the journal needs to be aborted or 0 on success 113 */ 114 static int journal_submit_commit_record(journal_t *journal, 115 transaction_t *commit_transaction, 116 struct buffer_head **cbh, 117 __u32 crc32_sum) 118 { 119 struct commit_header *tmp; 120 struct buffer_head *bh; 121 struct timespec64 now; 122 blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS; 123 124 *cbh = NULL; 125 126 if (is_journal_aborted(journal)) 127 return 0; 128 129 bh = jbd2_journal_get_descriptor_buffer(commit_transaction, 130 JBD2_COMMIT_BLOCK); 131 if (!bh) 132 return 1; 133 134 tmp = (struct commit_header *)bh->b_data; 135 ktime_get_coarse_real_ts64(&now); 136 tmp->h_commit_sec = cpu_to_be64(now.tv_sec); 137 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); 138 139 if (jbd2_has_feature_checksum(journal)) { 140 tmp->h_chksum_type = JBD2_CRC32_CHKSUM; 141 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 142 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 143 } 144 jbd2_commit_block_csum_set(journal, bh); 145 146 BUFFER_TRACE(bh, "submit commit block"); 147 lock_buffer(bh); 148 clear_buffer_dirty(bh); 149 set_buffer_uptodate(bh); 150 bh->b_end_io = journal_end_buffer_io_sync; 151 152 if (journal->j_flags & JBD2_BARRIER && 153 !jbd2_has_feature_async_commit(journal)) 154 write_flags |= REQ_PREFLUSH | REQ_FUA; 155 156 submit_bh(write_flags, bh); 157 *cbh = bh; 158 return 0; 159 } 160 161 /* 162 * This function along with journal_submit_commit_record 163 * allows to write the commit record asynchronously. 
164 */ 165 static int journal_wait_on_commit_record(journal_t *journal, 166 struct buffer_head *bh) 167 { 168 int ret = 0; 169 170 clear_buffer_dirty(bh); 171 wait_on_buffer(bh); 172 173 if (unlikely(!buffer_uptodate(bh))) 174 ret = -EIO; 175 put_bh(bh); /* One for getblk() */ 176 177 return ret; 178 } 179 180 /* Send all the data buffers related to an inode */ 181 int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) 182 { 183 unsigned long flags; 184 185 if (!jinode) 186 return 0; 187 188 flags = READ_ONCE(jinode->i_flags); 189 if (!(flags & JI_WRITE_DATA)) 190 return 0; 191 192 trace_jbd2_submit_inode_data(jinode->i_vfs_inode); 193 return journal->j_submit_inode_data_buffers(jinode); 194 195 } 196 EXPORT_SYMBOL(jbd2_submit_inode_data); 197 198 int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) 199 { 200 struct address_space *mapping; 201 struct inode *inode; 202 unsigned long flags; 203 loff_t start_byte, end_byte; 204 205 if (!jinode) 206 return 0; 207 208 flags = READ_ONCE(jinode->i_flags); 209 if (!(flags & JI_WAIT_DATA)) 210 return 0; 211 212 inode = jinode->i_vfs_inode; 213 if (!inode) 214 return 0; 215 216 mapping = inode->i_mapping; 217 if (!mapping) 218 return 0; 219 220 if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) 221 return 0; 222 return filemap_fdatawait_range_keep_errors( 223 mapping, start_byte, end_byte); 224 } 225 EXPORT_SYMBOL(jbd2_wait_inode_data); 226 227 /* 228 * Submit all the data buffers of inode associated with the transaction to 229 * disk. 230 * 231 * We are in a committing transaction. Therefore no new inode can be added to 232 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently 233 * operate on from being released while we write out pages. 
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		/* Only inodes flagged for data writeout are submitted. */
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		/*
		 * Mark the inode busy before j_list_lock is dropped for the
		 * duration of the callback.
		 */
		WRITE_ONCE(jinode->i_flags,
			   jinode->i_flags | JI_COMMIT_RUNNING);
		spin_unlock(&journal->j_list_lock);
		/* submit the inode data buffers. */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		if (journal->j_submit_inode_data_buffers) {
			err = journal->j_submit_inode_data_buffers(jinode);
			/* Keep the first error; later ones do not replace it. */
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		/* Done with this inode: clear the flag and wake any waiter. */
		WRITE_ONCE(jinode->i_flags,
			   jinode->i_flags & ~JI_COMMIT_RUNNING);
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

/*
 * Wait for writeback of the data range recorded in @jinode.
 *
 * Returns 0 when jbd2_jinode_get_dirty_range() returns false; otherwise
 * returns the result of filemap_fdatawait_range_keep_errors() on the
 * inode's mapping over the reported byte range.  Unlike
 * jbd2_wait_inode_data(), @jinode and its VFS inode are dereferenced
 * without NULL checks.
 */
int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	loff_t start_byte, end_byte;

	if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte))
		return 0;

	return filemap_fdatawait_range_keep_errors(mapping,
						   start_byte, end_byte);
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 *
 * Returns the first non-zero value returned by the journal's
 * j_finish_inode_data_buffers callback, or 0.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		/* Mark the inode busy while j_list_lock is dropped below. */
		WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING);
		spin_unlock(&journal->j_list_lock);
		/* wait for the inode data buffers writeout. */
		if (journal->j_finish_inode_data_buffers) {
			err = journal->j_finish_inode_data_buffers(jinode);
			/* Keep the first error; later ones do not replace it. */
			if (!ret)
				ret = err;
		}
		cond_resched();
		spin_lock(&journal->j_list_lock);
		WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING);
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			/* Hand the inode over to the transaction queued for it. */
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			/* No further transaction: detach and reset the range. */
			jinode->i_transaction = NULL;
			WRITE_ONCE(jinode->i_dirty_start_page, 0);
			WRITE_ONCE(jinode->i_dirty_end_page, 0);
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

/*
 * Fold the contents of @bh into a running crc32_be() checksum.
 *
 * The buffer's data (b_size bytes at the buffer's offset within its folio)
 * is mapped for the duration of the computation.  Returns the updated
 * checksum; @crc32_sum is the value to continue from.
 */
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	char *addr;
	__u32 checksum;

	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
	checksum = crc32_be(crc32_sum, addr, bh->b_size);
	kunmap_local(addr);

	return checksum;
}

/*
 * Store block number @block in tag @tag.
 *
 * The low 32 bits always go to t_blocknr.  The upper 32 bits are stored in
 * t_blocknr_high only when the journal has the 64bit feature.
 */
static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	/*
	 * (block >> 31) >> 1 yields the same value as block >> 32 here.
	 * NOTE(review): presumably split in two so the shift stays
	 * well-defined if the operand were ever only 32 bits wide.
	 */
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

/*
 * Set the per-block checksum in a descriptor tag.
 *
 * Does nothing unless the journal uses v2 or v3 checksums.  The checksum
 * is seeded with j_csum_seed and covers the big-endian @sequence followed
 * by the b_size bytes of @bh.  With the csum3 feature the full 32-bit value
 * is stored through the tag3 layout; otherwise it is stored via
 * cpu_to_be16(), so only the low 16 bits are kept.
 */
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
	csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(csum32, addr, bh->b_size);
	kunmap_local(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log. This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int escape;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current
transaction and wait for 413 * all outstanding updates to complete. 414 */ 415 416 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 417 if (journal->j_flags & JBD2_FLUSHED) { 418 jbd2_debug(3, "super block updated\n"); 419 mutex_lock_io(&journal->j_checkpoint_mutex); 420 /* 421 * We hold j_checkpoint_mutex so tail cannot change under us. 422 * We don't need any special data guarantees for writing sb 423 * since journal is empty and it is ok for write to be 424 * flushed only with transaction commit. 425 */ 426 jbd2_journal_update_sb_log_tail(journal, 427 journal->j_tail_sequence, 428 journal->j_tail, 0); 429 mutex_unlock(&journal->j_checkpoint_mutex); 430 } else { 431 jbd2_debug(3, "superblock not updated\n"); 432 } 433 434 J_ASSERT(journal->j_running_transaction != NULL); 435 J_ASSERT(journal->j_committing_transaction == NULL); 436 437 write_lock(&journal->j_state_lock); 438 journal->j_flags |= JBD2_FULL_COMMIT_ONGOING; 439 while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) { 440 DEFINE_WAIT(wait); 441 442 prepare_to_wait(&journal->j_fc_wait, &wait, 443 TASK_UNINTERRUPTIBLE); 444 write_unlock(&journal->j_state_lock); 445 schedule(); 446 write_lock(&journal->j_state_lock); 447 finish_wait(&journal->j_fc_wait, &wait); 448 /* 449 * TODO: by blocking fast commits here, we are increasing 450 * fsync() latency slightly. Strictly speaking, we don't need 451 * to block fast commits until the transaction enters T_FLUSH 452 * state. So an optimization is possible where we block new fast 453 * commits here and wait for existing ones to complete 454 * just before we enter T_FLUSH. That way, the existing fast 455 * commits and this full commit can proceed parallely. 
456 */ 457 } 458 write_unlock(&journal->j_state_lock); 459 460 commit_transaction = journal->j_running_transaction; 461 462 trace_jbd2_start_commit(journal, commit_transaction); 463 jbd2_debug(1, "JBD2: starting commit of transaction %d\n", 464 commit_transaction->t_tid); 465 466 write_lock(&journal->j_state_lock); 467 journal->j_fc_off = 0; 468 J_ASSERT(commit_transaction->t_state == T_RUNNING); 469 commit_transaction->t_state = T_LOCKED; 470 471 trace_jbd2_commit_locking(journal, commit_transaction); 472 stats.run.rs_wait = commit_transaction->t_max_wait; 473 stats.run.rs_request_delay = 0; 474 stats.run.rs_locked = jiffies; 475 if (commit_transaction->t_requested) 476 stats.run.rs_request_delay = 477 jbd2_time_diff(commit_transaction->t_requested, 478 stats.run.rs_locked); 479 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 480 stats.run.rs_locked); 481 482 // waits for any t_updates to finish 483 jbd2_journal_wait_updates(journal); 484 485 commit_transaction->t_state = T_SWITCH; 486 487 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= 488 journal->j_max_transaction_buffers); 489 490 /* 491 * First thing we are allowed to do is to discard any remaining 492 * BJ_Reserved buffers. Note, it is _not_ permissible to assume 493 * that there are no such buffers: if a large filesystem 494 * operation like a truncate needs to split itself over multiple 495 * transactions, then it may try to do a jbd2_journal_restart() while 496 * there are still BJ_Reserved buffers outstanding. These must 497 * be released cleanly from the current transaction. 498 * 499 * In this case, the filesystem must still reserve write access 500 * again before modifying the buffer in the new transaction, but 501 * we do not require it to remember exactly which old buffers it 502 * has reserved. This is consistent with the existing behaviour 503 * that multiple jbd2_journal_get_write_access() calls to the same 504 * buffer are perfectly permissible. 
505 * We use journal->j_state_lock here to serialize processing of 506 * t_reserved_list with eviction of buffers from journal_unmap_buffer(). 507 */ 508 while (commit_transaction->t_reserved_list) { 509 jh = commit_transaction->t_reserved_list; 510 JBUFFER_TRACE(jh, "reserved, unused: refile"); 511 /* 512 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may 513 * leave undo-committed data. 514 */ 515 if (jh->b_committed_data) { 516 struct buffer_head *bh = jh2bh(jh); 517 518 spin_lock(&jh->b_state_lock); 519 jbd2_free(jh->b_committed_data, bh->b_size); 520 jh->b_committed_data = NULL; 521 spin_unlock(&jh->b_state_lock); 522 } 523 jbd2_journal_refile_buffer(journal, jh); 524 } 525 526 write_unlock(&journal->j_state_lock); 527 /* 528 * Now try to drop any written-back buffers from the journal's 529 * checkpoint lists. We do this *before* commit because it potentially 530 * frees some memory 531 */ 532 spin_lock(&journal->j_list_lock); 533 __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP); 534 spin_unlock(&journal->j_list_lock); 535 536 jbd2_debug(3, "JBD2: commit phase 1\n"); 537 538 /* 539 * Clear revoked flag to reflect there is no revoked buffers 540 * in the next transaction which is going to be started. 541 */ 542 jbd2_clear_buffer_revoked_flags(journal); 543 544 /* 545 * Switch to a new revoke table. 
546 */ 547 jbd2_journal_switch_revoke_table(journal); 548 549 write_lock(&journal->j_state_lock); 550 /* 551 * Reserved credits cannot be claimed anymore, free them 552 */ 553 atomic_sub(atomic_read(&journal->j_reserved_credits), 554 &commit_transaction->t_outstanding_credits); 555 556 trace_jbd2_commit_flushing(journal, commit_transaction); 557 stats.run.rs_flushing = jiffies; 558 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, 559 stats.run.rs_flushing); 560 561 commit_transaction->t_state = T_FLUSH; 562 journal->j_committing_transaction = commit_transaction; 563 journal->j_running_transaction = NULL; 564 start_time = ktime_get(); 565 commit_transaction->t_log_start = journal->j_head; 566 wake_up_all(&journal->j_wait_transaction_locked); 567 write_unlock(&journal->j_state_lock); 568 569 jbd2_debug(3, "JBD2: commit phase 2a\n"); 570 571 /* 572 * Now start flushing things to disk, in the order they appear 573 * on the transaction lists. Data blocks go first. 574 */ 575 err = journal_submit_data_buffers(journal, commit_transaction); 576 if (err) 577 jbd2_journal_abort(journal, err); 578 579 blk_start_plug(&plug); 580 jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); 581 582 jbd2_debug(3, "JBD2: commit phase 2b\n"); 583 584 /* 585 * Way to go: we have now written out all of the data for a 586 * transaction! Now comes the tricky part: we need to write out 587 * metadata. 
Loop over the transaction's entire buffer list: 588 */ 589 write_lock(&journal->j_state_lock); 590 commit_transaction->t_state = T_COMMIT; 591 write_unlock(&journal->j_state_lock); 592 593 trace_jbd2_commit_logging(journal, commit_transaction); 594 stats.run.rs_logging = jiffies; 595 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, 596 stats.run.rs_logging); 597 stats.run.rs_blocks = commit_transaction->t_nr_buffers; 598 stats.run.rs_blocks_logged = 0; 599 600 J_ASSERT(commit_transaction->t_nr_buffers <= 601 atomic_read(&commit_transaction->t_outstanding_credits)); 602 603 bufs = 0; 604 descriptor = NULL; 605 while (commit_transaction->t_buffers) { 606 607 /* Find the next buffer to be journaled... */ 608 609 jh = commit_transaction->t_buffers; 610 611 /* If we're in abort mode, we just un-journal the buffer and 612 release it. */ 613 614 if (is_journal_aborted(journal)) { 615 clear_buffer_jbddirty(jh2bh(jh)); 616 JBUFFER_TRACE(jh, "journal is aborting: refile"); 617 jbd2_buffer_abort_trigger(jh, 618 jh->b_frozen_data ? 619 jh->b_frozen_triggers : 620 jh->b_triggers); 621 jbd2_journal_refile_buffer(journal, jh); 622 /* If that was the last one, we need to clean up 623 * any descriptor buffers which may have been 624 * already allocated, even if we are now 625 * aborting. */ 626 if (!commit_transaction->t_buffers) 627 goto start_journal_io; 628 continue; 629 } 630 631 /* Make sure we have a descriptor block in which to 632 record the metadata buffer. 
*/ 633 634 if (!descriptor) { 635 J_ASSERT (bufs == 0); 636 637 jbd2_debug(4, "JBD2: get descriptor\n"); 638 639 descriptor = jbd2_journal_get_descriptor_buffer( 640 commit_transaction, 641 JBD2_DESCRIPTOR_BLOCK); 642 if (!descriptor) { 643 jbd2_journal_abort(journal, -EIO); 644 continue; 645 } 646 647 jbd2_debug(4, "JBD2: got buffer %llu (%p)\n", 648 (unsigned long long)descriptor->b_blocknr, 649 descriptor->b_data); 650 tagp = &descriptor->b_data[sizeof(journal_header_t)]; 651 space_left = descriptor->b_size - 652 sizeof(journal_header_t); 653 first_tag = 1; 654 set_buffer_jwrite(descriptor); 655 set_buffer_dirty(descriptor); 656 wbuf[bufs++] = descriptor; 657 658 /* Record it so that we can wait for IO 659 completion later */ 660 BUFFER_TRACE(descriptor, "ph3: file as descriptor"); 661 jbd2_file_log_bh(&log_bufs, descriptor); 662 } 663 664 /* Where is the buffer to be written? */ 665 666 err = jbd2_journal_next_log_block(journal, &blocknr); 667 /* If the block mapping failed, just abandon the buffer 668 and repeat this loop: we'll fall into the 669 refile-on-abort condition above. */ 670 if (err) { 671 jbd2_journal_abort(journal, err); 672 continue; 673 } 674 675 /* 676 * start_this_handle() uses t_outstanding_credits to determine 677 * the free space in the log. 678 */ 679 atomic_dec(&commit_transaction->t_outstanding_credits); 680 681 /* Bump b_count to prevent truncate from stumbling over 682 the shadowed buffer! @@@ This can go if we ever get 683 rid of the shadow pairing of buffers. */ 684 atomic_inc(&jh2bh(jh)->b_count); 685 686 /* 687 * Make a temporary IO buffer with which to write it out 688 * (this will requeue the metadata buffer to BJ_Shadow). 
689 */ 690 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 691 JBUFFER_TRACE(jh, "ph3: write metadata"); 692 escape = jbd2_journal_write_metadata_buffer(commit_transaction, 693 jh, &wbuf[bufs], blocknr); 694 jbd2_file_log_bh(&io_bufs, wbuf[bufs]); 695 696 /* Record the new block's tag in the current descriptor 697 buffer */ 698 699 tag_flag = 0; 700 if (escape) 701 tag_flag |= JBD2_FLAG_ESCAPE; 702 if (!first_tag) 703 tag_flag |= JBD2_FLAG_SAME_UUID; 704 705 tag = (journal_block_tag_t *) tagp; 706 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); 707 tag->t_flags = cpu_to_be16(tag_flag); 708 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], 709 commit_transaction->t_tid); 710 tagp += tag_bytes; 711 space_left -= tag_bytes; 712 bufs++; 713 714 if (first_tag) { 715 memcpy (tagp, journal->j_uuid, 16); 716 tagp += 16; 717 space_left -= 16; 718 first_tag = 0; 719 } 720 721 /* If there's no more to do, or if the descriptor is full, 722 let the IO rip! */ 723 724 if (bufs == journal->j_wbufsize || 725 commit_transaction->t_buffers == NULL || 726 space_left < tag_bytes + 16 + csum_size) { 727 728 jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs); 729 730 /* Write an end-of-descriptor marker before 731 submitting the IOs. "tag" still points to 732 the last tag we set up. */ 733 734 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); 735 start_journal_io: 736 if (descriptor) 737 jbd2_descriptor_block_csum_set(journal, 738 descriptor); 739 740 for (i = 0; i < bufs; i++) { 741 struct buffer_head *bh = wbuf[i]; 742 743 /* 744 * Compute checksum. 745 */ 746 if (jbd2_has_feature_checksum(journal)) { 747 crc32_sum = 748 jbd2_checksum_data(crc32_sum, bh); 749 } 750 751 lock_buffer(bh); 752 clear_buffer_dirty(bh); 753 set_buffer_uptodate(bh); 754 bh->b_end_io = journal_end_buffer_io_sync; 755 submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS, 756 bh); 757 } 758 cond_resched(); 759 760 /* Force a new descriptor to be generated next 761 time round the loop. 
*/ 762 descriptor = NULL; 763 bufs = 0; 764 } 765 } 766 767 err = journal_finish_inode_data_buffers(journal, commit_transaction); 768 if (err) { 769 printk(KERN_WARNING 770 "JBD2: Detected IO errors %d while flushing file data on %s\n", 771 err, journal->j_devname); 772 err = 0; 773 } 774 775 /* 776 * Get current oldest transaction in the log before we issue flush 777 * to the filesystem device. After the flush we can be sure that 778 * blocks of all older transactions are checkpointed to persistent 779 * storage and we will be safe to update journal start in the 780 * superblock with the numbers we get here. 781 */ 782 update_tail = 783 jbd2_journal_get_log_tail(journal, &first_tid, &first_block); 784 785 write_lock(&journal->j_state_lock); 786 if (update_tail) { 787 long freed = first_block - journal->j_tail; 788 789 if (first_block < journal->j_tail) 790 freed += journal->j_last - journal->j_first; 791 /* Update tail only if we free significant amount of space */ 792 if (freed < journal->j_max_transaction_buffers) 793 update_tail = 0; 794 } 795 J_ASSERT(commit_transaction->t_state == T_COMMIT); 796 commit_transaction->t_state = T_COMMIT_DFLUSH; 797 write_unlock(&journal->j_state_lock); 798 799 /* 800 * If the journal is not located on the file system device, 801 * then we must flush the file system device before we issue 802 * the commit record and update the journal tail sequence. 803 */ 804 if ((commit_transaction->t_need_data_flush || update_tail) && 805 (journal->j_fs_dev != journal->j_dev) && 806 (journal->j_flags & JBD2_BARRIER)) 807 blkdev_issue_flush(journal->j_fs_dev); 808 809 /* Done it all: now write the commit record asynchronously. */ 810 if (jbd2_has_feature_async_commit(journal)) { 811 err = journal_submit_commit_record(journal, commit_transaction, 812 &cbh, crc32_sum); 813 if (err) 814 jbd2_journal_abort(journal, err); 815 } 816 817 blk_finish_plug(&plug); 818 819 /* Lo and behold: we have just managed to send a transaction to 820 the log. 
Before we can commit it, wait for the IO so far to 821 complete. Control buffers being written are on the 822 transaction's t_log_list queue, and metadata buffers are on 823 the io_bufs list. 824 825 Wait for the buffers in reverse order. That way we are 826 less likely to be woken up until all IOs have completed, and 827 so we incur less scheduling load. 828 */ 829 830 jbd2_debug(3, "JBD2: commit phase 3\n"); 831 832 while (!list_empty(&io_bufs)) { 833 struct buffer_head *bh = list_entry(io_bufs.prev, 834 struct buffer_head, 835 b_assoc_buffers); 836 837 wait_on_buffer(bh); 838 cond_resched(); 839 840 if (unlikely(!buffer_uptodate(bh))) 841 err = -EIO; 842 jbd2_unfile_log_bh(bh); 843 stats.run.rs_blocks_logged++; 844 845 /* 846 * The list contains temporary buffer heads created by 847 * jbd2_journal_write_metadata_buffer(). 848 */ 849 BUFFER_TRACE(bh, "dumping temporary bh"); 850 __brelse(bh); 851 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 852 free_buffer_head(bh); 853 854 /* We also have to refile the corresponding shadowed buffer */ 855 jh = commit_transaction->t_shadow_list->b_tprev; 856 bh = jh2bh(jh); 857 clear_buffer_jwrite(bh); 858 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 859 J_ASSERT_BH(bh, !buffer_shadow(bh)); 860 861 /* The metadata is now released for reuse, but we need 862 to remember it against this transaction so that when 863 we finally commit, we can do any checkpointing 864 required. 
*/ 865 JBUFFER_TRACE(jh, "file as BJ_Forget"); 866 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 867 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 868 __brelse(bh); 869 } 870 871 J_ASSERT (commit_transaction->t_shadow_list == NULL); 872 873 jbd2_debug(3, "JBD2: commit phase 4\n"); 874 875 /* Here we wait for the revoke record and descriptor record buffers */ 876 while (!list_empty(&log_bufs)) { 877 struct buffer_head *bh; 878 879 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); 880 wait_on_buffer(bh); 881 cond_resched(); 882 883 if (unlikely(!buffer_uptodate(bh))) 884 err = -EIO; 885 886 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 887 clear_buffer_jwrite(bh); 888 jbd2_unfile_log_bh(bh); 889 stats.run.rs_blocks_logged++; 890 __brelse(bh); /* One for getblk */ 891 /* AKPM: bforget here */ 892 } 893 894 if (err) 895 jbd2_journal_abort(journal, err); 896 897 jbd2_debug(3, "JBD2: commit phase 5\n"); 898 write_lock(&journal->j_state_lock); 899 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); 900 commit_transaction->t_state = T_COMMIT_JFLUSH; 901 write_unlock(&journal->j_state_lock); 902 903 if (!jbd2_has_feature_async_commit(journal)) { 904 err = journal_submit_commit_record(journal, commit_transaction, 905 &cbh, crc32_sum); 906 if (err) 907 jbd2_journal_abort(journal, err); 908 } 909 if (cbh) 910 err = journal_wait_on_commit_record(journal, cbh); 911 stats.run.rs_blocks_logged++; 912 if (jbd2_has_feature_async_commit(journal) && 913 journal->j_flags & JBD2_BARRIER) { 914 blkdev_issue_flush(journal->j_dev); 915 } 916 917 if (err) 918 jbd2_journal_abort(journal, err); 919 920 WARN_ON_ONCE( 921 atomic_read(&commit_transaction->t_outstanding_credits) < 0); 922 923 /* 924 * Now disk caches for filesystem device are flushed so we are safe to 925 * erase checkpointed transactions from the log by updating journal 926 * superblock. 
927 */ 928 if (update_tail) 929 jbd2_update_log_tail(journal, first_tid, first_block); 930 931 /* End of a transaction! Finally, we can do checkpoint 932 processing: any buffers committed as a result of this 933 transaction can be removed from any checkpoint list it was on 934 before. */ 935 936 jbd2_debug(3, "JBD2: commit phase 6\n"); 937 938 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 939 J_ASSERT(commit_transaction->t_buffers == NULL); 940 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 941 J_ASSERT(commit_transaction->t_shadow_list == NULL); 942 943 restart_loop: 944 /* 945 * As there are other places (journal_unmap_buffer()) adding buffers 946 * to this list we have to be careful and hold the j_list_lock. 947 */ 948 spin_lock(&journal->j_list_lock); 949 while (commit_transaction->t_forget) { 950 transaction_t *cp_transaction; 951 struct buffer_head *bh; 952 int try_to_free = 0; 953 bool drop_ref; 954 955 jh = commit_transaction->t_forget; 956 spin_unlock(&journal->j_list_lock); 957 bh = jh2bh(jh); 958 /* 959 * Get a reference so that bh cannot be freed before we are 960 * done with it. 961 */ 962 get_bh(bh); 963 spin_lock(&jh->b_state_lock); 964 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); 965 966 /* 967 * If there is undo-protected committed data against 968 * this buffer, then we can remove it now. If it is a 969 * buffer needing such protection, the old frozen_data 970 * field now points to a committed version of the 971 * buffer, so rotate that field to the new committed 972 * data. 973 * 974 * Otherwise, we can just throw away the frozen data now. 975 * 976 * We also know that the frozen data has already fired 977 * its triggers if they exist, so we can clear that too. 
978 */ 979 if (jh->b_committed_data) { 980 jbd2_free(jh->b_committed_data, bh->b_size); 981 jh->b_committed_data = NULL; 982 if (jh->b_frozen_data) { 983 jh->b_committed_data = jh->b_frozen_data; 984 jh->b_frozen_data = NULL; 985 jh->b_frozen_triggers = NULL; 986 } 987 } else if (jh->b_frozen_data) { 988 jbd2_free(jh->b_frozen_data, bh->b_size); 989 jh->b_frozen_data = NULL; 990 jh->b_frozen_triggers = NULL; 991 } 992 993 spin_lock(&journal->j_list_lock); 994 cp_transaction = jh->b_cp_transaction; 995 if (cp_transaction) { 996 JBUFFER_TRACE(jh, "remove from old cp transaction"); 997 cp_transaction->t_chp_stats.cs_dropped++; 998 __jbd2_journal_remove_checkpoint(jh); 999 } 1000 1001 /* Only re-checkpoint the buffer_head if it is marked 1002 * dirty. If the buffer was added to the BJ_Forget list 1003 * by jbd2_journal_forget, it may no longer be dirty and 1004 * there's no point in keeping a checkpoint record for 1005 * it. */ 1006 1007 /* 1008 * A buffer which has been freed while still being journaled 1009 * by a previous transaction, refile the buffer to BJ_Forget of 1010 * the running transaction. If the just committed transaction 1011 * contains "add to orphan" operation, we can completely 1012 * invalidate the buffer now. We are rather through in that 1013 * since the buffer may be still accessible when blocksize < 1014 * pagesize and it is attached to the last partial page. 1015 */ 1016 if (buffer_freed(bh) && !jh->b_next_transaction) { 1017 struct address_space *mapping; 1018 1019 clear_buffer_freed(bh); 1020 clear_buffer_jbddirty(bh); 1021 1022 /* 1023 * Block device buffers need to stay mapped all the 1024 * time, so it is enough to clear buffer_jbddirty and 1025 * buffer_freed bits. For the file mapping buffers (i.e. 1026 * journalled data) we need to unmap buffer and clear 1027 * more bits. We also need to be careful about the check 1028 * because the data page mapping can get cleared under 1029 * our hands. 
Note that if mapping == NULL, we don't 1030 * need to make buffer unmapped because the page is 1031 * already detached from the mapping and buffers cannot 1032 * get reused. 1033 */ 1034 mapping = READ_ONCE(bh->b_folio->mapping); 1035 if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { 1036 clear_buffer_mapped(bh); 1037 clear_buffer_new(bh); 1038 clear_buffer_req(bh); 1039 bh->b_bdev = NULL; 1040 } 1041 } 1042 1043 if (buffer_jbddirty(bh)) { 1044 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 1045 __jbd2_journal_insert_checkpoint(jh, commit_transaction); 1046 if (is_journal_aborted(journal)) 1047 clear_buffer_jbddirty(bh); 1048 } else { 1049 J_ASSERT_BH(bh, !buffer_dirty(bh)); 1050 /* 1051 * The buffer on BJ_Forget list and not jbddirty means 1052 * it has been freed by this transaction and hence it 1053 * could not have been reallocated until this 1054 * transaction has committed. *BUT* it could be 1055 * reallocated once we have written all the data to 1056 * disk and before we process the buffer on BJ_Forget 1057 * list. 1058 */ 1059 if (!jh->b_next_transaction) 1060 try_to_free = 1; 1061 } 1062 JBUFFER_TRACE(jh, "refile or unfile buffer"); 1063 drop_ref = __jbd2_journal_refile_buffer(jh); 1064 spin_unlock(&jh->b_state_lock); 1065 if (drop_ref) 1066 jbd2_journal_put_journal_head(jh); 1067 if (try_to_free) 1068 release_buffer_page(bh); /* Drops bh reference */ 1069 else 1070 __brelse(bh); 1071 cond_resched_lock(&journal->j_list_lock); 1072 } 1073 spin_unlock(&journal->j_list_lock); 1074 /* 1075 * This is a bit sleazy. We use j_list_lock to protect transition 1076 * of a transaction into T_FINISHED state and calling 1077 * __jbd2_journal_drop_transaction(). Otherwise we could race with 1078 * other checkpointing code processing the transaction... 1079 */ 1080 write_lock(&journal->j_state_lock); 1081 spin_lock(&journal->j_list_lock); 1082 /* 1083 * Now recheck if some buffers did not get attached to the transaction 1084 * while the lock was dropped... 
1085 */ 1086 if (commit_transaction->t_forget) { 1087 spin_unlock(&journal->j_list_lock); 1088 write_unlock(&journal->j_state_lock); 1089 goto restart_loop; 1090 } 1091 1092 /* Add the transaction to the checkpoint list 1093 * __journal_remove_checkpoint() can not destroy transaction 1094 * under us because it is not marked as T_FINISHED yet */ 1095 if (journal->j_checkpoint_transactions == NULL) { 1096 journal->j_checkpoint_transactions = commit_transaction; 1097 commit_transaction->t_cpnext = commit_transaction; 1098 commit_transaction->t_cpprev = commit_transaction; 1099 } else { 1100 commit_transaction->t_cpnext = 1101 journal->j_checkpoint_transactions; 1102 commit_transaction->t_cpprev = 1103 commit_transaction->t_cpnext->t_cpprev; 1104 commit_transaction->t_cpnext->t_cpprev = 1105 commit_transaction; 1106 commit_transaction->t_cpprev->t_cpnext = 1107 commit_transaction; 1108 } 1109 spin_unlock(&journal->j_list_lock); 1110 1111 /* Done with this transaction! */ 1112 1113 jbd2_debug(3, "JBD2: commit phase 7\n"); 1114 1115 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); 1116 1117 commit_transaction->t_start = jiffies; 1118 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, 1119 commit_transaction->t_start); 1120 1121 /* 1122 * File the transaction statistics 1123 */ 1124 stats.ts_tid = commit_transaction->t_tid; 1125 stats.run.rs_handle_count = 1126 atomic_read(&commit_transaction->t_handle_count); 1127 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1128 commit_transaction->t_tid, &stats.run); 1129 stats.ts_requested = (commit_transaction->t_requested) ? 
1 : 0; 1130 1131 commit_transaction->t_state = T_COMMIT_CALLBACK; 1132 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1133 WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid); 1134 journal->j_committing_transaction = NULL; 1135 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1136 1137 /* 1138 * weight the commit time higher than the average time so we don't 1139 * react too strongly to vast changes in the commit time 1140 */ 1141 if (likely(journal->j_average_commit_time)) 1142 journal->j_average_commit_time = (commit_time + 1143 journal->j_average_commit_time*3) / 4; 1144 else 1145 journal->j_average_commit_time = commit_time; 1146 1147 write_unlock(&journal->j_state_lock); 1148 1149 if (journal->j_commit_callback) 1150 journal->j_commit_callback(journal, commit_transaction); 1151 if (journal->j_fc_cleanup_callback) 1152 journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); 1153 1154 trace_jbd2_end_commit(journal, commit_transaction); 1155 jbd2_debug(1, "JBD2: commit %d complete, head %d\n", 1156 journal->j_commit_sequence, journal->j_tail_sequence); 1157 1158 write_lock(&journal->j_state_lock); 1159 journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING; 1160 journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; 1161 spin_lock(&journal->j_list_lock); 1162 commit_transaction->t_state = T_FINISHED; 1163 /* Check if the transaction can be dropped now that we are finished */ 1164 if (commit_transaction->t_checkpoint_list == NULL) { 1165 __jbd2_journal_drop_transaction(journal, commit_transaction); 1166 jbd2_journal_free_transaction(commit_transaction); 1167 } 1168 spin_unlock(&journal->j_list_lock); 1169 write_unlock(&journal->j_state_lock); 1170 wake_up(&journal->j_wait_done_commit); 1171 wake_up(&journal->j_fc_wait); 1172 1173 /* 1174 * Calculate overall stats 1175 */ 1176 spin_lock(&journal->j_history_lock); 1177 journal->j_stats.ts_tid++; 1178 journal->j_stats.ts_requested += stats.ts_requested; 1179 
journal->j_stats.run.rs_wait += stats.run.rs_wait; 1180 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; 1181 journal->j_stats.run.rs_running += stats.run.rs_running; 1182 journal->j_stats.run.rs_locked += stats.run.rs_locked; 1183 journal->j_stats.run.rs_flushing += stats.run.rs_flushing; 1184 journal->j_stats.run.rs_logging += stats.run.rs_logging; 1185 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; 1186 journal->j_stats.run.rs_blocks += stats.run.rs_blocks; 1187 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; 1188 spin_unlock(&journal->j_history_lock); 1189 } 1190