1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * linux/fs/jbd2/commit.c 4 * 5 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 6 * 7 * Copyright 1998 Red Hat corp --- All Rights Reserved 8 * 9 * Journal commit routines for the generic filesystem journaling code; 10 * part of the ext2fs journaling system. 11 */ 12 13 #include <linux/time.h> 14 #include <linux/fs.h> 15 #include <linux/jbd2.h> 16 #include <linux/errno.h> 17 #include <linux/slab.h> 18 #include <linux/mm.h> 19 #include <linux/pagemap.h> 20 #include <linux/jiffies.h> 21 #include <linux/crc32.h> 22 #include <linux/writeback.h> 23 #include <linux/backing-dev.h> 24 #include <linux/bio.h> 25 #include <linux/blkdev.h> 26 #include <linux/bitops.h> 27 #include <trace/events/jbd2.h> 28 29 /* 30 * IO end handler for temporary buffer_heads handling writes to the journal. 31 */ 32 static void journal_end_buffer_io_sync(struct bio *bio) 33 { 34 struct buffer_head *bh; 35 bool uptodate = bio_endio_bh(bio, &bh); 36 struct buffer_head *orig_bh = bh->b_private; 37 38 BUFFER_TRACE(bh, ""); 39 if (uptodate) 40 set_buffer_uptodate(bh); 41 else 42 clear_buffer_uptodate(bh); 43 if (orig_bh) { 44 clear_and_wake_up_bit(BH_Shadow, &orig_bh->b_state); 45 } 46 unlock_buffer(bh); 47 } 48 49 /* 50 * When an ext4 file is truncated, it is possible that some pages are not 51 * successfully freed, because they are attached to a committing transaction. 52 * After the transaction commits, these pages are left on the LRU, with no 53 * ->mapping, and with attached buffers. These pages are trivially reclaimable 54 * by the VM, but their apparent absence upsets the VM accounting, and it makes 55 * the numbers in /proc/meminfo look odd. 56 * 57 * So here, we have a buffer which has just come off the forget list. Look to 58 * see if we can strip all buffers from the backing page. 59 * 60 * Called under j_list_lock. The caller provided us with a ref against the 61 * buffer, and we drop that here. 62 */ 63 static void release_buffer_page(struct buffer_head *bh) 64 { 65 struct folio *folio; 66 67 if (buffer_dirty(bh)) 68 goto nope; 69 if (atomic_read(&bh->b_count) != 1) 70 goto nope; 71 folio = bh->b_folio; 72 if (folio->mapping) 73 goto nope; 74 75 /* OK, it's a truncated page */ 76 if (!folio_trylock(folio)) 77 goto nope; 78 79 folio_get(folio); 80 __brelse(bh); 81 try_to_free_buffers(folio); 82 folio_unlock(folio); 83 folio_put(folio); 84 return; 85 86 nope: 87 __brelse(bh); 88 } 89 90 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) 91 { 92 struct commit_header *h; 93 __u32 csum; 94 95 if (!jbd2_journal_has_csum_v2or3(j)) 96 return; 97 98 h = (struct commit_header *)(bh->b_data); 99 h->h_chksum_type = 0; 100 h->h_chksum_size = 0; 101 h->h_chksum[0] = 0; 102 csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); 103 h->h_chksum[0] = cpu_to_be32(csum); 104 } 105 106 /* 107 * Done it all: now submit the commit record. We should have 108 * cleaned up our previous buffers by now, so if we are in abort 109 * mode we can now just skip the rest of the journal write 110 * entirely. 111 * 112 * Returns 1 if the journal needs to be aborted or 0 on success 113 */ 114 static int journal_submit_commit_record(journal_t *journal, 115 transaction_t *commit_transaction, 116 struct buffer_head **cbh, 117 __u32 crc32_sum) 118 { 119 struct commit_header *tmp; 120 struct buffer_head *bh; 121 struct timespec64 now; 122 blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS; 123 124 *cbh = NULL; 125 126 if (is_journal_aborted(journal)) 127 return 0; 128 129 bh = jbd2_journal_get_descriptor_buffer(commit_transaction, 130 JBD2_COMMIT_BLOCK); 131 if (!bh) 132 return 1; 133 134 tmp = (struct commit_header *)bh->b_data; 135 ktime_get_coarse_real_ts64(&now); 136 tmp->h_commit_sec = cpu_to_be64(now.tv_sec); 137 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); 138 139 if (jbd2_has_feature_checksum(journal)) { 140 tmp->h_chksum_type = JBD2_CRC32_CHKSUM; 141 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 142 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 143 } 144 jbd2_commit_block_csum_set(journal, bh); 145 146 BUFFER_TRACE(bh, "submit commit block"); 147 lock_buffer(bh); 148 clear_buffer_dirty(bh); 149 set_buffer_uptodate(bh); 150 151 if (journal->j_flags & JBD2_BARRIER && 152 !jbd2_has_feature_async_commit(journal)) 153 write_flags |= REQ_PREFLUSH | REQ_FUA; 154 155 bh_submit(bh, write_flags, journal_end_buffer_io_sync); 156 *cbh = bh; 157 return 0; 158 } 159 160 /* 161 * This function along with journal_submit_commit_record 162 * allows to write the commit record asynchronously. 163 */ 164 static int journal_wait_on_commit_record(journal_t *journal, 165 struct buffer_head *bh) 166 { 167 int ret = 0; 168 169 clear_buffer_dirty(bh); 170 wait_on_buffer(bh); 171 172 if (unlikely(!buffer_uptodate(bh))) 173 ret = -EIO; 174 put_bh(bh); /* One for getblk() */ 175 176 return ret; 177 } 178 179 /* Send all the data buffers related to an inode */ 180 int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) 181 { 182 unsigned long flags; 183 184 if (!jinode) 185 return 0; 186 187 flags = READ_ONCE(jinode->i_flags); 188 if (!(flags & JI_WRITE_DATA)) 189 return 0; 190 191 trace_jbd2_submit_inode_data(jinode->i_vfs_inode); 192 return journal->j_submit_inode_data_buffers(jinode); 193 194 } 195 EXPORT_SYMBOL(jbd2_submit_inode_data); 196 197 int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) 198 { 199 struct address_space *mapping; 200 struct inode *inode; 201 unsigned long flags; 202 loff_t start_byte, end_byte; 203 204 if (!jinode) 205 return 0; 206 207 flags = READ_ONCE(jinode->i_flags); 208 if (!(flags & JI_WAIT_DATA)) 209 return 0; 210 211 inode = jinode->i_vfs_inode; 212 if (!inode) 213 return 0; 214 215 mapping = inode->i_mapping; 216 if (!mapping) 217 return 0; 218 219 if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) 220 return 0; 221 return filemap_fdatawait_range_keep_errors( 222 mapping, start_byte, end_byte); 223 } 224 EXPORT_SYMBOL(jbd2_wait_inode_data); 225 226 /* 227 * Submit all the data buffers of inode associated with the transaction to 228 * disk. 229 * 230 * We are in a committing transaction. Therefore no new inode can be added to 231 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently 232 * operate on from being released while we write out pages. 233 */ 234 static int journal_submit_data_buffers(journal_t *journal, 235 transaction_t *commit_transaction) 236 { 237 struct jbd2_inode *jinode; 238 int err, ret = 0; 239 240 spin_lock(&journal->j_list_lock); 241 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 242 if (!(jinode->i_flags & JI_WRITE_DATA)) 243 continue; 244 WRITE_ONCE(jinode->i_flags, 245 jinode->i_flags | JI_COMMIT_RUNNING); 246 spin_unlock(&journal->j_list_lock); 247 /* submit the inode data buffers. */ 248 trace_jbd2_submit_inode_data(jinode->i_vfs_inode); 249 if (journal->j_submit_inode_data_buffers) { 250 err = journal->j_submit_inode_data_buffers(jinode); 251 if (!ret) 252 ret = err; 253 } 254 spin_lock(&journal->j_list_lock); 255 J_ASSERT(jinode->i_transaction == commit_transaction); 256 WRITE_ONCE(jinode->i_flags, 257 jinode->i_flags & ~JI_COMMIT_RUNNING); 258 smp_mb(); 259 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 260 } 261 spin_unlock(&journal->j_list_lock); 262 return ret; 263 } 264 265 int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) 266 { 267 struct address_space *mapping = jinode->i_vfs_inode->i_mapping; 268 loff_t start_byte, end_byte; 269 270 if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) 271 return 0; 272 273 return filemap_fdatawait_range_keep_errors(mapping, 274 start_byte, end_byte); 275 } 276 277 /* 278 * Wait for data submitted for writeout, refile inodes to proper 279 * transaction if needed. 280 * 281 */ 282 static int journal_finish_inode_data_buffers(journal_t *journal, 283 transaction_t *commit_transaction) 284 { 285 struct jbd2_inode *jinode, *next_i; 286 int err, ret = 0; 287 288 /* For locking, see the comment in journal_submit_data_buffers() */ 289 spin_lock(&journal->j_list_lock); 290 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 291 if (!(jinode->i_flags & JI_WAIT_DATA)) 292 continue; 293 WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING); 294 spin_unlock(&journal->j_list_lock); 295 /* wait for the inode data buffers writeout. */ 296 if (journal->j_finish_inode_data_buffers) { 297 err = journal->j_finish_inode_data_buffers(jinode); 298 if (!ret) 299 ret = err; 300 } 301 cond_resched(); 302 spin_lock(&journal->j_list_lock); 303 WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING); 304 smp_mb(); 305 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 306 } 307 308 /* Now refile inode to proper lists */ 309 list_for_each_entry_safe(jinode, next_i, 310 &commit_transaction->t_inode_list, i_list) { 311 list_del(&jinode->i_list); 312 if (jinode->i_next_transaction) { 313 jinode->i_transaction = jinode->i_next_transaction; 314 jinode->i_next_transaction = NULL; 315 list_add(&jinode->i_list, 316 &jinode->i_transaction->t_inode_list); 317 } else { 318 jinode->i_transaction = NULL; 319 WRITE_ONCE(jinode->i_dirty_start_page, 0); 320 WRITE_ONCE(jinode->i_dirty_end_page, 0); 321 } 322 } 323 spin_unlock(&journal->j_list_lock); 324 325 return ret; 326 } 327 328 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) 329 { 330 char *addr; 331 __u32 checksum; 332 333 addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); 334 checksum = crc32_be(crc32_sum, addr, bh->b_size); 335 kunmap_local(addr); 336 337 return checksum; 338 } 339 340 static void write_tag_block(journal_t *j, journal_block_tag_t *tag, 341 unsigned long long block) 342 { 343 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 344 if (jbd2_has_feature_64bit(j)) 345 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 346 } 347 348 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 349 struct buffer_head *bh, __u32 sequence) 350 { 351 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; 352 __u8 *addr; 353 __u32 csum32; 354 __be32 seq; 355 356 if (!jbd2_journal_has_csum_v2or3(j)) 357 return; 358 359 seq = cpu_to_be32(sequence); 360 addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); 361 csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); 362 csum32 = jbd2_chksum(csum32, addr, bh->b_size); 363 kunmap_local(addr); 364 365 if (jbd2_has_feature_csum3(j)) 366 tag3->t_checksum = cpu_to_be32(csum32); 367 else 368 tag->t_checksum = cpu_to_be16(csum32); 369 } 370 /* 371 * jbd2_journal_commit_transaction 372 * 373 * The primary function for committing a transaction to the log. This 374 * function is called by the journal thread to begin a complete commit. 375 */ 376 void jbd2_journal_commit_transaction(journal_t *journal) 377 { 378 struct transaction_stats_s stats; 379 transaction_t *commit_transaction; 380 struct journal_head *jh; 381 struct buffer_head *descriptor; 382 struct buffer_head **wbuf = journal->j_wbuf; 383 int bufs; 384 int escape; 385 int err; 386 unsigned long long blocknr; 387 ktime_t start_time; 388 u64 commit_time; 389 char *tagp = NULL; 390 journal_block_tag_t *tag = NULL; 391 int space_left = 0; 392 int first_tag = 0; 393 int tag_flag; 394 int i; 395 int tag_bytes = journal_tag_bytes(journal); 396 struct buffer_head *cbh = NULL; /* For transactional checksums */ 397 __u32 crc32_sum = ~0; 398 struct blk_plug plug; 399 /* Tail of the journal */ 400 unsigned long first_block; 401 tid_t first_tid; 402 int update_tail; 403 int csum_size = 0; 404 LIST_HEAD(io_bufs); 405 LIST_HEAD(log_bufs); 406 407 if (jbd2_journal_has_csum_v2or3(journal)) 408 csum_size = sizeof(struct jbd2_journal_block_tail); 409 410 /* 411 * First job: lock down the current transaction and wait for 412 * all outstanding updates to complete. 413 */ 414 415 /* Do we need to erase the effects of a prior jbd2_journal_flush? */ 416 if (journal->j_flags & JBD2_FLUSHED) { 417 jbd2_debug(3, "super block updated\n"); 418 mutex_lock_io(&journal->j_checkpoint_mutex); 419 /* 420 * We hold j_checkpoint_mutex so tail cannot change under us. 421 * We don't need any special data guarantees for writing sb 422 * since journal is empty and it is ok for write to be 423 * flushed only with transaction commit. 424 */ 425 jbd2_journal_update_sb_log_tail(journal, 426 journal->j_tail_sequence, 427 journal->j_tail, 0); 428 mutex_unlock(&journal->j_checkpoint_mutex); 429 } else { 430 jbd2_debug(3, "superblock not updated\n"); 431 } 432 433 J_ASSERT(journal->j_running_transaction != NULL); 434 J_ASSERT(journal->j_committing_transaction == NULL); 435 436 write_lock(&journal->j_state_lock); 437 journal->j_flags |= JBD2_FULL_COMMIT_ONGOING; 438 while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) { 439 DEFINE_WAIT(wait); 440 441 prepare_to_wait(&journal->j_fc_wait, &wait, 442 TASK_UNINTERRUPTIBLE); 443 write_unlock(&journal->j_state_lock); 444 schedule(); 445 write_lock(&journal->j_state_lock); 446 finish_wait(&journal->j_fc_wait, &wait); 447 /* 448 * TODO: by blocking fast commits here, we are increasing 449 * fsync() latency slightly. Strictly speaking, we don't need 450 * to block fast commits until the transaction enters T_FLUSH 451 * state. So an optimization is possible where we block new fast 452 * commits here and wait for existing ones to complete 453 * just before we enter T_FLUSH. That way, the existing fast 454 * commits and this full commit can proceed parallely. 455 */ 456 } 457 write_unlock(&journal->j_state_lock); 458 459 commit_transaction = journal->j_running_transaction; 460 461 trace_jbd2_start_commit(journal, commit_transaction); 462 jbd2_debug(1, "JBD2: starting commit of transaction %d\n", 463 commit_transaction->t_tid); 464 465 write_lock(&journal->j_state_lock); 466 journal->j_fc_off = 0; 467 J_ASSERT(commit_transaction->t_state == T_RUNNING); 468 commit_transaction->t_state = T_LOCKED; 469 470 trace_jbd2_commit_locking(journal, commit_transaction); 471 stats.run.rs_wait = commit_transaction->t_max_wait; 472 stats.run.rs_request_delay = 0; 473 stats.run.rs_locked = jiffies; 474 if (commit_transaction->t_requested) 475 stats.run.rs_request_delay = 476 jbd2_time_diff(commit_transaction->t_requested, 477 stats.run.rs_locked); 478 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 479 stats.run.rs_locked); 480 481 // waits for any t_updates to finish 482 jbd2_journal_wait_updates(journal); 483 484 commit_transaction->t_state = T_SWITCH; 485 486 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= 487 journal->j_max_transaction_buffers); 488 489 /* 490 * First thing we are allowed to do is to discard any remaining 491 * BJ_Reserved buffers. Note, it is _not_ permissible to assume 492 * that there are no such buffers: if a large filesystem 493 * operation like a truncate needs to split itself over multiple 494 * transactions, then it may try to do a jbd2_journal_restart() while 495 * there are still BJ_Reserved buffers outstanding. These must 496 * be released cleanly from the current transaction. 497 * 498 * In this case, the filesystem must still reserve write access 499 * again before modifying the buffer in the new transaction, but 500 * we do not require it to remember exactly which old buffers it 501 * has reserved. This is consistent with the existing behaviour 502 * that multiple jbd2_journal_get_write_access() calls to the same 503 * buffer are perfectly permissible. 504 * We use journal->j_state_lock here to serialize processing of 505 * t_reserved_list with eviction of buffers from journal_unmap_buffer(). 506 */ 507 while (commit_transaction->t_reserved_list) { 508 jh = commit_transaction->t_reserved_list; 509 JBUFFER_TRACE(jh, "reserved, unused: refile"); 510 /* 511 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may 512 * leave undo-committed data. 513 */ 514 if (jh->b_committed_data) { 515 struct buffer_head *bh = jh2bh(jh); 516 517 spin_lock(&jh->b_state_lock); 518 jbd2_free(jh->b_committed_data, bh->b_size); 519 jh->b_committed_data = NULL; 520 spin_unlock(&jh->b_state_lock); 521 } 522 jbd2_journal_refile_buffer(journal, jh); 523 } 524 525 write_unlock(&journal->j_state_lock); 526 /* 527 * Now try to drop any written-back buffers from the journal's 528 * checkpoint lists. We do this *before* commit because it potentially 529 * frees some memory 530 */ 531 spin_lock(&journal->j_list_lock); 532 __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP); 533 spin_unlock(&journal->j_list_lock); 534 535 jbd2_debug(3, "JBD2: commit phase 1\n"); 536 537 /* 538 * Clear revoked flag to reflect there is no revoked buffers 539 * in the next transaction which is going to be started. 540 */ 541 jbd2_clear_buffer_revoked_flags(journal); 542 543 /* 544 * Switch to a new revoke table. 545 */ 546 jbd2_journal_switch_revoke_table(journal); 547 548 write_lock(&journal->j_state_lock); 549 /* 550 * Reserved credits cannot be claimed anymore, free them 551 */ 552 atomic_sub(atomic_read(&journal->j_reserved_credits), 553 &commit_transaction->t_outstanding_credits); 554 555 trace_jbd2_commit_flushing(journal, commit_transaction); 556 stats.run.rs_flushing = jiffies; 557 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, 558 stats.run.rs_flushing); 559 560 commit_transaction->t_state = T_FLUSH; 561 journal->j_committing_transaction = commit_transaction; 562 journal->j_running_transaction = NULL; 563 start_time = ktime_get(); 564 commit_transaction->t_log_start = journal->j_head; 565 wake_up_all(&journal->j_wait_transaction_locked); 566 write_unlock(&journal->j_state_lock); 567 568 jbd2_debug(3, "JBD2: commit phase 2a\n"); 569 570 /* 571 * Now start flushing things to disk, in the order they appear 572 * on the transaction lists. Data blocks go first. 573 */ 574 err = journal_submit_data_buffers(journal, commit_transaction); 575 if (err) 576 jbd2_journal_abort(journal, err); 577 578 blk_start_plug(&plug); 579 jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); 580 581 jbd2_debug(3, "JBD2: commit phase 2b\n"); 582 583 /* 584 * Way to go: we have now written out all of the data for a 585 * transaction! Now comes the tricky part: we need to write out 586 * metadata. Loop over the transaction's entire buffer list: 587 */ 588 write_lock(&journal->j_state_lock); 589 commit_transaction->t_state = T_COMMIT; 590 write_unlock(&journal->j_state_lock); 591 592 trace_jbd2_commit_logging(journal, commit_transaction); 593 stats.run.rs_logging = jiffies; 594 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, 595 stats.run.rs_logging); 596 stats.run.rs_blocks = commit_transaction->t_nr_buffers; 597 stats.run.rs_blocks_logged = 0; 598 599 J_ASSERT(commit_transaction->t_nr_buffers <= 600 atomic_read(&commit_transaction->t_outstanding_credits)); 601 602 bufs = 0; 603 descriptor = NULL; 604 while (commit_transaction->t_buffers) { 605 606 /* Find the next buffer to be journaled... */ 607 608 jh = commit_transaction->t_buffers; 609 610 /* If we're in abort mode, we just un-journal the buffer and 611 release it. */ 612 613 if (is_journal_aborted(journal)) { 614 clear_buffer_jbddirty(jh2bh(jh)); 615 JBUFFER_TRACE(jh, "journal is aborting: refile"); 616 jbd2_buffer_abort_trigger(jh, 617 jh->b_frozen_data ? 618 jh->b_frozen_triggers : 619 jh->b_triggers); 620 jbd2_journal_refile_buffer(journal, jh); 621 /* If that was the last one, we need to clean up 622 * any descriptor buffers which may have been 623 * already allocated, even if we are now 624 * aborting. */ 625 if (!commit_transaction->t_buffers) 626 goto start_journal_io; 627 continue; 628 } 629 630 /* Make sure we have a descriptor block in which to 631 record the metadata buffer. */ 632 633 if (!descriptor) { 634 J_ASSERT (bufs == 0); 635 636 jbd2_debug(4, "JBD2: get descriptor\n"); 637 638 descriptor = jbd2_journal_get_descriptor_buffer( 639 commit_transaction, 640 JBD2_DESCRIPTOR_BLOCK); 641 if (!descriptor) { 642 jbd2_journal_abort(journal, -EIO); 643 continue; 644 } 645 646 jbd2_debug(4, "JBD2: got buffer %llu (%p)\n", 647 (unsigned long long)descriptor->b_blocknr, 648 descriptor->b_data); 649 tagp = &descriptor->b_data[sizeof(journal_header_t)]; 650 space_left = descriptor->b_size - 651 sizeof(journal_header_t); 652 first_tag = 1; 653 set_buffer_jwrite(descriptor); 654 set_buffer_dirty(descriptor); 655 wbuf[bufs++] = descriptor; 656 657 /* Record it so that we can wait for IO 658 completion later */ 659 BUFFER_TRACE(descriptor, "ph3: file as descriptor"); 660 jbd2_file_log_bh(&log_bufs, descriptor); 661 } 662 663 /* Where is the buffer to be written? */ 664 665 err = jbd2_journal_next_log_block(journal, &blocknr); 666 /* If the block mapping failed, just abandon the buffer 667 and repeat this loop: we'll fall into the 668 refile-on-abort condition above. */ 669 if (err) { 670 jbd2_journal_abort(journal, err); 671 continue; 672 } 673 674 /* 675 * start_this_handle() uses t_outstanding_credits to determine 676 * the free space in the log. 677 */ 678 atomic_dec(&commit_transaction->t_outstanding_credits); 679 680 /* Bump b_count to prevent truncate from stumbling over 681 the shadowed buffer! @@@ This can go if we ever get 682 rid of the shadow pairing of buffers. */ 683 atomic_inc(&jh2bh(jh)->b_count); 684 685 /* 686 * Make a temporary IO buffer with which to write it out 687 * (this will requeue the metadata buffer to BJ_Shadow). 688 */ 689 set_bit(BH_JWrite, &jh2bh(jh)->b_state); 690 JBUFFER_TRACE(jh, "ph3: write metadata"); 691 escape = jbd2_journal_write_metadata_buffer(commit_transaction, 692 jh, &wbuf[bufs], blocknr); 693 jbd2_file_log_bh(&io_bufs, wbuf[bufs]); 694 695 /* Record the new block's tag in the current descriptor 696 buffer */ 697 698 tag_flag = 0; 699 if (escape) 700 tag_flag |= JBD2_FLAG_ESCAPE; 701 if (!first_tag) 702 tag_flag |= JBD2_FLAG_SAME_UUID; 703 704 tag = (journal_block_tag_t *) tagp; 705 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); 706 tag->t_flags = cpu_to_be16(tag_flag); 707 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], 708 commit_transaction->t_tid); 709 tagp += tag_bytes; 710 space_left -= tag_bytes; 711 bufs++; 712 713 if (first_tag) { 714 memcpy (tagp, journal->j_uuid, 16); 715 tagp += 16; 716 space_left -= 16; 717 first_tag = 0; 718 } 719 720 /* If there's no more to do, or if the descriptor is full, 721 let the IO rip! */ 722 723 if (bufs == journal->j_wbufsize || 724 commit_transaction->t_buffers == NULL || 725 space_left < tag_bytes + 16 + csum_size) { 726 727 jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs); 728 729 /* Write an end-of-descriptor marker before 730 submitting the IOs. "tag" still points to 731 the last tag we set up. */ 732 733 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); 734 start_journal_io: 735 if (descriptor) 736 jbd2_descriptor_block_csum_set(journal, 737 descriptor); 738 739 for (i = 0; i < bufs; i++) { 740 struct buffer_head *bh = wbuf[i]; 741 742 /* 743 * Compute checksum. 744 */ 745 if (jbd2_has_feature_checksum(journal)) { 746 crc32_sum = 747 jbd2_checksum_data(crc32_sum, bh); 748 } 749 750 lock_buffer(bh); 751 clear_buffer_dirty(bh); 752 set_buffer_uptodate(bh); 753 bh_submit(bh, 754 REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS, 755 journal_end_buffer_io_sync); 756 } 757 cond_resched(); 758 759 /* Force a new descriptor to be generated next 760 time round the loop. */ 761 descriptor = NULL; 762 bufs = 0; 763 } 764 } 765 766 err = journal_finish_inode_data_buffers(journal, commit_transaction); 767 if (err) { 768 printk(KERN_WARNING 769 "JBD2: Detected IO errors %d while flushing file data on %s\n", 770 err, journal->j_devname); 771 err = 0; 772 } 773 774 /* 775 * Get current oldest transaction in the log before we issue flush 776 * to the filesystem device. After the flush we can be sure that 777 * blocks of all older transactions are checkpointed to persistent 778 * storage and we will be safe to update journal start in the 779 * superblock with the numbers we get here. 780 */ 781 update_tail = 782 jbd2_journal_get_log_tail(journal, &first_tid, &first_block); 783 784 write_lock(&journal->j_state_lock); 785 if (update_tail) { 786 long freed = first_block - journal->j_tail; 787 788 if (first_block < journal->j_tail) 789 freed += journal->j_last - journal->j_first; 790 /* Update tail only if we free significant amount of space */ 791 if (freed < journal->j_max_transaction_buffers) 792 update_tail = 0; 793 } 794 J_ASSERT(commit_transaction->t_state == T_COMMIT); 795 commit_transaction->t_state = T_COMMIT_DFLUSH; 796 write_unlock(&journal->j_state_lock); 797 798 /* 799 * If the journal is not located on the file system device, 800 * then we must flush the file system device before we issue 801 * the commit record and update the journal tail sequence. 802 */ 803 if ((commit_transaction->t_need_data_flush || update_tail) && 804 (journal->j_fs_dev != journal->j_dev) && 805 (journal->j_flags & JBD2_BARRIER)) 806 blkdev_issue_flush(journal->j_fs_dev); 807 808 /* Done it all: now write the commit record asynchronously. */ 809 if (jbd2_has_feature_async_commit(journal)) { 810 err = journal_submit_commit_record(journal, commit_transaction, 811 &cbh, crc32_sum); 812 if (err) 813 jbd2_journal_abort(journal, err); 814 } 815 816 blk_finish_plug(&plug); 817 818 /* Lo and behold: we have just managed to send a transaction to 819 the log. Before we can commit it, wait for the IO so far to 820 complete. Control buffers being written are on the 821 transaction's t_log_list queue, and metadata buffers are on 822 the io_bufs list. 823 824 Wait for the buffers in reverse order. That way we are 825 less likely to be woken up until all IOs have completed, and 826 so we incur less scheduling load. 827 */ 828 829 jbd2_debug(3, "JBD2: commit phase 3\n"); 830 831 while (!list_empty(&io_bufs)) { 832 struct buffer_head *bh = list_entry(io_bufs.prev, 833 struct buffer_head, 834 b_assoc_buffers); 835 836 wait_on_buffer(bh); 837 cond_resched(); 838 839 if (unlikely(!buffer_uptodate(bh))) 840 err = -EIO; 841 jbd2_unfile_log_bh(bh); 842 stats.run.rs_blocks_logged++; 843 844 /* 845 * The list contains temporary buffer heads created by 846 * jbd2_journal_write_metadata_buffer(). 847 */ 848 BUFFER_TRACE(bh, "dumping temporary bh"); 849 __brelse(bh); 850 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 851 free_buffer_head(bh); 852 853 /* We also have to refile the corresponding shadowed buffer */ 854 jh = commit_transaction->t_shadow_list->b_tprev; 855 bh = jh2bh(jh); 856 clear_buffer_jwrite(bh); 857 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 858 J_ASSERT_BH(bh, !buffer_shadow(bh)); 859 860 /* The metadata is now released for reuse, but we need 861 to remember it against this transaction so that when 862 we finally commit, we can do any checkpointing 863 required. */ 864 JBUFFER_TRACE(jh, "file as BJ_Forget"); 865 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 866 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 867 __brelse(bh); 868 } 869 870 J_ASSERT (commit_transaction->t_shadow_list == NULL); 871 872 jbd2_debug(3, "JBD2: commit phase 4\n"); 873 874 /* Here we wait for the revoke record and descriptor record buffers */ 875 while (!list_empty(&log_bufs)) { 876 struct buffer_head *bh; 877 878 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers); 879 wait_on_buffer(bh); 880 cond_resched(); 881 882 if (unlikely(!buffer_uptodate(bh))) 883 err = -EIO; 884 885 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 886 clear_buffer_jwrite(bh); 887 jbd2_unfile_log_bh(bh); 888 stats.run.rs_blocks_logged++; 889 __brelse(bh); /* One for getblk */ 890 /* AKPM: bforget here */ 891 } 892 893 if (err) 894 jbd2_journal_abort(journal, err); 895 896 jbd2_debug(3, "JBD2: commit phase 5\n"); 897 write_lock(&journal->j_state_lock); 898 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); 899 commit_transaction->t_state = T_COMMIT_JFLUSH; 900 write_unlock(&journal->j_state_lock); 901 902 if (!jbd2_has_feature_async_commit(journal)) { 903 err = journal_submit_commit_record(journal, commit_transaction, 904 &cbh, crc32_sum); 905 if (err) 906 jbd2_journal_abort(journal, err); 907 } 908 if (cbh) 909 err = journal_wait_on_commit_record(journal, cbh); 910 stats.run.rs_blocks_logged++; 911 if (jbd2_has_feature_async_commit(journal) && 912 journal->j_flags & JBD2_BARRIER) { 913 blkdev_issue_flush(journal->j_dev); 914 } 915 916 if (err) 917 jbd2_journal_abort(journal, err); 918 919 WARN_ON_ONCE( 920 atomic_read(&commit_transaction->t_outstanding_credits) < 0); 921 922 /* 923 * Now disk caches for filesystem device are flushed so we are safe to 924 * erase checkpointed transactions from the log by updating journal 925 * superblock. 926 */ 927 if (update_tail) 928 jbd2_update_log_tail(journal, first_tid, first_block); 929 930 /* End of a transaction! Finally, we can do checkpoint 931 processing: any buffers committed as a result of this 932 transaction can be removed from any checkpoint list it was on 933 before. */ 934 935 jbd2_debug(3, "JBD2: commit phase 6\n"); 936 937 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 938 J_ASSERT(commit_transaction->t_buffers == NULL); 939 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 940 J_ASSERT(commit_transaction->t_shadow_list == NULL); 941 942 restart_loop: 943 /* 944 * As there are other places (journal_unmap_buffer()) adding buffers 945 * to this list we have to be careful and hold the j_list_lock. 946 */ 947 spin_lock(&journal->j_list_lock); 948 while (commit_transaction->t_forget) { 949 transaction_t *cp_transaction; 950 struct buffer_head *bh; 951 int try_to_free = 0; 952 bool drop_ref; 953 954 jh = commit_transaction->t_forget; 955 spin_unlock(&journal->j_list_lock); 956 bh = jh2bh(jh); 957 /* 958 * Get a reference so that bh cannot be freed before we are 959 * done with it. 960 */ 961 get_bh(bh); 962 spin_lock(&jh->b_state_lock); 963 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); 964 965 /* 966 * If there is undo-protected committed data against 967 * this buffer, then we can remove it now. If it is a 968 * buffer needing such protection, the old frozen_data 969 * field now points to a committed version of the 970 * buffer, so rotate that field to the new committed 971 * data. 972 * 973 * Otherwise, we can just throw away the frozen data now. 974 * 975 * We also know that the frozen data has already fired 976 * its triggers if they exist, so we can clear that too. 977 */ 978 if (jh->b_committed_data) { 979 jbd2_free(jh->b_committed_data, bh->b_size); 980 jh->b_committed_data = NULL; 981 if (jh->b_frozen_data) { 982 jh->b_committed_data = jh->b_frozen_data; 983 jh->b_frozen_data = NULL; 984 jh->b_frozen_triggers = NULL; 985 } 986 } else if (jh->b_frozen_data) { 987 jbd2_free(jh->b_frozen_data, bh->b_size); 988 jh->b_frozen_data = NULL; 989 jh->b_frozen_triggers = NULL; 990 } 991 992 spin_lock(&journal->j_list_lock); 993 cp_transaction = jh->b_cp_transaction; 994 if (cp_transaction) { 995 JBUFFER_TRACE(jh, "remove from old cp transaction"); 996 cp_transaction->t_chp_stats.cs_dropped++; 997 __jbd2_journal_remove_checkpoint(jh); 998 } 999 1000 /* Only re-checkpoint the buffer_head if it is marked 1001 * dirty. If the buffer was added to the BJ_Forget list 1002 * by jbd2_journal_forget, it may no longer be dirty and 1003 * there's no point in keeping a checkpoint record for 1004 * it. */ 1005 1006 /* 1007 * A buffer which has been freed while still being journaled 1008 * by a previous transaction, refile the buffer to BJ_Forget of 1009 * the running transaction. If the just committed transaction 1010 * contains "add to orphan" operation, we can completely 1011 * invalidate the buffer now. We are rather through in that 1012 * since the buffer may be still accessible when blocksize < 1013 * pagesize and it is attached to the last partial page. 1014 */ 1015 if (buffer_freed(bh) && !jh->b_next_transaction) { 1016 struct address_space *mapping; 1017 1018 clear_buffer_freed(bh); 1019 clear_buffer_jbddirty(bh); 1020 1021 /* 1022 * Block device buffers need to stay mapped all the 1023 * time, so it is enough to clear buffer_jbddirty and 1024 * buffer_freed bits. For the file mapping buffers (i.e. 1025 * journalled data) we need to unmap buffer and clear 1026 * more bits. We also need to be careful about the check 1027 * because the data page mapping can get cleared under 1028 * our hands. Note that if mapping == NULL, we don't 1029 * need to make buffer unmapped because the page is 1030 * already detached from the mapping and buffers cannot 1031 * get reused. 1032 */ 1033 mapping = READ_ONCE(bh->b_folio->mapping); 1034 if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { 1035 clear_buffer_mapped(bh); 1036 clear_buffer_new(bh); 1037 clear_buffer_req(bh); 1038 bh->b_bdev = NULL; 1039 } 1040 } 1041 1042 if (buffer_jbddirty(bh)) { 1043 JBUFFER_TRACE(jh, "add to new checkpointing trans"); 1044 __jbd2_journal_insert_checkpoint(jh, commit_transaction); 1045 if (is_journal_aborted(journal)) 1046 clear_buffer_jbddirty(bh); 1047 } else { 1048 J_ASSERT_BH(bh, !buffer_dirty(bh)); 1049 /* 1050 * The buffer on BJ_Forget list and not jbddirty means 1051 * it has been freed by this transaction and hence it 1052 * could not have been reallocated until this 1053 * transaction has committed. *BUT* it could be 1054 * reallocated once we have written all the data to 1055 * disk and before we process the buffer on BJ_Forget 1056 * list. 1057 */ 1058 if (!jh->b_next_transaction) 1059 try_to_free = 1; 1060 } 1061 JBUFFER_TRACE(jh, "refile or unfile buffer"); 1062 drop_ref = __jbd2_journal_refile_buffer(jh); 1063 spin_unlock(&jh->b_state_lock); 1064 if (drop_ref) 1065 jbd2_journal_put_journal_head(jh); 1066 if (try_to_free) 1067 release_buffer_page(bh); /* Drops bh reference */ 1068 else 1069 __brelse(bh); 1070 cond_resched_lock(&journal->j_list_lock); 1071 } 1072 spin_unlock(&journal->j_list_lock); 1073 /* 1074 * This is a bit sleazy. We use j_list_lock to protect transition 1075 * of a transaction into T_FINISHED state and calling 1076 * __jbd2_journal_drop_transaction(). Otherwise we could race with 1077 * other checkpointing code processing the transaction... 1078 */ 1079 write_lock(&journal->j_state_lock); 1080 spin_lock(&journal->j_list_lock); 1081 /* 1082 * Now recheck if some buffers did not get attached to the transaction 1083 * while the lock was dropped... 1084 */ 1085 if (commit_transaction->t_forget) { 1086 spin_unlock(&journal->j_list_lock); 1087 write_unlock(&journal->j_state_lock); 1088 goto restart_loop; 1089 } 1090 1091 /* Add the transaction to the checkpoint list 1092 * __journal_remove_checkpoint() can not destroy transaction 1093 * under us because it is not marked as T_FINISHED yet */ 1094 if (journal->j_checkpoint_transactions == NULL) { 1095 journal->j_checkpoint_transactions = commit_transaction; 1096 commit_transaction->t_cpnext = commit_transaction; 1097 commit_transaction->t_cpprev = commit_transaction; 1098 } else { 1099 commit_transaction->t_cpnext = 1100 journal->j_checkpoint_transactions; 1101 commit_transaction->t_cpprev = 1102 commit_transaction->t_cpnext->t_cpprev; 1103 commit_transaction->t_cpnext->t_cpprev = 1104 commit_transaction; 1105 commit_transaction->t_cpprev->t_cpnext = 1106 commit_transaction; 1107 } 1108 spin_unlock(&journal->j_list_lock); 1109 1110 /* Done with this transaction! */ 1111 1112 jbd2_debug(3, "JBD2: commit phase 7\n"); 1113 1114 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); 1115 1116 commit_transaction->t_start = jiffies; 1117 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, 1118 commit_transaction->t_start); 1119 1120 /* 1121 * File the transaction statistics 1122 */ 1123 stats.ts_tid = commit_transaction->t_tid; 1124 stats.run.rs_handle_count = 1125 atomic_read(&commit_transaction->t_handle_count); 1126 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1127 commit_transaction->t_tid, &stats.run); 1128 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0; 1129 1130 commit_transaction->t_state = T_COMMIT_CALLBACK; 1131 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1132 WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid); 1133 journal->j_committing_transaction = NULL; 1134 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); 1135 1136 /* 1137 * weight the commit time higher than the average time so we don't 1138 * react too strongly to vast changes in the commit time 1139 */ 1140 if (likely(journal->j_average_commit_time)) 1141 journal->j_average_commit_time = (commit_time + 1142 journal->j_average_commit_time*3) / 4; 1143 else 1144 journal->j_average_commit_time = commit_time; 1145 1146 write_unlock(&journal->j_state_lock); 1147 1148 if (journal->j_commit_callback) 1149 journal->j_commit_callback(journal, commit_transaction); 1150 if (journal->j_fc_cleanup_callback) 1151 journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); 1152 1153 trace_jbd2_end_commit(journal, commit_transaction); 1154 jbd2_debug(1, "JBD2: commit %d complete, head %d\n", 1155 journal->j_commit_sequence, journal->j_tail_sequence); 1156 1157 write_lock(&journal->j_state_lock); 1158 journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING; 1159 journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; 1160 spin_lock(&journal->j_list_lock); 1161 commit_transaction->t_state = T_FINISHED; 1162 /* Check if the transaction can be dropped now that we are finished */ 1163 if (commit_transaction->t_checkpoint_list == NULL) { 1164 __jbd2_journal_drop_transaction(journal, commit_transaction); 1165 jbd2_journal_free_transaction(commit_transaction); 1166 } 1167 spin_unlock(&journal->j_list_lock); 1168 write_unlock(&journal->j_state_lock); 1169 wake_up(&journal->j_wait_done_commit); 1170 wake_up(&journal->j_fc_wait); 1171 1172 /* 1173 * Calculate overall stats 1174 */ 1175 spin_lock(&journal->j_history_lock); 1176 journal->j_stats.ts_tid++; 1177 journal->j_stats.ts_requested += stats.ts_requested; 1178 journal->j_stats.run.rs_wait += stats.run.rs_wait; 1179 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; 1180 journal->j_stats.run.rs_running += stats.run.rs_running; 1181 journal->j_stats.run.rs_locked += stats.run.rs_locked; 1182 journal->j_stats.run.rs_flushing += stats.run.rs_flushing; 1183 journal->j_stats.run.rs_logging += stats.run.rs_logging; 1184 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; 1185 journal->j_stats.run.rs_blocks += stats.run.rs_blocks; 1186 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; 1187 spin_unlock(&journal->j_history_lock); 1188 } 1189