/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}
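/*
 * For reference, the commit block written by
 * journal_submit_commit_record() below begins with a struct
 * commit_header.  The field list is reproduced here from memory of
 * include/linux/jbd2.h; consult that header for the authoritative
 * definition:
 *
 *	struct commit_header {
 *		__be32		h_magic;
 *		__be32		h_blocktype;
 *		__be32		h_sequence;
 *		unsigned char	h_chksum_type;
 *		unsigned char	h_chksum_size;
 *		unsigned char	h_padding[2];
 *		__be32		h_chksum[JBD2_CHECKSUM_BYTES];
 *		__be64		h_commit_sec;
 *		__be32		h_commit_nsec;
 *	};
 */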
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]   = cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}

/*
 * This function, together with journal_submit_commit_record(), allows
 * the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);		/* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc.  We don't use
 * writepages() because with delayed allocation we may be doing block
 * allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode   = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end   = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}
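/*
 * A note on the writeback_control above (an interpretation of the
 * values chosen, not a guarantee from the writeback core):
 * nr_to_write is set to mapping->nrpages * 2 so writeback does not
 * give up before every dirty page has had a chance to be visited,
 * and range_end is capped at i_size so pages beyond EOF are left
 * alone.
 */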
/*
 * Submit all the data buffers of inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction.  Therefore no new inode can be
 * added to our inode list.  We use the JI_COMMIT_RUNNING flag to protect
 * the inode we currently operate on from being released while we write
 * out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers.  We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc, and we need to
		 * write only already-allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

/*
 * Wait for data submitted for writeout, and refile inodes to the proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				 &jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
			    unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
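/*
 * Worked example for write_tag_block() with 64-bit block numbers
 * enabled (tag_bytes > JBD2_TAG_SIZE32): block 0x123456789 is stored
 * as t_blocknr = 0x23456789 and t_blocknr_high = 0x1.  The split
 * shift ((block >> 31) >> 1) extracts the high half without ever
 * shifting by 32, which would be undefined behaviour if the operand
 * were only 32 bits wide.
 */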
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
		  commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
				TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT(atomic_read(&commit_transaction->t_outstanding_credits) <=
		 journal->j_max_transaction_buffers);
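	/*
	 * From here on, the commit proceeds through the phases traced by
	 * the jbd_debug() calls below; roughly:
	 *
	 *  1:   discard unused reserved buffers, drop written-back
	 *       checkpoint buffers and switch the revoke table;
	 *  2:   submit the data buffers and revoke records, then write the
	 *       transaction's metadata into the log via descriptor blocks;
	 *  3/4: wait for the metadata and control buffer IO to complete;
	 *  5:   write (or, with async commit, wait for) the commit record;
	 *  6:   move committed buffers onto the checkpoint lists;
	 *  7:   update statistics and mark the transaction T_FINISHED.
	 */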
	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer()
		 * pair may leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it
	 * potentially frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked
	 * buffers in the next transaction which is about to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  WRITE_SYNC);
	blk_finish_plug(&plug);

	jbd_debug(3, "JBD2: commit phase 2\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	descriptor = NULL;
	bufs = 0;
	blk_start_plug(&plug);
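	/*
	 * The loop below fills descriptor blocks with one
	 * journal_block_tag_t per metadata block that follows in the log,
	 * with the 16-byte journal UUID spliced in after the first tag.
	 * A rough sketch of a descriptor block, inferred from the tag
	 * bookkeeping below:
	 *
	 *	+---------------------------------------+
	 *	| journal_header_t                      |
	 *	| tag 0 | UUID (16 bytes) | tag 1 | ... |
	 *	+---------------------------------------+
	 *
	 * Up to j_wbufsize buffers are batched in wbuf[] before the IO
	 * is submitted.
	 */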
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT(bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				  (unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
						 BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO. */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
							   jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
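		/*
		 * A note on the tag flags used above: JBD2_FLAG_ESCAPE marks
		 * a block whose first four bytes matched the journal magic
		 * number and were escaped by
		 * jbd2_journal_write_metadata_buffer() (reported via bit 0 of
		 * its return value); JBD2_FLAG_SAME_UUID means the tag reuses
		 * the UUID written after the first tag of this descriptor;
		 * JBD2_FLAG_LAST_TAG, set just before submission below,
		 * terminates the tag list for recovery.
		 */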
		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
			"on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);
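	/*
	 * With JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT, the commit record has
	 * just been submitted without waiting for the preceding journal
	 * IO.  That is tolerable because the commit block carries the
	 * running checksum of the transaction (crc32_sum), which lets
	 * recovery detect a commit block that reached disk ahead of the
	 * blocks it covers.  Without the feature, the commit record is
	 * written only in phase 5, after all the journal IO has completed.
	 */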
	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd_debug(3, "JBD2: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this IO to
		 * complete. The barrier must be here so that changes by
		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT(commit_transaction->t_shadow_list == NULL);
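	/*
	 * At this point every temporary BJ_IO buffer created by
	 * jbd2_journal_write_metadata_buffer() has been freed, and each
	 * BJ_Shadow partner has been refiled to BJ_Forget, which is why
	 * the shadow list must now be empty.
	 */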
	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);
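	/*
	 * The forget list is walked with j_list_lock dropped around the
	 * per-buffer work, so journal_unmap_buffer() can add new buffers
	 * behind our back.  The restart_loop label below exists so we can
	 * re-check t_forget under both locks before declaring the list
	 * drained.
	 */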
restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation has been committed.  That's not only a
		 * performance gain, it also stops aliasing problems if the
		 * buffer is left behind for writeback and gets reallocated
		 * for another use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * A buffer on the BJ_Forget list that is not
			 * jbddirty has been freed by this transaction and
			 * hence could not have been reallocated until this
			 * transaction has committed.  *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on the
			 * BJ_Forget list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */
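	/*
	 * Phase 7 below also folds this commit's duration into
	 * j_average_commit_time with a 3:1 weighting in favour of the
	 * existing average: for example, a 100ms average and a 20ms
	 * commit give (20 + 3 * 100) / 4 = 80ms, so one outlier moves
	 * the average only a quarter of the way.
	 */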
	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the existing average higher than this commit's time so
	 * that we don't react too strongly to vast changes in the commit
	 * time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
	write_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		kfree(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}