/*
 * linux/fs/jbd2/transaction.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Generic filesystem transaction handling code; part of the ext2fs
 * journaling system.
 *
 * This file manages transactions (compound commits managed by the
 * journaling code) and handles (individual atomic operations by the
 * filesystem).
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/backing-dev.h>
#include <linux/module.h>

static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);

/*
 * jbd2_get_transaction: obtain a new transaction_t object.
 *
 * Simply allocate and initialise a new transaction.  Create it in
 * RUNNING state and add it to the current journal (which should not
 * have an existing running transaction: we only make a new transaction
 * once we have started to commit the old one).
 *
 * Preconditions:
 *      The journal MUST be locked.  We don't perform atomic mallocs on the
 *      new transaction and we can't block without protecting against other
 *      processes trying to touch the journal while it is in transition.
 *
 */

static transaction_t *
jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
{
        transaction->t_journal = journal;
        transaction->t_state = T_RUNNING;
        transaction->t_start_time = ktime_get();
        transaction->t_tid = journal->j_transaction_sequence++;
        transaction->t_expires = jiffies + journal->j_commit_interval;
        spin_lock_init(&transaction->t_handle_lock);
        atomic_set(&transaction->t_updates, 0);
        atomic_set(&transaction->t_outstanding_credits, 0);
        atomic_set(&transaction->t_handle_count, 0);
        INIT_LIST_HEAD(&transaction->t_inode_list);
        INIT_LIST_HEAD(&transaction->t_private_list);

        /* Set up the commit timer for the new transaction. */
        journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
        add_timer(&journal->j_commit_timer);

        J_ASSERT(journal->j_running_transaction == NULL);
        journal->j_running_transaction = transaction;
        transaction->t_max_wait = 0;
        transaction->t_start = jiffies;

        return transaction;
}

/*
 * Handle management.
 *
 * A handle_t is an object which represents a single atomic update to a
 * filesystem, and which tracks all of the modifications which form part
 * of that one update.
 */

/*
 * start_this_handle: Given a handle, deal with any locking or stalling
 * needed to make sure that there is enough journal space for the handle
 * to begin.  Attach the handle to a transaction and set up the
 * transaction's buffer credits.
 */

static int start_this_handle(journal_t *journal, handle_t *handle,
                             int gfp_mask)
{
        transaction_t *transaction;
        int needed;
        int nblocks = handle->h_buffer_credits;
        transaction_t *new_transaction = NULL;
        unsigned long ts = jiffies;

        if (nblocks > journal->j_max_transaction_buffers) {
                printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
                       current->comm, nblocks,
                       journal->j_max_transaction_buffers);
                return -ENOSPC;
        }

alloc_transaction:
        if (!journal->j_running_transaction) {
                new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
                if (!new_transaction) {
                        /*
                         * If __GFP_FS is not present, then we may be
                         * being called from inside the fs writeback
                         * layer, so we MUST NOT fail.  Since
                         * __GFP_NOFAIL is going away, we will arrange
                         * to retry the allocation ourselves.
                         */
                        if ((gfp_mask & __GFP_FS) == 0) {
                                congestion_wait(BLK_RW_ASYNC, HZ/50);
                                goto alloc_transaction;
                        }
                        return -ENOMEM;
                }
        }

        jbd_debug(3, "New handle %p going live.\n", handle);

        /*
         * We need to hold j_state_lock until t_updates has been incremented,
         * for proper journal barrier handling
         */
repeat:
        read_lock(&journal->j_state_lock);
        if (is_journal_aborted(journal) ||
            (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
                read_unlock(&journal->j_state_lock);
                kfree(new_transaction);
                return -EROFS;
        }

        /* Wait on the journal's transaction barrier if necessary */
        if (journal->j_barrier_count) {
                read_unlock(&journal->j_state_lock);
                wait_event(journal->j_wait_transaction_locked,
                           journal->j_barrier_count == 0);
                goto repeat;
        }

        if (!journal->j_running_transaction) {
                read_unlock(&journal->j_state_lock);
                if (!new_transaction)
                        goto alloc_transaction;
                write_lock(&journal->j_state_lock);
                if (!journal->j_running_transaction) {
                        jbd2_get_transaction(journal, new_transaction);
                        new_transaction = NULL;
                }
                write_unlock(&journal->j_state_lock);
                goto repeat;
        }

        transaction = journal->j_running_transaction;

        /*
         * If the current transaction is locked down for commit, wait for the
         * lock to be released.
         */
        if (transaction->t_state == T_LOCKED) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_transaction_locked,
                                &wait, TASK_UNINTERRUPTIBLE);
                read_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_wait_transaction_locked, &wait);
                goto repeat;
        }

        /*
         * If there is not enough space left in the log to write all potential
         * buffers requested by this operation, we need to stall pending a log
         * checkpoint to free some more log space.
         */
        needed = atomic_add_return(nblocks,
                                   &transaction->t_outstanding_credits);

        if (needed > journal->j_max_transaction_buffers) {
                /*
                 * If the current transaction is already too large, then start
                 * to commit it: we can then go back and attach this handle to
                 * a new transaction.
                 */
                DEFINE_WAIT(wait);

                jbd_debug(2, "Handle %p starting new commit...\n", handle);
                atomic_sub(nblocks, &transaction->t_outstanding_credits);
                prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
                                TASK_UNINTERRUPTIBLE);
                __jbd2_log_start_commit(journal, transaction->t_tid);
                read_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_wait_transaction_locked, &wait);
                goto repeat;
        }

        /*
         * The commit code assumes that it can get enough log space
         * without forcing a checkpoint.  This is *critical* for
         * correctness: a checkpoint of a buffer which is also
         * associated with a committing transaction creates a deadlock,
         * so commit simply cannot force through checkpoints.
         *
         * We must therefore ensure the necessary space in the journal
         * *before* starting to dirty potentially checkpointed buffers
         * in the new transaction.
         *
         * The worst part is, any transaction currently committing can
         * reduce the free space arbitrarily.  Be careful to account for
         * those buffers when checkpointing.
         */

        /*
         * @@@ AKPM: This seems rather over-defensive.  We're giving commit
         * a _lot_ of headroom: 1/4 of the journal plus the size of
         * the committing transaction.  Really, we only need to give it
         * committing_transaction->t_outstanding_credits plus "enough" for
         * the log control blocks.
         * Also, this test is inconsistent with the matching one in
         * jbd2_journal_extend().
         */
        if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
                jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
                atomic_sub(nblocks, &transaction->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                write_lock(&journal->j_state_lock);
                if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
                        __jbd2_log_wait_for_space(journal);
                write_unlock(&journal->j_state_lock);
                goto repeat;
        }

        /* OK, account for the buffers that this operation expects to
         * use and add the handle to the running transaction.
         *
         * In order for t_max_wait to be reliable, it must be
         * protected by a lock.  But doing so will mean that
         * start_this_handle() can not be run in parallel on SMP
         * systems, which limits our scalability.  So we only enable
         * it when debugging is enabled.  We may want to use a
         * separate flag, eventually, so we can enable this
         * independently of debugging.
         */
#ifdef CONFIG_JBD2_DEBUG
        if (jbd2_journal_enable_debug &&
            time_after(transaction->t_start, ts)) {
                ts = jbd2_time_diff(ts, transaction->t_start);
                spin_lock(&transaction->t_handle_lock);
                if (ts > transaction->t_max_wait)
                        transaction->t_max_wait = ts;
                spin_unlock(&transaction->t_handle_lock);
        }
#endif
        handle->h_transaction = transaction;
        atomic_inc(&transaction->t_updates);
        atomic_inc(&transaction->t_handle_count);
        jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
                  handle, nblocks,
                  atomic_read(&transaction->t_outstanding_credits),
                  __jbd2_log_space_left(journal));
        read_unlock(&journal->j_state_lock);

        lock_map_acquire(&handle->h_lockdep_map);
        kfree(new_transaction);
        return 0;
}

static struct lock_class_key jbd2_handle_key;

/* Allocate a new handle.  This should probably be in a slab...
 */
static handle_t *new_handle(int nblocks)
{
        handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
        if (!handle)
                return NULL;
        memset(handle, 0, sizeof(*handle));
        handle->h_buffer_credits = nblocks;
        handle->h_ref = 1;

        lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
                         &jbd2_handle_key, 0);

        return handle;
}

/**
 * handle_t *jbd2_journal_start() - Obtain a new handle.
 * @journal: Journal to start transaction on.
 * @nblocks: number of block buffers we might modify
 *
 * We make sure that the transaction can guarantee at least nblocks of
 * modified buffers in the log.  We block until the log can guarantee
 * that much space.
 *
 * This function is visible to journal users (like ext3fs), so is not
 * called with the journal already locked.
 *
 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
 * on failure
 */
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
{
        handle_t *handle = journal_current_handle();
        int err;

        if (!journal)
                return ERR_PTR(-EROFS);

        if (handle) {
                J_ASSERT(handle->h_transaction->t_journal == journal);
                handle->h_ref++;
                return handle;
        }

        handle = new_handle(nblocks);
        if (!handle)
                return ERR_PTR(-ENOMEM);

        current->journal_info = handle;

        err = start_this_handle(journal, handle, gfp_mask);
        if (err < 0) {
                jbd2_free_handle(handle);
                current->journal_info = NULL;
                handle = ERR_PTR(err);
                goto out;
        }
out:
        return handle;
}
EXPORT_SYMBOL(jbd2__journal_start);


handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
        return jbd2__journal_start(journal, nblocks, GFP_NOFS);
}
EXPORT_SYMBOL(jbd2_journal_start);
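
/*
 * Illustrative sketch (an assumption for documentation, not code used
 * by this file): the canonical handle lifecycle as a client filesystem
 * drives it.  "journal" and "bh" stand for a loaded journal_t and a
 * caller-owned buffer_head; jbd2_journal_get_write_access() and
 * jbd2_journal_dirty_metadata() are defined further down in this file.
 *
 *      handle_t *handle = jbd2_journal_start(journal, 1);
 *      int err;
 *
 *      if (IS_ERR(handle))
 *              return PTR_ERR(handle);
 *      err = jbd2_journal_get_write_access(handle, bh);
 *      if (!err) {
 *              ... modify bh->b_data here: one buffer, one credit ...
 *              err = jbd2_journal_dirty_metadata(handle, bh);
 *      }
 *      jbd2_journal_stop(handle);
 *      return err;
 *
 * Because a nested jbd2_journal_start() on the same task just bumps
 * h_ref on the handle stashed in current->journal_info, this pattern
 * composes safely across helper functions.
 */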


/**
 * int jbd2_journal_extend() - extend buffer credits.
 * @handle:  handle to 'extend'
 * @nblocks: nr blocks to try to extend by.
 *
 * Some transactions, such as large extends and truncates, can be done
 * atomically all at once or in several stages.  The operation requests
 * a credit for a number of buffer modifications in advance, but can
 * extend its credit if it needs more.
 *
 * jbd2_journal_extend tries to give the running handle more buffer credits.
 * It does not guarantee that allocation - this is a best-effort only.
 * The calling process MUST be able to deal cleanly with a failure to
 * extend here.
 *
 * Return 0 on success, non-zero on failure.
 *
 * return code < 0 implies an error
 * return code > 0 implies normal transaction-full status.
 */
int jbd2_journal_extend(handle_t *handle, int nblocks)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        int result;
        int wanted;

        result = -EIO;
        if (is_handle_aborted(handle))
                goto out;

        result = 1;

        read_lock(&journal->j_state_lock);

        /* Don't extend a locked-down transaction! */
        if (handle->h_transaction->t_state != T_RUNNING) {
                jbd_debug(3, "denied handle %p %d blocks: "
                          "transaction not running\n", handle, nblocks);
                goto error_out;
        }

        spin_lock(&transaction->t_handle_lock);
        wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;

        if (wanted > journal->j_max_transaction_buffers) {
                jbd_debug(3, "denied handle %p %d blocks: "
                          "transaction too large\n", handle, nblocks);
                goto unlock;
        }

        if (wanted > __jbd2_log_space_left(journal)) {
                jbd_debug(3, "denied handle %p %d blocks: "
                          "insufficient log space\n", handle, nblocks);
                goto unlock;
        }

        handle->h_buffer_credits += nblocks;
        atomic_add(nblocks, &transaction->t_outstanding_credits);
        result = 0;

        jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
unlock:
        spin_unlock(&transaction->t_handle_lock);
error_out:
        read_unlock(&journal->j_state_lock);
out:
        return result;
}


/**
 * int jbd2_journal_restart() - restart a handle.
 * @handle:  handle to restart
 * @nblocks: nr credits requested
 *
 * Restart a handle for a multi-transaction filesystem
 * operation.
 *
 * If the jbd2_journal_extend() call above fails to grant new buffer credits
 * to a running handle, a call to jbd2_journal_restart will commit the
 * handle's transaction so far and reattach the handle to a new
 * transaction capable of guaranteeing the requested number of
 * credits.
 */
int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        int ret;

        /* If we've had an abort of any type, don't even think about
         * actually doing the restart! */
        if (is_handle_aborted(handle))
                return 0;

        /*
         * First unlink the handle from its current transaction, and start the
         * commit on that.
         */
        J_ASSERT(atomic_read(&transaction->t_updates) > 0);
        J_ASSERT(journal_current_handle() == handle);

        read_lock(&journal->j_state_lock);
        spin_lock(&transaction->t_handle_lock);
        atomic_sub(handle->h_buffer_credits,
                   &transaction->t_outstanding_credits);
        if (atomic_dec_and_test(&transaction->t_updates))
                wake_up(&journal->j_wait_updates);
        spin_unlock(&transaction->t_handle_lock);

        jbd_debug(2, "restarting handle %p\n", handle);
        __jbd2_log_start_commit(journal, transaction->t_tid);
        read_unlock(&journal->j_state_lock);

        lock_map_release(&handle->h_lockdep_map);
        handle->h_buffer_credits = nblocks;
        ret = start_this_handle(journal, handle, gfp_mask);
        return ret;
}
EXPORT_SYMBOL(jbd2__journal_restart);


int jbd2_journal_restart(handle_t *handle, int nblocks)
{
        return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
}
EXPORT_SYMBOL(jbd2_journal_restart);
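
/*
 * Illustrative sketch (assumed caller code, in the style of ext4's
 * extend-or-restart helpers): how a multi-stage operation such as
 * truncate typically combines the two calls above.  "needed" is a
 * hypothetical credit count computed by the caller.
 *
 *      if (jbd2_journal_extend(handle, needed) != 0) {
 *              err = jbd2_journal_restart(handle, needed);
 *              if (err)
 *                      return err;
 *      }
 *
 * Note that after a successful restart the handle is attached to a
 * *new* transaction: every buffer to be modified from here on needs a
 * fresh jbd2_journal_get_write_access() call, since earlier access
 * grants belonged to the old transaction.
 */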

/**
 * void jbd2_journal_lock_updates () - establish a transaction barrier.
 * @journal:  Journal to establish a barrier on.
 *
 * This locks out any further updates from being started, and blocks
 * until all existing updates have completed, returning only once the
 * journal is in a quiescent state with no updates running.
 *
 * The journal lock should not be held on entry.
 */
void jbd2_journal_lock_updates(journal_t *journal)
{
        DEFINE_WAIT(wait);

        write_lock(&journal->j_state_lock);
        ++journal->j_barrier_count;

        /* Wait until there are no running updates */
        while (1) {
                transaction_t *transaction = journal->j_running_transaction;

                if (!transaction)
                        break;

                spin_lock(&transaction->t_handle_lock);
                if (!atomic_read(&transaction->t_updates)) {
                        spin_unlock(&transaction->t_handle_lock);
                        break;
                }
                prepare_to_wait(&journal->j_wait_updates, &wait,
                                TASK_UNINTERRUPTIBLE);
                spin_unlock(&transaction->t_handle_lock);
                write_unlock(&journal->j_state_lock);
                schedule();
                finish_wait(&journal->j_wait_updates, &wait);
                write_lock(&journal->j_state_lock);
        }
        write_unlock(&journal->j_state_lock);

        /*
         * We have now established a barrier against other normal updates, but
         * we also need to barrier against other jbd2_journal_lock_updates() calls
         * to make sure that we serialise special journal-locked operations
         * too.
         */
        mutex_lock(&journal->j_barrier);
}

/**
 * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
 * @journal:  Journal to release the barrier on.
 *
 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
 *
 * Should be called without the journal lock held.
 */
void jbd2_journal_unlock_updates (journal_t *journal)
{
        J_ASSERT(journal->j_barrier_count != 0);

        mutex_unlock(&journal->j_barrier);
        write_lock(&journal->j_state_lock);
        --journal->j_barrier_count;
        write_unlock(&journal->j_state_lock);
        wake_up(&journal->j_wait_transaction_locked);
}

static void warn_dirty_buffer(struct buffer_head *bh)
{
        char b[BDEVNAME_SIZE];

        printk(KERN_WARNING
               "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
               "There's a risk of filesystem corruption in case of system "
               "crash.\n",
               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
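
/*
 * Illustrative sketch (assumption, modelled on how an ext4-style freeze
 * path uses this barrier): quiescing the journal around an operation
 * that must see no running updates.  The caller must not hold a handle
 * of its own, or it would deadlock waiting for itself.
 *
 *      jbd2_journal_lock_updates(journal);
 *      err = jbd2_journal_flush(journal);
 *      ... journal is now empty and no new updates can start ...
 *      jbd2_journal_unlock_updates(journal);
 */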

/*
 * If the buffer is already part of the current transaction, then there
 * is nothing we need to do.  If it is already part of a prior
 * transaction which we are still committing to disk, then we need to
 * make sure that we do not overwrite the old copy: we do copy-out to
 * preserve the copy going to disk.  We also account the buffer against
 * the handle's metadata buffer credits (unless the buffer is already
 * part of the transaction, that is).
 *
 */
static int
do_get_write_access(handle_t *handle, struct journal_head *jh,
                    int force_copy)
{
        struct buffer_head *bh;
        transaction_t *transaction;
        journal_t *journal;
        int error;
        char *frozen_buffer = NULL;
        int need_copy = 0;

        if (is_handle_aborted(handle))
                return -EROFS;

        transaction = handle->h_transaction;
        journal = transaction->t_journal;

        jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);

        JBUFFER_TRACE(jh, "entry");
repeat:
        bh = jh2bh(jh);

        /* @@@ Need to check for errors here at some point. */

        lock_buffer(bh);
        jbd_lock_bh_state(bh);

        /* We now hold the buffer lock so it is safe to query the buffer
         * state.  Is the buffer dirty?
         *
         * If so, there are two possibilities.  The buffer may be
         * non-journaled, and undergoing a quite legitimate writeback.
         * Otherwise, it is journaled, and we don't expect dirty buffers
         * in that state (the buffers should be marked JBD_Dirty
         * instead.)  So either the IO is being done under our own
         * control and this is a bug, or it's a third party IO such as
         * dump(8) (which may leave the buffer scheduled for read ---
         * ie. locked but not dirty) or tune2fs (which may actually have
         * the buffer dirtied, ugh.)  */

        if (buffer_dirty(bh)) {
                /*
                 * First question: is this buffer already part of the current
                 * transaction or the existing committing transaction?
                 */
                if (jh->b_transaction) {
                        J_ASSERT_JH(jh,
                                jh->b_transaction == transaction ||
                                jh->b_transaction ==
                                        journal->j_committing_transaction);
                        if (jh->b_next_transaction)
                                J_ASSERT_JH(jh, jh->b_next_transaction ==
                                                        transaction);
                        warn_dirty_buffer(bh);
                }
                /*
                 * In any case we need to clean the dirty flag and we must
                 * do it under the buffer lock to be sure we don't race
                 * with running write-out.
                 */
                JBUFFER_TRACE(jh, "Journalling dirty buffer");
                clear_buffer_dirty(bh);
                set_buffer_jbddirty(bh);
        }

        unlock_buffer(bh);

        error = -EROFS;
        if (is_handle_aborted(handle)) {
                jbd_unlock_bh_state(bh);
                goto out;
        }
        error = 0;

        /*
         * The buffer is already part of this transaction if b_transaction or
         * b_next_transaction points to it
         */
        if (jh->b_transaction == transaction ||
            jh->b_next_transaction == transaction)
                goto done;

        /*
         * this is the first time this transaction is touching this buffer,
         * reset the modified flag
         */
        jh->b_modified = 0;

        /*
         * If there is already a copy-out version of this buffer, then we don't
         * need to make another one
         */
        if (jh->b_frozen_data) {
                JBUFFER_TRACE(jh, "has frozen data");
                J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
                jh->b_next_transaction = transaction;
                goto done;
        }

        /* Is there data here we need to preserve? */

        if (jh->b_transaction && jh->b_transaction != transaction) {
                JBUFFER_TRACE(jh, "owned by older transaction");
                J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
                J_ASSERT_JH(jh, jh->b_transaction ==
                                        journal->j_committing_transaction);

                /* There is one case we have to be very careful about.
                 * If the committing transaction is currently writing
                 * this buffer out to disk and has NOT made a copy-out,
                 * then we cannot modify the buffer contents at all
                 * right now.  The essence of copy-out is that it is the
                 * extra copy, not the primary copy, which gets
                 * journaled.  If the primary copy is already going to
                 * disk then we cannot do copy-out here. */

                if (jh->b_jlist == BJ_Shadow) {
                        DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
                        wait_queue_head_t *wqh;

                        wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);

                        JBUFFER_TRACE(jh, "on shadow: sleep");
                        jbd_unlock_bh_state(bh);
                        /* commit wakes up all shadow buffers after IO */
                        for ( ; ; ) {
                                prepare_to_wait(wqh, &wait.wait,
                                                TASK_UNINTERRUPTIBLE);
                                if (jh->b_jlist != BJ_Shadow)
                                        break;
                                schedule();
                        }
                        finish_wait(wqh, &wait.wait);
                        goto repeat;
                }

                /* Only do the copy if the currently-owning transaction
                 * still needs it.  If it is on the Forget list, the
                 * committing transaction is past that stage.  The
                 * buffer had better remain locked during the kmalloc,
                 * but that should be true --- we hold the journal lock
                 * still and the buffer is already on the BUF_JOURNAL
                 * list so won't be flushed.
                 *
                 * Subtle point, though: if this is a get_undo_access,
                 * then we will be relying on the frozen_data to contain
                 * the new value of the committed_data record after the
                 * transaction, so we HAVE to force the frozen_data copy
                 * in that case. */

                if (jh->b_jlist != BJ_Forget || force_copy) {
                        JBUFFER_TRACE(jh, "generate frozen data");
                        if (!frozen_buffer) {
                                JBUFFER_TRACE(jh, "allocate memory for buffer");
                                jbd_unlock_bh_state(bh);
                                frozen_buffer =
                                        jbd2_alloc(jh2bh(jh)->b_size,
                                                   GFP_NOFS);
                                if (!frozen_buffer) {
                                        printk(KERN_EMERG
                                               "%s: OOM for frozen_buffer\n",
                                               __func__);
                                        JBUFFER_TRACE(jh, "oom!");
                                        error = -ENOMEM;
                                        jbd_lock_bh_state(bh);
                                        goto done;
                                }
                                goto repeat;
                        }
                        jh->b_frozen_data = frozen_buffer;
                        frozen_buffer = NULL;
                        need_copy = 1;
                }
                jh->b_next_transaction = transaction;
        }


        /*
         * Finally, if the buffer is not journaled right now, we need to make
         * sure it doesn't get written to disk before the caller actually
         * commits the new data
         */
        if (!jh->b_transaction) {
                JBUFFER_TRACE(jh, "no transaction");
                J_ASSERT_JH(jh, !jh->b_next_transaction);
                jh->b_transaction = transaction;
                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                spin_lock(&journal->j_list_lock);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
                spin_unlock(&journal->j_list_lock);
        }

done:
        if (need_copy) {
                struct page *page;
                int offset;
                char *source;

                J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
                            "Possible IO failure.\n");
                page = jh2bh(jh)->b_page;
                offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
                source = kmap_atomic(page, KM_USER0);
                /* Fire data frozen trigger just before we copy the data */
                jbd2_buffer_frozen_trigger(jh, source + offset,
                                           jh->b_triggers);
                memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
                kunmap_atomic(source, KM_USER0);

                /*
                 * Now that the frozen data is saved off, we need to store
                 * any matching triggers.
                 */
                jh->b_frozen_triggers = jh->b_triggers;
        }
        jbd_unlock_bh_state(bh);

        /*
         * If we are about to journal a buffer, then any revoke pending on it is
         * no longer valid
         */
        jbd2_journal_cancel_revoke(handle, jh);

out:
        if (unlikely(frozen_buffer))    /* It's usually NULL */
                jbd2_free(frozen_buffer, bh->b_size);

        JBUFFER_TRACE(jh, "exit");
        return error;
}
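
/*
 * Quick orientation (a summary added for documentation; the exact
 * calling conventions belong to the client filesystems, not to this
 * file): which access call pairs with which kind of modification.
 *
 *      existing metadata read from disk:
 *              jbd2_journal_get_write_access(handle, bh);
 *      freshly allocated block, about to be filled from scratch:
 *              jbd2_journal_get_create_access(handle, bh);
 *      bitmap-style block whose committed contents must stay
 *      visible until a deallocation commits:
 *              jbd2_journal_get_undo_access(handle, bh);
 *
 * Write and undo access both funnel through do_get_write_access()
 * above; create access (below) files the brand-new buffer directly.
 * In every case the modification must be followed by
 * jbd2_journal_dirty_metadata() on the same handle.
 */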

/**
 * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
 * @handle: transaction to add buffer modifications to
 * @bh:     bh to be used for metadata writes
 *
 * Returns an error code or 0 on success.
 *
 * In full data journalling mode the buffer may be of type BJ_AsyncData,
 * because we're write()ing a buffer which is also part of a shared mapping.
 */

int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
        struct journal_head *jh = jbd2_journal_add_journal_head(bh);
        int rc;

        /* We do not want to get caught playing with fields which the
         * log thread also manipulates.  Make sure that the buffer
         * completes any outstanding IO before proceeding. */
        rc = do_get_write_access(handle, jh, 0);
        jbd2_journal_put_journal_head(jh);
        return rc;
}


/*
 * When the user wants to journal a newly created buffer_head
 * (ie. getblk() returned a new buffer and we are going to populate it
 * manually rather than reading off disk), then we need to keep the
 * buffer_head locked until it has been completely filled with new
 * data.  In this case, we should be able to make the assertion that
 * the bh is not already part of an existing transaction.
 *
 * The buffer should already be locked by the caller by this point.
 * There is no lock ranking violation: it was a newly created,
 * unlocked buffer beforehand. */

/**
 * int jbd2_journal_get_create_access () - notify intent to use newly created bh
 * @handle: transaction to add the new buffer to
 * @bh: new buffer.
 *
 * Call this if you create a new bh.
 */
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        struct journal_head *jh = jbd2_journal_add_journal_head(bh);
        int err;

        jbd_debug(5, "journal_head %p\n", jh);
        err = -EROFS;
        if (is_handle_aborted(handle))
                goto out;
        err = 0;

        JBUFFER_TRACE(jh, "entry");
        /*
         * The buffer may already belong to this transaction due to pre-zeroing
         * in the filesystem's new_block code.  It may also be on the previous,
         * committing transaction's lists, but it HAS to be in Forget state in
         * that case: the transaction must have deleted the buffer for it to be
         * reused here.
         */
        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);
        J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
                jh->b_transaction == NULL ||
                (jh->b_transaction == journal->j_committing_transaction &&
                          jh->b_jlist == BJ_Forget)));

        J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
        J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));

        if (jh->b_transaction == NULL) {
                /*
                 * Previous jbd2_journal_forget() could have left the buffer
                 * with jbddirty bit set because it was being committed. When
                 * the commit finished, we've filed the buffer for
                 * checkpointing and marked it dirty. Now we are reallocating
                 * the buffer so the transaction freeing it must have
                 * committed and so it's safe to clear the dirty bit.
                 */
                clear_buffer_dirty(jh2bh(jh));
                jh->b_transaction = transaction;

                /* first access by this transaction */
                jh->b_modified = 0;

                JBUFFER_TRACE(jh, "file as BJ_Reserved");
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
        } else if (jh->b_transaction == journal->j_committing_transaction) {
                /* first access by this transaction */
                jh->b_modified = 0;

                JBUFFER_TRACE(jh, "set next transaction");
                jh->b_next_transaction = transaction;
        }
        spin_unlock(&journal->j_list_lock);
        jbd_unlock_bh_state(bh);

        /*
         * akpm: I added this.  ext3_alloc_branch can pick up new indirect
         * blocks which contain freed but then revoked metadata.  We need
         * to cancel the revoke in case we end up freeing it yet again
         * and then reallocating it as data - this would cause a second
         * revoke, which hits an assertion error.
         */
        JBUFFER_TRACE(jh, "cancelling revoke");
        jbd2_journal_cancel_revoke(handle, jh);
        jbd2_journal_put_journal_head(jh);
out:
        return err;
}

/**
 * int jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
 *     non-rewindable consequences
 * @handle: transaction
 * @bh: buffer to undo
 *
 * Sometimes there is a need to distinguish between metadata which has
 * been committed to disk and that which has not.  The ext3fs code uses
 * this for freeing and allocating space, we have to make sure that we
 * do not reuse freed space until the deallocation has been committed,
 * since if we overwrote that space we would make the delete
 * un-rewindable in case of a crash.
 *
 * To deal with that, jbd2_journal_get_undo_access requests write access to a
 * buffer for parts of non-rewindable operations such as delete
 * operations on the bitmaps.  The journaling code must keep a copy of
 * the buffer's contents prior to the undo_access call until such time
 * as we know that the buffer has definitely been committed to disk.
 *
 * We never need to know which transaction the committed data is part
 * of, buffers touched here are guaranteed to be dirtied later and so
 * will be committed to a new transaction in due course, at which point
 * we can discard the old committed data pointer.
 *
 * Returns error number or 0 on success.
 */
int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
        int err;
        struct journal_head *jh = jbd2_journal_add_journal_head(bh);
        char *committed_data = NULL;

        JBUFFER_TRACE(jh, "entry");

        /*
         * Do this first --- it can drop the journal lock, so we want to
         * make sure that obtaining the committed_data is done
         * atomically wrt. completion of any outstanding commits.
         */
        err = do_get_write_access(handle, jh, 1);
        if (err)
                goto out;

repeat:
        if (!jh->b_committed_data) {
                committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
                if (!committed_data) {
                        printk(KERN_EMERG "%s: No memory for committed data\n",
                               __func__);
                        err = -ENOMEM;
                        goto out;
                }
        }

        jbd_lock_bh_state(bh);
        if (!jh->b_committed_data) {
                /* Copy out the current buffer contents into the
                 * preserved, committed copy. */
                JBUFFER_TRACE(jh, "generate b_committed data");
                if (!committed_data) {
                        jbd_unlock_bh_state(bh);
                        goto repeat;
                }

                jh->b_committed_data = committed_data;
                committed_data = NULL;
                memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
        }
        jbd_unlock_bh_state(bh);
out:
        jbd2_journal_put_journal_head(jh);
        if (unlikely(committed_data))
                jbd2_free(committed_data, bh->b_size);
        return err;
}
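
/*
 * Illustrative sketch (assumed caller code): undo access as an
 * ext4-style block allocator would use it when freeing a bit in a
 * block bitmap.  "bitmap_bh" is hypothetical; the committed_data copy
 * taken above is what keeps the bit treated as busy until the delete
 * commits.
 *
 *      err = jbd2_journal_get_undo_access(handle, bitmap_bh);
 *      if (err)
 *              return err;
 *      ... clear the bit in bitmap_bh->b_data ...
 *      err = jbd2_journal_dirty_metadata(handle, bitmap_bh);
 */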

/**
 * void jbd2_journal_set_triggers() - Add triggers for commit writeout
 * @bh: buffer to trigger on
 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
 *
 * Set any triggers on this journal_head.  This is always safe, because
 * triggers for a committing buffer will be saved off, and triggers for
 * a running transaction will match the buffer in that transaction.
 *
 * Call with NULL to clear the triggers.
 */
void jbd2_journal_set_triggers(struct buffer_head *bh,
                               struct jbd2_buffer_trigger_type *type)
{
        struct journal_head *jh = bh2jh(bh);

        jh->b_triggers = type;
}

void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
                                struct jbd2_buffer_trigger_type *triggers)
{
        struct buffer_head *bh = jh2bh(jh);

        if (!triggers || !triggers->t_frozen)
                return;

        triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
}

void jbd2_buffer_abort_trigger(struct journal_head *jh,
                               struct jbd2_buffer_trigger_type *triggers)
{
        if (!triggers || !triggers->t_abort)
                return;

        triggers->t_abort(triggers, jh2bh(jh));
}
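
/*
 * Illustrative sketch (assumed client code; the "demo_*" names are made
 * up): registering a frozen trigger, in the style of a filesystem that
 * checksums each metadata block just before it is written to the log.
 * The t_frozen callback sees exactly the bytes that will hit the log,
 * i.e. the frozen copy when one exists.
 *
 *      static void demo_frozen(struct jbd2_buffer_trigger_type *type,
 *                              struct buffer_head *bh, void *mapped_data,
 *                              size_t size)
 *      {
 *              ... compute a checksum over mapped_data[0..size) and
 *                  stash it inside the block itself ...
 *      }
 *
 *      static struct jbd2_buffer_trigger_type demo_triggers = {
 *              .t_frozen = demo_frozen,
 *      };
 *
 *      jbd2_journal_set_triggers(bh, &demo_triggers);
 */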



/**
 * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
 * @handle: transaction to add buffer to.
 * @bh: buffer to mark
 *
 * mark dirty metadata which needs to be journaled as part of the current
 * transaction.
 *
 * The buffer is placed on the transaction's metadata list and is marked
 * as belonging to the transaction.
 *
 * Returns error number or 0 on success.
 *
 * Special care needs to be taken if the buffer already belongs to the
 * current committing transaction (in which case we should have frozen
 * data present for that commit).  In that case, we don't relink the
 * buffer: that only gets done when the old transaction finally
 * completes its commit.
 */
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        struct journal_head *jh = bh2jh(bh);

        jbd_debug(5, "journal_head %p\n", jh);
        JBUFFER_TRACE(jh, "entry");
        if (is_handle_aborted(handle))
                goto out;

        jbd_lock_bh_state(bh);

        if (jh->b_modified == 0) {
                /*
                 * This buffer's got modified and becoming part
                 * of the transaction. This needs to be done
                 * once per transaction -bzzz
                 */
                jh->b_modified = 1;
                J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
                handle->h_buffer_credits--;
        }

        /*
         * fastpath, to avoid expensive locking.  If this buffer is already
         * on the running transaction's metadata list there is nothing to do.
         * Nobody can take it off again because there is a handle open.
         * I _think_ we're OK here with SMP barriers - a mistaken decision will
         * result in this test being false, so we go in and take the locks.
         */
        if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
                JBUFFER_TRACE(jh, "fastpath");
                J_ASSERT_JH(jh, jh->b_transaction ==
                                        journal->j_running_transaction);
                goto out_unlock_bh;
        }

        set_buffer_jbddirty(bh);

        /*
         * Metadata already on the current transaction list doesn't
         * need to be filed.  Metadata on another transaction's list must
         * be committing, and will be refiled once the commit completes:
         * leave it alone for now.
         */
        if (jh->b_transaction != transaction) {
                JBUFFER_TRACE(jh, "already on other transaction");
                J_ASSERT_JH(jh, jh->b_transaction ==
                                        journal->j_committing_transaction);
                J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
                /* And this case is illegal: we can't reuse another
                 * transaction's data buffer, ever. */
                goto out_unlock_bh;
        }

        /* That test should have eliminated the following case: */
        J_ASSERT_JH(jh, jh->b_frozen_data == NULL);

        JBUFFER_TRACE(jh, "file as BJ_Metadata");
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
        spin_unlock(&journal->j_list_lock);
out_unlock_bh:
        jbd_unlock_bh_state(bh);
out:
        JBUFFER_TRACE(jh, "exit");
        return 0;
}

/*
 * jbd2_journal_release_buffer: undo a get_write_access without any buffer
 * updates, if the update decided in the end that it didn't need access.
 *
 */
void
jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
{
        BUFFER_TRACE(bh, "entry");
}

/**
 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
 * @handle: transaction handle
 * @bh:     bh to 'forget'
 *
 * We can only do the bforget if there are no commits pending against the
 * buffer.  If the buffer is dirty in the current running transaction we
 * can safely unlink it.
 *
 * bh may not be a journalled buffer at all - it may be a non-JBD
 * buffer which came off the hashtable.  Check for this.
 *
 * Decrements bh->b_count by one.
 *
 * Allow this call even if the handle has aborted --- it may be part of
 * the caller's cleanup after an abort.
 */
int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        struct journal_head *jh;
        int drop_reserve = 0;
        int err = 0;
        int was_modified = 0;

        BUFFER_TRACE(bh, "entry");

        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);

        if (!buffer_jbd(bh))
                goto not_jbd;
        jh = bh2jh(bh);

        /* Critical error: attempting to delete a bitmap buffer, maybe?
         * Don't do any jbd operations, and return an error. */
        if (!J_EXPECT_JH(jh, !jh->b_committed_data,
                         "inconsistent data on disk")) {
                err = -EIO;
                goto not_jbd;
        }

        /* keep track of whether or not this transaction modified us */
        was_modified = jh->b_modified;

        /*
         * The buffer's going from the transaction, we must drop
         * all references -bzzz
         */
        jh->b_modified = 0;

        if (jh->b_transaction == handle->h_transaction) {
                J_ASSERT_JH(jh, !jh->b_frozen_data);

                /* If we are forgetting a buffer which is already part
                 * of this transaction, then we can just drop it from
                 * the transaction immediately. */
                clear_buffer_dirty(bh);
                clear_buffer_jbddirty(bh);

                JBUFFER_TRACE(jh, "belongs to current transaction: unfile");

                /*
                 * we only want to drop a reference if this transaction
                 * modified the buffer
                 */
                if (was_modified)
                        drop_reserve = 1;

                /*
                 * We are no longer going to journal this buffer.
                 * However, the commit of this transaction is still
                 * important to the buffer: the delete that we are now
                 * processing might obsolete an old log entry, so by
                 * committing, we can satisfy the buffer's checkpoint.
                 *
                 * So, if we have a checkpoint on the buffer, we should
                 * now refile the buffer on our BJ_Forget list so that
                 * we know to remove the checkpoint after we commit.
                 */

                if (jh->b_cp_transaction) {
                        __jbd2_journal_temp_unlink_buffer(jh);
                        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
                        __jbd2_journal_unfile_buffer(jh);
                        jbd2_journal_remove_journal_head(bh);
                        __brelse(bh);
                        if (!buffer_jbd(bh)) {
                                spin_unlock(&journal->j_list_lock);
                                jbd_unlock_bh_state(bh);
                                __bforget(bh);
                                goto drop;
                        }
                }
        } else if (jh->b_transaction) {
                J_ASSERT_JH(jh, (jh->b_transaction ==
                                 journal->j_committing_transaction));
                /* However, if the buffer is still owned by a prior
                 * (committing) transaction, we can't drop it yet... */
                JBUFFER_TRACE(jh, "belongs to older transaction");
                /* ... but we CAN drop it from the new transaction if we
                 * have also modified it since the original commit. */

                if (jh->b_next_transaction) {
                        J_ASSERT(jh->b_next_transaction == transaction);
                        jh->b_next_transaction = NULL;

                        /*
                         * only drop a reference if this transaction modified
                         * the buffer
                         */
                        if (was_modified)
                                drop_reserve = 1;
                }
        }

not_jbd:
        spin_unlock(&journal->j_list_lock);
        jbd_unlock_bh_state(bh);
        __brelse(bh);
drop:
        if (drop_reserve) {
                /* no need to reserve log space for this block -bzzz */
                handle->h_buffer_credits++;
        }
        return err;
}
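
/*
 * Illustrative sketch (assumed caller code): the typical forget pattern
 * when a filesystem frees a metadata block that the running transaction
 * may itself have journaled.  Remember that jbd2_journal_forget()
 * consumes the bh reference (bforget semantics).
 *
 *      bh = sb_find_get_block(sb, blocknr);
 *      if (bh) {
 *              err = jbd2_journal_forget(handle, bh);
 *              ... do not brelse(bh); the reference is gone ...
 *      }
 */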

/**
 * int jbd2_journal_stop() - complete a transaction
 * @handle: transaction to complete.
 *
 * All done for a particular handle.
 *
 * There is not much action needed here.  We just return any remaining
 * buffer credits to the transaction and remove the handle.  The only
 * complication is that we need to start a commit operation if the
 * filesystem is marked for synchronous update.
 *
 * jbd2_journal_stop itself will not usually return an error, but it may
 * do so in unusual circumstances.  In particular, expect it to
 * return -EIO if a jbd2_journal_abort has been executed since the
 * transaction began.
 */
int jbd2_journal_stop(handle_t *handle)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        int err, wait_for_commit = 0;
        tid_t tid;
        pid_t pid;

        J_ASSERT(journal_current_handle() == handle);

        if (is_handle_aborted(handle))
                err = -EIO;
        else {
                J_ASSERT(atomic_read(&transaction->t_updates) > 0);
                err = 0;
        }

        if (--handle->h_ref > 0) {
                jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
                          handle->h_ref);
                return err;
        }

        jbd_debug(4, "Handle %p going down\n", handle);

        /*
         * Implement synchronous transaction batching.  If the handle
         * was synchronous, don't force a commit immediately.  Let's
         * yield and let another thread piggyback onto this
         * transaction.  Keep doing that while new threads continue to
         * arrive.  It doesn't cost much - we're about to run a commit
         * and sleep on IO anyway.  Speeds up many-threaded, many-dir
         * operations by 30x or more...
         *
         * We try and optimize the sleep time against what the
         * underlying disk can do, instead of having a static sleep
         * time.  This is useful for the case where our storage is so
         * fast that it is more optimal to go ahead and force a flush
         * and wait for the transaction to be committed than it is to
         * wait for an arbitrary amount of time for new writers to
         * join the transaction.  We achieve this by measuring how
         * long it takes to commit a transaction, and compare it with
         * how long this transaction has been running, and if run time
         * < commit time then we sleep for the delta and commit.  This
         * greatly helps super fast disks that would see slowdowns as
         * more threads started doing fsyncs.
         *
         * But don't do this if this process was the most recent one
         * to perform a synchronous write.  We do this to detect the
         * case where a single process is doing a stream of sync
         * writes.  No point in waiting for joiners in that case.
         */
        pid = current->pid;
        if (handle->h_sync && journal->j_last_sync_writer != pid) {
                u64 commit_time, trans_time;

                journal->j_last_sync_writer = pid;

                read_lock(&journal->j_state_lock);
                commit_time = journal->j_average_commit_time;
                read_unlock(&journal->j_state_lock);

                trans_time = ktime_to_ns(ktime_sub(ktime_get(),
                                                   transaction->t_start_time));

                commit_time = max_t(u64, commit_time,
                                    1000*journal->j_min_batch_time);
                commit_time = min_t(u64, commit_time,
                                    1000*journal->j_max_batch_time);

                if (trans_time < commit_time) {
                        ktime_t expires = ktime_add_ns(ktime_get(),
                                                       commit_time);
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
                }
        }

        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;
        current->journal_info = NULL;
        atomic_sub(handle->h_buffer_credits,
                   &transaction->t_outstanding_credits);

        /*
         * If the handle is marked SYNC, we need to set another commit
         * going!  We also want to force a commit if the current
         * transaction is occupying too much of the log, or if the
         * transaction is too old now.
         */
        if (handle->h_sync ||
            (atomic_read(&transaction->t_outstanding_credits) >
             journal->j_max_transaction_buffers) ||
            time_after_eq(jiffies, transaction->t_expires)) {
                /* Do this even for aborted journals: an abort still
                 * completes the commit thread, it just doesn't write
                 * anything to disk. */

                jbd_debug(2, "transaction too old, requesting commit for "
                             "handle %p\n", handle);
                /* This is non-blocking */
                jbd2_log_start_commit(journal, transaction->t_tid);

                /*
                 * Special case: JBD2_SYNC synchronous updates require us
                 * to wait for the commit to complete.
                 */
                if (handle->h_sync && !(current->flags & PF_MEMALLOC))
                        wait_for_commit = 1;
        }

        /*
         * Once we drop t_updates, if it goes to zero the transaction
         * could start committing on us and eventually disappear.  So
         * once we do this, we must not dereference transaction
         * pointer again.
         */
        tid = transaction->t_tid;
        if (atomic_dec_and_test(&transaction->t_updates)) {
                wake_up(&journal->j_wait_updates);
                if (journal->j_barrier_count)
                        wake_up(&journal->j_wait_transaction_locked);
        }

        if (wait_for_commit)
                err = jbd2_log_wait_commit(journal, tid);

        lock_map_release(&handle->h_lockdep_map);

        jbd2_free_handle(handle);
        return err;
}

/**
 * int jbd2_journal_force_commit() - force any uncommitted transactions
 * @journal: journal to force
 *
 * For synchronous operations: force any uncommitted transactions
 * to disk.  May seem kludgy, but it reuses all the handle batching
 * code in a very simple manner.
 */
int jbd2_journal_force_commit(journal_t *journal)
{
        handle_t *handle;
        int ret;

        handle = jbd2_journal_start(journal, 1);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
        } else {
                handle->h_sync = 1;
                ret = jbd2_journal_stop(handle);
        }
        return ret;
}

/*
 *
 * List management code snippets: various functions for manipulating the
 * transaction buffer lists.
 *
 */

/*
 * Append a buffer to a transaction list, given the transaction's list head
 * pointer.
 *
 * j_list_lock is held.
 *
 * jbd_lock_bh_state(jh2bh(jh)) is held.
 */

static inline void
__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
{
        if (!*list) {
                jh->b_tnext = jh->b_tprev = jh;
                *list = jh;
        } else {
                /* Insert at the tail of the list to preserve order */
                struct journal_head *first = *list, *last = first->b_tprev;
                jh->b_tprev = last;
                jh->b_tnext = first;
                last->b_tnext = first->b_tprev = jh;
        }
}

/*
 * Remove a buffer from a transaction list, given the transaction's list
 * head pointer.
 *
 * Called with j_list_lock held, and the journal may not be locked.
 *
 * jbd_lock_bh_state(jh2bh(jh)) is held.
 */

static inline void
__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
{
        if (*list == jh) {
                *list = jh->b_tnext;
                if (*list == jh)
                        *list = NULL;
        }
        jh->b_tprev->b_tnext = jh->b_tnext;
        jh->b_tnext->b_tprev = jh->b_tprev;
}
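
/*
 * Worked example (derived from the two helpers above): filing buffers
 * A, B, C into an empty list with __blist_add_buffer() builds the ring
 *
 *      *list --> A <-> B <-> C
 *                ^___________^   (C->b_tnext == A, A->b_tprev == C)
 *
 * so commit traverses buffers in filing order.  __blist_del_buffer(list, A)
 * first advances *list to A->b_tnext (B) so the head stays valid, and
 * removing the last remaining element leaves *list == NULL again.
 */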

/*
 * Remove a buffer from the appropriate transaction list.
 *
 * Note that this function can *change* the value of
 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
 * t_log_list or t_reserved_list.  If the caller is holding onto a copy of one
 * of these pointers, it could go bad.  Generally the caller needs to re-read
 * the pointer from the transaction_t.
 *
 * Called under j_list_lock.  The journal may not be locked.
 */
void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
{
        struct journal_head **list = NULL;
        transaction_t *transaction;
        struct buffer_head *bh = jh2bh(jh);

        J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
        transaction = jh->b_transaction;
        if (transaction)
                assert_spin_locked(&transaction->t_journal->j_list_lock);

        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
        if (jh->b_jlist != BJ_None)
                J_ASSERT_JH(jh, transaction != NULL);

        switch (jh->b_jlist) {
        case BJ_None:
                return;
        case BJ_Metadata:
                transaction->t_nr_buffers--;
                J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
                list = &transaction->t_buffers;
                break;
        case BJ_Forget:
                list = &transaction->t_forget;
                break;
        case BJ_IO:
                list = &transaction->t_iobuf_list;
                break;
        case BJ_Shadow:
                list = &transaction->t_shadow_list;
                break;
        case BJ_LogCtl:
                list = &transaction->t_log_list;
                break;
        case BJ_Reserved:
                list = &transaction->t_reserved_list;
                break;
        }

        __blist_del_buffer(list, jh);
        jh->b_jlist = BJ_None;
        if (test_clear_buffer_jbddirty(bh))
                mark_buffer_dirty(bh);  /* Expose it to the VM */
}

void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
        __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = NULL;
}

void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
{
        jbd_lock_bh_state(jh2bh(jh));
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_unfile_buffer(jh);
        spin_unlock(&journal->j_list_lock);
        jbd_unlock_bh_state(jh2bh(jh));
}

/*
 * Called from jbd2_journal_try_to_free_buffers().
 *
 * Called under jbd_lock_bh_state(bh)
 */
static void
__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
{
        struct journal_head *jh;

        jh = bh2jh(bh);

        if (buffer_locked(bh) || buffer_dirty(bh))
                goto out;

        if (jh->b_next_transaction != NULL)
                goto out;

        spin_lock(&journal->j_list_lock);
        if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
                /* written-back checkpointed metadata buffer */
                if (jh->b_jlist == BJ_None) {
                        JBUFFER_TRACE(jh, "remove from checkpoint list");
                        __jbd2_journal_remove_checkpoint(jh);
                        jbd2_journal_remove_journal_head(bh);
                        __brelse(bh);
                }
        }
        spin_unlock(&journal->j_list_lock);
out:
        return;
}

/**
 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
 * @journal: journal for operation
 * @page: to try and free
 * @gfp_mask: we use the mask to detect how hard should we try to release
 * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
 * release the buffers.
 *
 *
 * For all the buffers on this page,
 * if they are fully written out ordered data, move them onto BUF_CLEAN
 * so try_to_free_buffers() can reap them.
 *
 * This function returns non-zero if we wish try_to_free_buffers()
 * to be called. We do this if the page is releasable by try_to_free_buffers().
 * We also do it if the page has locked or dirty buffers and the caller wants
 * us to perform sync or async writeout.
 *
 * This complicates JBD locking somewhat.  We aren't protected by the
 * BKL here.  We wish to remove the buffer from its committing or
 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
 *
 * This may *change* the value of transaction_t->t_datalist, so anyone
 * who looks at t_datalist needs to lock against this function.
 *
 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
 * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
 * will come out of the lock with the buffer dirty, which makes it
 * ineligible for release here.
 *
 * Who else is affected by this?  hmm...  Really the only contender
 * is do_get_write_access() - it could be looking at the buffer while
 * journal_try_to_free_buffer() is changing its state.  But that
 * cannot happen because we never reallocate freed data as metadata
 * while the data is part of a transaction.  Yes?
 *
 * Return 0 on failure, 1 on success
 */
int jbd2_journal_try_to_free_buffers(journal_t *journal,
                                struct page *page, gfp_t gfp_mask)
{
        struct buffer_head *head;
        struct buffer_head *bh;
        int ret = 0;

        J_ASSERT(PageLocked(page));

        head = page_buffers(page);
        bh = head;
        do {
                struct journal_head *jh;

                /*
                 * We take our own ref against the journal_head here to avoid
                 * having to add tons of locking around each instance of
                 * jbd2_journal_remove_journal_head() and
                 * jbd2_journal_put_journal_head().
                 */
                jh = jbd2_journal_grab_journal_head(bh);
                if (!jh)
                        continue;

                jbd_lock_bh_state(bh);
                __journal_try_to_free_buffer(journal, bh);
                jbd2_journal_put_journal_head(jh);
                jbd_unlock_bh_state(bh);
                if (buffer_jbd(bh))
                        goto busy;
        } while ((bh = bh->b_this_page) != head);

        ret = try_to_free_buffers(page);

busy:
        return ret;
}

/*
 * This buffer is no longer needed.  If it is on an older transaction's
 * checkpoint list we need to record it on this transaction's forget list
 * to pin this buffer (and hence its checkpointing transaction) down until
 * this transaction commits.  If the buffer isn't on a checkpoint list, we
 * release it.
 * Returns non-zero if JBD no longer has an interest in the buffer.
 *
 * Called under j_list_lock.
 *
 * Called under jbd_lock_bh_state(bh).
 */
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
        int may_free = 1;
        struct buffer_head *bh = jh2bh(jh);

        __jbd2_journal_unfile_buffer(jh);

        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
                /*
                 * We don't want to write the buffer anymore, clear the
                 * bit so that we don't confuse checks in
                 * __journal_file_buffer
                 */
                clear_buffer_dirty(bh);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                may_free = 0;
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
                jbd2_journal_remove_journal_head(bh);
                __brelse(bh);
        }
        return may_free;
}

/*
 * jbd2_journal_invalidatepage
 *
 * This code is tricky.  It has a number of cases to deal with.
 *
 * There are two invariants which this code relies on:
 *
 * i_size must be updated on disk before we start calling invalidatepage on the
 * data.
 *
 * This is done in ext3 by defining an ext3_setattr method which
 * updates i_size before truncate gets going.  By maintaining this
 * invariant, we can be sure that it is safe to throw away any buffers
 * attached to the current transaction: once the transaction commits,
 * we know that the data will not be needed.
 *
 * Note however that we can *not* throw away data belonging to the
 * previous, committing transaction!
 *
 * Any disk blocks which *are* part of the previous, committing
 * transaction (and which therefore cannot be discarded immediately) are
 * not going to be reused in the new running transaction
 *
 * The bitmap committed_data images guarantee this: any block which is
 * allocated in one transaction and removed in the next will be marked
 * as in-use in the committed_data bitmap, so cannot be reused until
 * the next transaction to delete the block commits.  This means that
 * leaving committing buffers dirty is quite safe: the disk blocks
 * cannot be reallocated to a different file and so buffer aliasing is
 * not possible.
 *
 *
 * The above applies mainly to ordered data mode.  In writeback mode we
 * don't make guarantees about the order in which data hits disk --- in
 * particular we don't guarantee that new dirty data is flushed before
 * transaction commit --- so it is always safe just to discard data
 * immediately in that mode.  --sct
 */

/*
 * The journal_unmap_buffer helper function returns zero if the buffer
 * concerned remains pinned as an anonymous buffer belonging to an older
 * transaction.
 *
 * We're outside-transaction here.  Either or both of j_running_transaction
 * and j_committing_transaction may be NULL.
 */
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
{
        transaction_t *transaction;
        struct journal_head *jh;
        int may_free = 1;
        int ret;

        BUFFER_TRACE(bh, "entry");

        /*
         * It is safe to proceed here without the j_list_lock because the
         * buffers cannot be stolen by try_to_free_buffers as long as we are
         * holding the page lock. --sct
         */

        if (!buffer_jbd(bh))
                goto zap_buffer_unlocked;

        /* OK, we have data buffer in journaled mode */
        write_lock(&journal->j_state_lock);
        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);

        jh = jbd2_journal_grab_journal_head(bh);
        if (!jh)
                goto zap_buffer_no_jh;

        /*
         * We cannot remove the buffer from checkpoint lists until the
         * transaction adding the inode to the orphan list (let's call
         * it T) is committed.  Otherwise, if the transaction changing
         * the buffer were cleaned from the journal before T committed,
         * a crash would cause the correct contents of the buffer to be
         * lost.  On the other hand, we have to clear the buffer dirty
         * bit at latest at the moment when the transaction marking the
         * buffer as freed in the filesystem structures is committed,
         * because from that moment on the buffer can be reallocated
         * and used by a different page.  Since the block hasn't been
         * freed yet but the inode has already been added to the orphan
         * list, it is safe for us to add the buffer to the BJ_Forget
         * list of the newest transaction.
         */
        transaction = jh->b_transaction;
        if (transaction == NULL) {
                /* First case: not on any transaction.  If it
/*
 * The journal_unmap_buffer helper function returns zero if the buffer
 * concerned remains pinned as an anonymous buffer belonging to an older
 * transaction.
 *
 * We're outside-transaction here.  Either or both of j_running_transaction
 * and j_committing_transaction may be NULL.
 */
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
{
	transaction_t *transaction;
	struct journal_head *jh;
	int may_free = 1;
	int ret;

	BUFFER_TRACE(bh, "entry");

	/*
	 * It is safe to proceed here without the j_list_lock because the
	 * buffers cannot be stolen by try_to_free_buffers as long as we are
	 * holding the page lock. --sct
	 */

	if (!buffer_jbd(bh))
		goto zap_buffer_unlocked;

	/* OK, we have data buffer in journaled mode */
	write_lock(&journal->j_state_lock);
	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);

	jh = jbd2_journal_grab_journal_head(bh);
	if (!jh)
		goto zap_buffer_no_jh;

	/*
	 * We cannot remove the buffer from checkpoint lists until the
	 * transaction adding the inode to the orphan list (let's call it T)
	 * is committed.  Otherwise, if the transaction changing the buffer
	 * were cleaned from the journal before T committed, a crash would
	 * cause the correct contents of the buffer to be lost.  On the
	 * other hand, we have to clear the buffer dirty bit, at the latest,
	 * at the moment when the transaction marking the buffer as freed in
	 * the filesystem structures commits, because from that moment on
	 * the buffer can be reallocated and used by a different page.
	 * Since the block hasn't been freed yet but the inode has already
	 * been added to the orphan list, it is safe for us to add the
	 * buffer to the BJ_Forget list of the newest transaction.
	 */
	transaction = jh->b_transaction;
	if (transaction == NULL) {
		/* First case: not on any transaction.  If it
		 * has no checkpoint link, then we can zap it:
		 * it's a writeback-mode buffer so we don't care
		 * if it hits disk safely. */
		if (!jh->b_cp_transaction) {
			JBUFFER_TRACE(jh, "not on any transaction: zap");
			goto zap_buffer;
		}

		if (!buffer_dirty(bh)) {
			/* bdflush has written it.  We can drop it now */
			goto zap_buffer;
		}

		/* OK, it must be in the journal but still not
		 * written fully to disk: it's metadata or
		 * journaled data... */

		if (journal->j_running_transaction) {
			/* ... and once the current transaction has
			 * committed, the buffer won't be needed any
			 * longer. */
			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
			ret = __dispose_buffer(jh,
					journal->j_running_transaction);
			jbd2_journal_put_journal_head(jh);
			spin_unlock(&journal->j_list_lock);
			jbd_unlock_bh_state(bh);
			write_unlock(&journal->j_state_lock);
			return ret;
		} else {
			/* There is no currently-running transaction.  So the
			 * orphan record which we wrote for this file must have
			 * passed into commit.  We must attach this buffer to
			 * the committing transaction, if it exists. */
			if (journal->j_committing_transaction) {
				JBUFFER_TRACE(jh, "give to committing trans");
				ret = __dispose_buffer(jh,
					journal->j_committing_transaction);
				jbd2_journal_put_journal_head(jh);
				spin_unlock(&journal->j_list_lock);
				jbd_unlock_bh_state(bh);
				write_unlock(&journal->j_state_lock);
				return ret;
			} else {
				/* The orphan record's transaction has
				 * committed.  We can cleanse this buffer */
				clear_buffer_jbddirty(bh);
				goto zap_buffer;
			}
		}
	} else if (transaction == journal->j_committing_transaction) {
		JBUFFER_TRACE(jh, "on committing transaction");
		/*
		 * The buffer is committing, we simply cannot touch
		 * it.  So we just set b_next_transaction to the
		 * running transaction (if there is one) and mark the
		 * buffer as freed so that commit code knows it should
		 * clear dirty bits when it is done with the buffer.
		 */
		set_buffer_freed(bh);
		if (journal->j_running_transaction && buffer_jbddirty(bh))
			jh->b_next_transaction = journal->j_running_transaction;
		jbd2_journal_put_journal_head(jh);
		spin_unlock(&journal->j_list_lock);
		jbd_unlock_bh_state(bh);
		write_unlock(&journal->j_state_lock);
		return 0;
	} else {
		/* Good, the buffer belongs to the running transaction.
		 * We are writing our own transaction's data, not any
		 * previous one's, so it is safe to throw it away
		 * (remember that we expect the filesystem to have set
		 * i_size already for this truncate so recovery will not
		 * expose the disk blocks we are discarding here.) */
		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
		JBUFFER_TRACE(jh, "on running transaction");
		may_free = __dispose_buffer(jh, transaction);
	}

zap_buffer:
	jbd2_journal_put_journal_head(jh);
zap_buffer_no_jh:
	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);
	write_unlock(&journal->j_state_lock);
zap_buffer_unlocked:
	clear_buffer_dirty(bh);
	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	bh->b_bdev = NULL;
	return may_free;
}
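/*
 * Example: journal_unmap_buffer() nests its three locks in a fixed
 * order --- j_state_lock, then the buffer's state lock, then
 * j_list_lock --- and every exit path drops them in reverse.  A hedged
 * sketch of that pattern in isolation; example_locked_op() and
 * example_op() are hypothetical, shown only to make the ordering
 * explicit.
 */
#if 0
static void example_locked_op(journal_t *journal, struct buffer_head *bh)
{
	write_lock(&journal->j_state_lock);	/* outermost */
	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);	/* innermost */

	example_op(journal, bh);		/* hypothetical work */

	spin_unlock(&journal->j_list_lock);
	jbd_unlock_bh_state(bh);
	write_unlock(&journal->j_state_lock);
}
#endif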
/**
 * jbd2_journal_invalidatepage()
 * @journal: journal to use for flush...
 * @page:    page to flush
 * @offset:  offset within the page from which to start invalidating.
 *
 * Reap page buffers containing data after offset in page.
 */
void jbd2_journal_invalidatepage(journal_t *journal,
				 struct page *page,
				 unsigned long offset)
{
	struct buffer_head *head, *bh, *next;
	unsigned int curr_off = 0;
	int may_free = 1;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		return;

	/* We will potentially be playing with lists other than just the
	 * data lists (especially for journaled data mode), so be
	 * cautious in our locking. */

	head = bh = page_buffers(page);
	do {
		unsigned int next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		if (offset <= curr_off) {
			/* This block is wholly outside the truncation point */
			lock_buffer(bh);
			may_free &= journal_unmap_buffer(journal, bh);
			unlock_buffer(bh);
		}
		curr_off = next_off;
		bh = next;

	} while (bh != head);

	if (!offset) {
		if (may_free && try_to_free_buffers(page))
			J_ASSERT(!page_has_buffers(page));
	}
}
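/*
 * Example: a journaling filesystem's ->invalidatepage hook usually just
 * forwards to jbd2_journal_invalidatepage() once it has located the
 * journal.  A minimal hedged sketch; example_invalidatepage() and
 * EXAMPLE_JOURNAL() are hypothetical names.
 */
#if 0
static void example_invalidatepage(struct page *page, unsigned long offset)
{
	/* EXAMPLE_JOURNAL() is a hypothetical per-sb journal accessor. */
	journal_t *journal = EXAMPLE_JOURNAL(page->mapping->host);

	if (journal)
		jbd2_journal_invalidatepage(journal, page, offset);
	else
		block_invalidatepage(page, offset);
}
#endif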
/*
 * File a buffer on the given transaction list.
 */
void __jbd2_journal_file_buffer(struct journal_head *jh,
			transaction_t *transaction, int jlist)
{
	struct journal_head **list = NULL;
	int was_dirty = 0;
	struct buffer_head *bh = jh2bh(jh);

	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
	assert_spin_locked(&transaction->t_journal->j_list_lock);

	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
				jh->b_transaction == NULL);

	if (jh->b_transaction && jh->b_jlist == jlist)
		return;

	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
	    jlist == BJ_Shadow || jlist == BJ_Forget) {
		/*
		 * For metadata buffers, we track dirty bit in buffer_jbddirty
		 * instead of buffer_dirty.  We should not see a dirty bit set
		 * here because we clear it in do_get_write_access but e.g.
		 * tune2fs can modify the sb and set the dirty bit at any
		 * time so we try to gracefully handle that.
		 */
		if (buffer_dirty(bh))
			warn_dirty_buffer(bh);
		if (test_clear_buffer_dirty(bh) ||
		    test_clear_buffer_jbddirty(bh))
			was_dirty = 1;
	}

	if (jh->b_transaction)
		__jbd2_journal_temp_unlink_buffer(jh);
	jh->b_transaction = transaction;

	switch (jlist) {
	case BJ_None:
		J_ASSERT_JH(jh, !jh->b_committed_data);
		J_ASSERT_JH(jh, !jh->b_frozen_data);
		return;
	case BJ_Metadata:
		transaction->t_nr_buffers++;
		list = &transaction->t_buffers;
		break;
	case BJ_Forget:
		list = &transaction->t_forget;
		break;
	case BJ_IO:
		list = &transaction->t_iobuf_list;
		break;
	case BJ_Shadow:
		list = &transaction->t_shadow_list;
		break;
	case BJ_LogCtl:
		list = &transaction->t_log_list;
		break;
	case BJ_Reserved:
		list = &transaction->t_reserved_list;
		break;
	}

	__blist_add_buffer(list, jh);
	jh->b_jlist = jlist;

	if (was_dirty)
		set_buffer_jbddirty(bh);
}

void jbd2_journal_file_buffer(struct journal_head *jh,
		transaction_t *transaction, int jlist)
{
	jbd_lock_bh_state(jh2bh(jh));
	spin_lock(&transaction->t_journal->j_list_lock);
	__jbd2_journal_file_buffer(jh, transaction, jlist);
	spin_unlock(&transaction->t_journal->j_list_lock);
	jbd_unlock_bh_state(jh2bh(jh));
}

/*
 * Remove a buffer from its current buffer list in preparation for
 * dropping it from its current transaction entirely.  If the buffer has
 * already started to be used by a subsequent transaction, refile the
 * buffer on that transaction's metadata list.
 *
 * Called under journal->j_list_lock
 *
 * Called under jbd_lock_bh_state(jh2bh(jh))
 */
void __jbd2_journal_refile_buffer(struct journal_head *jh)
{
	int was_dirty, jlist;
	struct buffer_head *bh = jh2bh(jh);

	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
	if (jh->b_transaction)
		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);

	/* If the buffer is now unused, just drop it. */
	if (jh->b_next_transaction == NULL) {
		__jbd2_journal_unfile_buffer(jh);
		return;
	}

	/*
	 * It has been modified by a later transaction: add it to the new
	 * transaction's metadata list.
	 */

	was_dirty = test_clear_buffer_jbddirty(bh);
	__jbd2_journal_temp_unlink_buffer(jh);
	jh->b_transaction = jh->b_next_transaction;
	jh->b_next_transaction = NULL;
	if (buffer_freed(bh))
		jlist = BJ_Forget;
	else if (jh->b_modified)
		jlist = BJ_Metadata;
	else
		jlist = BJ_Reserved;
	__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);

	if (was_dirty)
		set_buffer_jbddirty(bh);
}
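/*
 * Example: filesystems do not normally call the file/refile helpers
 * directly; a typical metadata update goes through the handle API,
 * which files the buffer on the running transaction's BJ_Metadata list
 * internally.  A hedged sketch of that canonical pattern;
 * example_update() is a hypothetical name, the two jbd2 calls are the
 * real API.
 */
#if 0
static int example_update(handle_t *handle, struct buffer_head *bh)
{
	int err;

	/* Declare intent to modify bh under this handle... */
	err = jbd2_journal_get_write_access(handle, bh);
	if (err)
		return err;

	/* ...modify the buffer contents here... */

	/* ...then mark it dirty; this files it on BJ_Metadata. */
	return jbd2_journal_dirty_metadata(handle, bh);
}
#endif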
/*
 * For the unlocked version of this call, also make sure that any
 * hanging journal_head is cleaned up if necessary.
 *
 * __jbd2_journal_refile_buffer is usually called as part of a single locked
 * operation on a buffer_head, in which the caller is probably going to
 * be hooking the journal_head onto other lists.  In that case it is up
 * to the caller to remove the journal_head if necessary.  For the
 * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be
 * doing anything else to the buffer so we need to do the cleanup
 * ourselves to avoid a jh leak.
 *
 * *** The journal_head may be freed by this call! ***
 */
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
	struct buffer_head *bh = jh2bh(jh);

	jbd_lock_bh_state(bh);
	spin_lock(&journal->j_list_lock);

	__jbd2_journal_refile_buffer(jh);
	jbd_unlock_bh_state(bh);
	jbd2_journal_remove_journal_head(bh);

	spin_unlock(&journal->j_list_lock);
	__brelse(bh);
}

/*
 * File inode in the inode list of the handle's transaction
 */
int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
{
	transaction_t *transaction = handle->h_transaction;
	journal_t *journal = transaction->t_journal;

	if (is_handle_aborted(handle))
		return -EIO;

	jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
			transaction->t_tid);

	/*
	 * First check whether inode isn't already on the transaction's
	 * lists without taking the lock.  Note that this check is safe
	 * without the lock as we cannot race with somebody removing inode
	 * from the transaction.  The reason is that we remove inode from the
	 * transaction only in journal_release_jbd_inode() and when we commit
	 * the transaction.  We are guarded from the first case by holding
	 * a reference to the inode.  We are safe against the second case
	 * because if jinode->i_transaction == transaction, commit code
	 * cannot touch the transaction because we hold reference to it,
	 * and if jinode->i_next_transaction == transaction, commit code
	 * will only file the inode where we want it.
	 */
	if (jinode->i_transaction == transaction ||
	    jinode->i_next_transaction == transaction)
		return 0;

	spin_lock(&journal->j_list_lock);

	if (jinode->i_transaction == transaction ||
	    jinode->i_next_transaction == transaction)
		goto done;

	/* On some different transaction's list - should be
	 * the committing one */
	if (jinode->i_transaction) {
		J_ASSERT(jinode->i_next_transaction == NULL);
		J_ASSERT(jinode->i_transaction ==
					journal->j_committing_transaction);
		jinode->i_next_transaction = transaction;
		goto done;
	}
	/* Not on any transaction list... */
	J_ASSERT(!jinode->i_next_transaction);
	jinode->i_transaction = transaction;
	list_add(&jinode->i_list, &transaction->t_inode_list);
done:
	spin_unlock(&journal->j_list_lock);

	return 0;
}
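/*
 * Example: in ordered mode the filesystem calls jbd2_journal_file_inode()
 * whenever a handle dirties an inode's data pages, so that commit will
 * flush that data before the metadata.  A hedged sketch of such a
 * wrapper; example_file_inode(), example_inode_is_ordered() and
 * example_jbd2_inode() are hypothetical helpers.
 */
#if 0
static int example_file_inode(handle_t *handle, struct inode *inode)
{
	/* No-op when the inode's data is not journalled in ordered mode. */
	if (!example_inode_is_ordered(inode))	/* hypothetical check */
		return 0;
	return jbd2_journal_file_inode(handle, example_jbd2_inode(inode));
}
#endif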
/*
 * File truncate and transaction commit interact with each other in a
 * non-trivial way.  If a transaction writing data block A is
 * committing, we cannot discard that data via truncate until it has
 * been written out.  Otherwise, if we crashed after the transaction
 * with the write had committed but before the transaction with the
 * truncate had committed, we could see stale data in block A.  This
 * function is a helper to solve this problem.  It starts writeout of
 * the truncated part in case it is in the committing transaction.
 *
 * Filesystem code must call this function when an inode is journaled in
 * ordered mode, before truncation happens and after the inode has been
 * placed on the orphan list with the new inode size.  The second condition
 * avoids the race that someone writes new data and we start
 * committing the transaction after this function has been called but
 * before a transaction for truncate is started (and furthermore it
 * allows us to optimize the case where the addition to the orphan list
 * happens in the same transaction as the write --- we don't have to
 * write any data in such a case).
 */
int jbd2_journal_begin_ordered_truncate(journal_t *journal,
					struct jbd2_inode *jinode,
					loff_t new_size)
{
	transaction_t *inode_trans, *commit_trans;
	int ret = 0;

	/* This is a quick check to avoid locking if not necessary */
	if (!jinode->i_transaction)
		goto out;
	/* Locks are here just to force reading of recent values, it is
	 * enough that the transaction was not committing before we started
	 * a transaction adding the inode to orphan list */
	read_lock(&journal->j_state_lock);
	commit_trans = journal->j_committing_transaction;
	read_unlock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	inode_trans = jinode->i_transaction;
	spin_unlock(&journal->j_list_lock);
	if (inode_trans == commit_trans) {
		ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
			new_size, LLONG_MAX);
		if (ret)
			jbd2_journal_abort(journal, ret);
	}
out:
	return ret;
}
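/*
 * Example: a filesystem's truncate path would call the helper above
 * right after putting the inode on the orphan list and before starting
 * the truncate transaction.  A hedged sketch; EXAMPLE_JOURNAL() and
 * example_jbd2_inode() are hypothetical accessors for the sb's journal
 * and the inode's struct jbd2_inode.
 */
#if 0
static int example_begin_ordered_truncate(struct inode *inode,
					   loff_t new_size)
{
	return jbd2_journal_begin_ordered_truncate(EXAMPLE_JOURNAL(inode),
						   example_jbd2_inode(inode),
						   new_size);
}
#endif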