1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * linux/fs/jbd2/journal.c 4 * 5 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 6 * 7 * Copyright 1998 Red Hat corp --- All Rights Reserved 8 * 9 * Generic filesystem journal-writing code; part of the ext2fs 10 * journaling system. 11 * 12 * This file manages journals: areas of disk reserved for logging 13 * transactional updates. This includes the kernel journaling thread 14 * which is responsible for scheduling updates to the log. 15 * 16 * We do not actually manage the physical storage of the journal in this 17 * file: that is left to a per-journal policy function, which allows us 18 * to store the journal within a filesystem-specified area for ext2 19 * journaling (ext2 can use a reserved inode for storing the log). 20 */ 21 22 #include <linux/module.h> 23 #include <linux/time.h> 24 #include <linux/fs.h> 25 #include <linux/jbd2.h> 26 #include <linux/errno.h> 27 #include <linux/slab.h> 28 #include <linux/init.h> 29 #include <linux/mm.h> 30 #include <linux/freezer.h> 31 #include <linux/pagemap.h> 32 #include <linux/kthread.h> 33 #include <linux/poison.h> 34 #include <linux/proc_fs.h> 35 #include <linux/seq_file.h> 36 #include <linux/math64.h> 37 #include <linux/hash.h> 38 #include <linux/log2.h> 39 #include <linux/vmalloc.h> 40 #include <linux/backing-dev.h> 41 #include <linux/bitops.h> 42 #include <linux/ratelimit.h> 43 #include <linux/sched/mm.h> 44 45 #define CREATE_TRACE_POINTS 46 #include <trace/events/jbd2.h> 47 48 #include <linux/uaccess.h> 49 #include <asm/page.h> 50 51 #ifdef CONFIG_JBD2_DEBUG 52 static ushort jbd2_journal_enable_debug __read_mostly; 53 54 module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644); 55 MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2"); 56 #endif 57 58 EXPORT_SYMBOL(jbd2_journal_extend); 59 EXPORT_SYMBOL(jbd2_journal_stop); 60 EXPORT_SYMBOL(jbd2_journal_lock_updates); 61 EXPORT_SYMBOL(jbd2_journal_unlock_updates); 62 
EXPORT_SYMBOL(jbd2_journal_get_write_access); 63 EXPORT_SYMBOL(jbd2_journal_get_create_access); 64 EXPORT_SYMBOL(jbd2_journal_get_undo_access); 65 EXPORT_SYMBOL(jbd2_journal_set_triggers); 66 EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 67 EXPORT_SYMBOL(jbd2_journal_forget); 68 EXPORT_SYMBOL(jbd2_journal_flush); 69 EXPORT_SYMBOL(jbd2_journal_revoke); 70 71 EXPORT_SYMBOL(jbd2_journal_init_dev); 72 EXPORT_SYMBOL(jbd2_journal_init_inode); 73 EXPORT_SYMBOL(jbd2_journal_check_used_features); 74 EXPORT_SYMBOL(jbd2_journal_check_available_features); 75 EXPORT_SYMBOL(jbd2_journal_set_features); 76 EXPORT_SYMBOL(jbd2_journal_load); 77 EXPORT_SYMBOL(jbd2_journal_destroy); 78 EXPORT_SYMBOL(jbd2_journal_abort); 79 EXPORT_SYMBOL(jbd2_journal_errno); 80 EXPORT_SYMBOL(jbd2_journal_ack_err); 81 EXPORT_SYMBOL(jbd2_journal_clear_err); 82 EXPORT_SYMBOL(jbd2_log_wait_commit); 83 EXPORT_SYMBOL(jbd2_journal_start_commit); 84 EXPORT_SYMBOL(jbd2_journal_force_commit_nested); 85 EXPORT_SYMBOL(jbd2_journal_wipe); 86 EXPORT_SYMBOL(jbd2_journal_blocks_per_folio); 87 EXPORT_SYMBOL(jbd2_journal_invalidate_folio); 88 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); 89 EXPORT_SYMBOL(jbd2_journal_force_commit); 90 EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); 91 EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); 92 EXPORT_SYMBOL(jbd2_journal_finish_inode_data_buffers); 93 EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 94 EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); 95 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); 96 EXPORT_SYMBOL(jbd2_inode_cache); 97 98 #ifdef CONFIG_JBD2_DEBUG 99 void __jbd2_debug(int level, const char *file, const char *func, 100 unsigned int line, const char *fmt, ...) 
101 { 102 struct va_format vaf; 103 va_list args; 104 105 if (level > jbd2_journal_enable_debug) 106 return; 107 va_start(args, fmt); 108 vaf.fmt = fmt; 109 vaf.va = &args; 110 printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf); 111 va_end(args); 112 } 113 #endif 114 115 /* Checksumming functions */ 116 static __be32 jbd2_superblock_csum(journal_superblock_t *sb) 117 { 118 __u32 csum; 119 __be32 old_csum; 120 121 old_csum = sb->s_checksum; 122 sb->s_checksum = 0; 123 csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t)); 124 sb->s_checksum = old_csum; 125 126 return cpu_to_be32(csum); 127 } 128 129 /* 130 * Helper function used to manage commit timeouts 131 */ 132 133 static void commit_timeout(struct timer_list *t) 134 { 135 journal_t *journal = timer_container_of(journal, t, j_commit_timer); 136 137 wake_up_process(journal->j_task); 138 } 139 140 /* 141 * kjournald2: The main thread function used to manage a logging device 142 * journal. 143 * 144 * This kernel thread is responsible for two things: 145 * 146 * 1) COMMIT: Every so often we need to commit the current state of the 147 * filesystem to disk. The journal thread is responsible for writing 148 * all of the metadata buffers to disk. If a fast commit is ongoing 149 * journal thread waits until it's done and then continues from 150 * there on. 151 * 152 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all 153 * of the data in that part of the log has been rewritten elsewhere on 154 * the disk. Flushing these old buffers to reclaim space in the log is 155 * known as checkpointing, and this thread is responsible for that job. 
*/

/*
 * Thread entry point. @arg is the journal_t this thread serves. The loop
 * runs with j_state_lock held for writing except while committing,
 * freezing or sleeping, and exits (returning 0) once JBD2_UNMOUNT is seen
 * in j_flags.
 */
static int kjournald2(void *arg)
{
	journal_t *journal = arg;
	transaction_t *transaction;

	/*
	 * Set up an interval timer which can be used to trigger a commit wakeup
	 * after the commit interval expires
	 */
	timer_setup(&journal->j_commit_timer, commit_timeout, 0);

	set_freezable();

	/* Record that the journal thread is running */
	journal->j_task = current;
	/* jbd2_journal_start_thread() waits on this queue for j_task != NULL */
	wake_up(&journal->j_wait_done_commit);

	/*
	 * Make sure that no allocations from this kernel thread will ever
	 * recurse to the fs layer because we are responsible for the
	 * transaction commit and any fs involvement might get stuck waiting for
	 * the transaction commit.
	 */
	memalloc_nofs_save();

	/*
	 * And now, wait forever for commit wakeup events.
	 */
	write_lock(&journal->j_state_lock);

loop:
	if (journal->j_flags & JBD2_UNMOUNT)
		goto end_loop;

	jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n",
		journal->j_commit_sequence, journal->j_commit_request);

	/* A commit has been requested that has not been performed yet. */
	if (journal->j_commit_sequence != journal->j_commit_request) {
		jbd2_debug(1, "OK, requests differ\n");
		/* The commit itself runs without j_state_lock held. */
		write_unlock(&journal->j_state_lock);
		timer_delete_sync(&journal->j_commit_timer);
		jbd2_journal_commit_transaction(journal);
		write_lock(&journal->j_state_lock);
		goto loop;
	}

	/* Nothing pending: let waiters on j_wait_done_commit re-check. */
	wake_up(&journal->j_wait_done_commit);
	if (freezing(current)) {
		/*
		 * The simpler the better. Flushing journal isn't a
		 * good idea, because that depends on threads that may
		 * be already stopped.
		 */
		jbd2_debug(1, "Now suspending kjournald2\n");
		write_unlock(&journal->j_state_lock);
		try_to_freeze();
		write_lock(&journal->j_state_lock);
	} else {
		/*
		 * We assume on resume that commits are already there,
		 * so we don't sleep
		 */
		DEFINE_WAIT(wait);

		/*
		 * Queue ourselves on j_wait_commit before testing the
		 * condition, while still holding j_state_lock.
		 */
		prepare_to_wait(&journal->j_wait_commit, &wait,
				TASK_INTERRUPTIBLE);
		transaction = journal->j_running_transaction;
		/* Sleep only if there is no running transaction or it has not expired. */
		if (transaction == NULL ||
		    time_before(jiffies, transaction->t_expires)) {
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
		}
		finish_wait(&journal->j_wait_commit, &wait);
	}

	jbd2_debug(1, "kjournald2 wakes\n");

	/*
	 * Were we woken up by a commit wakeup event?
	 */
	transaction = journal->j_running_transaction;
	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
		/* The running transaction expired: request its commit. */
		journal->j_commit_request = transaction->t_tid;
		jbd2_debug(1, "woke because of timeout\n");
	}
	goto loop;

end_loop:
	timer_delete_sync(&journal->j_commit_timer);
	/* journal_kill_thread() waits on j_wait_done_commit for j_task == NULL */
	journal->j_task = NULL;
	wake_up(&journal->j_wait_done_commit);
	jbd2_debug(1, "Journal thread exiting.\n");
	write_unlock(&journal->j_state_lock);
	return 0;
}

/*
 * Start the kjournald2 thread for @journal, named "jbd2/<j_devname>", and
 * wait until it has recorded itself in journal->j_task.
 *
 * Returns 0 on success or the PTR_ERR() of the failed kthread_run().
 */
static int jbd2_journal_start_thread(journal_t *journal)
{
	struct task_struct *t;

	t = kthread_run(kjournald2, journal, "jbd2/%s",
			journal->j_devname);
	if (IS_ERR(t))
		return PTR_ERR(t);

	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
	return 0;
}

/*
 * Ask the journal thread to exit by setting JBD2_UNMOUNT, then keep waking
 * it and waiting until it has cleared journal->j_task.
 */
static void journal_kill_thread(journal_t *journal)
{
	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_UNMOUNT;

	while (journal->j_task) {
		/* Drop the lock so the thread can take it and observe the flag. */
		write_unlock(&journal->j_state_lock);
		wake_up(&journal->j_wait_commit);
		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
		write_lock(&journal->j_state_lock);
	}
	write_unlock(&journal->j_state_lock);
}

/* Return true if the first 32-bit word at @data equals the big-endian JBD2_MAGIC_NUMBER. */
static inline bool jbd2_data_needs_escaping(char *data)
{
	return *((__be32 *)data) == cpu_to_be32(JBD2_MAGIC_NUMBER);
}

/* Escape a block by overwriting its first word with zero. */
static inline void jbd2_data_do_escape(char *data)
{
	*((unsigned int *)data) = 0;
}

/*
 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
 *
 * Writes a metadata buffer to a given disk block.  The actual IO is not
 * performed but a new buffer_head is constructed which labels the data
 * to be written with the correct destination disk block.
 *
 * Any magic-number escaping which needs to be done will cause a
 * copy-out here.  If the buffer happens to start with the
 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
 * magic number is only written to the log for descriptor blocks.  In
 * this case, we copy the data and replace the first word with 0, and we
 * return a result code which indicates that this buffer needs to be
 * marked as an escaped buffer in the corresponding log descriptor
 * block.  The missing word can then be restored when the block is read
 * during recovery.
 *
 * If the source buffer has already been modified by a new transaction
 * since we took the last commit snapshot, we use the frozen copy of
 * that data for IO. If we end up using the existing buffer_head's data
 * for the write, then we have to make sure nobody modifies it while the
 * IO is in progress. do_get_write_access() handles this.
 *
 * The function returns a pointer to the buffer_head to be used for IO.
*
 * Parameters:
 *  @transaction: transaction the buffer is filed against (BJ_Shadow list)
 *  @jh_in:       journal head of the metadata buffer to be logged
 *  @bh_out:      receives the newly allocated buffer_head to submit
 *  @blocknr:     block number stored in the new buffer_head's b_blocknr
 *
 * Return value:
 *  =0: Finished OK without escape
 *  =1: Finished OK with escape
 */

int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
				  struct journal_head  *jh_in,
				  struct buffer_head **bh_out,
				  sector_t blocknr)
{
	int do_escape = 0;
	struct buffer_head *new_bh;
	struct folio *new_folio;
	unsigned int new_offset;
	struct buffer_head *bh_in = jh2bh(jh_in);
	journal_t *journal = transaction->t_journal;

	/*
	 * The buffer really shouldn't be locked: only the current committing
	 * transaction is allowed to write it, so nobody else is allowed
	 * to do any IO.
	 *
	 * akpm: except if we're journalling data, and write() output is
	 * also part of a shared mapping, and another thread has
	 * decided to launch a writepage() against this buffer.
	 */
	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));

	/* __GFP_NOFAIL is passed, so the result is used without a NULL check. */
	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);

	/* keep subsequent assertions sane */
	atomic_set(&new_bh->b_count, 1);

	spin_lock(&jh_in->b_state_lock);
	/*
	 * If a new transaction has already done a buffer copy-out, then
	 * we use that version of the data for the commit.
	 */
	if (jh_in->b_frozen_data) {
		new_folio = virt_to_folio(jh_in->b_frozen_data);
		new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
		do_escape = jbd2_data_needs_escaping(jh_in->b_frozen_data);
		if (do_escape)
			jbd2_data_do_escape(jh_in->b_frozen_data);
	} else {
		char *tmp;
		char *mapped_data;

		/* No frozen copy: start from the live buffer's folio. */
		new_folio = bh_in->b_folio;
		new_offset = offset_in_folio(new_folio, bh_in->b_data);
		mapped_data = kmap_local_folio(new_folio, new_offset);
		/*
		 * Fire data frozen trigger if data already wasn't frozen. Do
		 * this before checking for escaping, as the trigger may modify
		 * the magic offset.  If a copy-out happens afterwards, it will
		 * have the correct data in the buffer.
		 */
		jbd2_buffer_frozen_trigger(jh_in, mapped_data,
					   jh_in->b_triggers);
		do_escape = jbd2_data_needs_escaping(mapped_data);
		kunmap_local(mapped_data);
		/*
		 * Do we need to do a data copy?
		 */
		if (!do_escape)
			goto escape_done;

		/*
		 * b_state_lock is dropped around the allocation, so
		 * b_frozen_data must be re-checked once it is retaken.
		 */
		spin_unlock(&jh_in->b_state_lock);
		tmp = kmalloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
		spin_lock(&jh_in->b_state_lock);
		if (jh_in->b_frozen_data) {
			/* A frozen copy appeared meanwhile; use it instead. */
			kfree(tmp);
			goto copy_done;
		}

		jh_in->b_frozen_data = tmp;
		memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
		/*
		 * This isn't strictly necessary, as we're using frozen
		 * data for the escaping, but it keeps consistency with
		 * b_frozen_data usage.
		 */
		jh_in->b_frozen_triggers = jh_in->b_triggers;

copy_done:
		/* Write from the frozen copy and escape its first word. */
		new_folio = virt_to_folio(jh_in->b_frozen_data);
		new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
		jbd2_data_do_escape(jh_in->b_frozen_data);
	}

escape_done:
	/* Point the new buffer_head at the chosen data and target block. */
	folio_set_bh(new_bh, new_folio, new_offset);
	new_bh->b_size = bh_in->b_size;
	new_bh->b_bdev = journal->j_dev;
	new_bh->b_blocknr = blocknr;
	/* b_private links the new buffer_head back to the original one. */
	new_bh->b_private = bh_in;
	set_buffer_mapped(new_bh);
	set_buffer_dirty(new_bh);

	*bh_out = new_bh;

	/*
	 * The to-be-written buffer needs to get moved to the io queue,
	 * and the original buffer whose contents we are shadowing or
	 * copying is moved to the transaction's shadow queue.
	 */
	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
	spin_unlock(&journal->j_list_lock);
	set_buffer_shadow(bh_in);
	spin_unlock(&jh_in->b_state_lock);

	return do_escape;
}

/*
 * Allocation code for the journal file.  Manage the space left in the
 * journal, so that we can begin checkpointing when appropriate.
 */

/*
 * Called with j_state_lock locked for writing.
441 * Returns true if a transaction commit was started. 442 */ 443 static int __jbd2_log_start_commit(journal_t *journal, tid_t target) 444 { 445 /* Return if the txn has already requested to be committed */ 446 if (journal->j_commit_request == target) 447 return 0; 448 449 /* 450 * The only transaction we can possibly wait upon is the 451 * currently running transaction (if it exists). Otherwise, 452 * the target tid must be an old one. 453 */ 454 if (journal->j_running_transaction && 455 journal->j_running_transaction->t_tid == target) { 456 /* 457 * We want a new commit: OK, mark the request and wakeup the 458 * commit thread. We do _not_ do the commit ourselves. 459 */ 460 461 journal->j_commit_request = target; 462 jbd2_debug(1, "JBD2: requesting commit %u/%u\n", 463 journal->j_commit_request, 464 journal->j_commit_sequence); 465 journal->j_running_transaction->t_requested = jiffies; 466 wake_up(&journal->j_wait_commit); 467 return 1; 468 } else if (!tid_geq(journal->j_commit_request, target)) 469 /* This should never happen, but if it does, preserve 470 the evidence before kjournald goes into a loop and 471 increments j_commit_sequence beyond all recognition. */ 472 WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n", 473 journal->j_commit_request, 474 journal->j_commit_sequence, 475 target, journal->j_running_transaction ? 476 journal->j_running_transaction->t_tid : 0); 477 return 0; 478 } 479 480 int jbd2_log_start_commit(journal_t *journal, tid_t tid) 481 { 482 int ret; 483 484 write_lock(&journal->j_state_lock); 485 ret = __jbd2_log_start_commit(journal, tid); 486 write_unlock(&journal->j_state_lock); 487 return ret; 488 } 489 490 /* 491 * Force and wait any uncommitted transactions. We can only force the running 492 * transaction if we don't have an active handle, otherwise, we will deadlock. 493 * Returns: <0 in case of error, 494 * 0 if nothing to commit, 495 * 1 if transaction was successfully committed. 
496 */ 497 static int __jbd2_journal_force_commit(journal_t *journal) 498 { 499 transaction_t *transaction = NULL; 500 tid_t tid; 501 int need_to_start = 0, ret = 0; 502 503 read_lock(&journal->j_state_lock); 504 if (journal->j_running_transaction && !current->journal_info) { 505 transaction = journal->j_running_transaction; 506 if (!tid_geq(journal->j_commit_request, transaction->t_tid)) 507 need_to_start = 1; 508 } else if (journal->j_committing_transaction) 509 transaction = journal->j_committing_transaction; 510 511 if (!transaction) { 512 /* Nothing to commit */ 513 read_unlock(&journal->j_state_lock); 514 return 0; 515 } 516 tid = transaction->t_tid; 517 read_unlock(&journal->j_state_lock); 518 if (need_to_start) 519 jbd2_log_start_commit(journal, tid); 520 ret = jbd2_log_wait_commit(journal, tid); 521 if (!ret) 522 ret = 1; 523 524 return ret; 525 } 526 527 /** 528 * jbd2_journal_force_commit_nested - Force and wait upon a commit if the 529 * calling process is not within transaction. 530 * 531 * @journal: journal to force 532 * Returns true if progress was made. 533 * 534 * This is used for forcing out undo-protected data which contains 535 * bitmaps, when the fs is running out of space. 536 */ 537 int jbd2_journal_force_commit_nested(journal_t *journal) 538 { 539 int ret; 540 541 ret = __jbd2_journal_force_commit(journal); 542 return ret > 0; 543 } 544 545 /** 546 * jbd2_journal_force_commit() - force any uncommitted transactions 547 * @journal: journal to force 548 * 549 * Caller want unconditional commit. We can only force the running transaction 550 * if we don't have an active handle, otherwise, we will deadlock. 551 */ 552 int jbd2_journal_force_commit(journal_t *journal) 553 { 554 int ret; 555 556 J_ASSERT(!current->journal_info); 557 ret = __jbd2_journal_force_commit(journal); 558 if (ret > 0) 559 ret = 0; 560 return ret; 561 } 562 563 /* 564 * Start a commit of the current running transaction (if any). 
* Returns true if a transaction is going to be committed (or is currently
 * already committing), and fills its tid in at *ptid (when ptid is non-NULL).
 * Returns 0 when there is neither a running nor a committing transaction.
 */
int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
{
	int ret = 0;

	write_lock(&journal->j_state_lock);
	if (journal->j_running_transaction) {
		tid_t tid = journal->j_running_transaction->t_tid;

		__jbd2_log_start_commit(journal, tid);
		/* There's a running transaction and we've just made sure
		 * its commit has been scheduled. */
		if (ptid)
			*ptid = tid;
		ret = 1;
	} else if (journal->j_committing_transaction) {
		/*
		 * If commit has been started, then we have to wait for
		 * completion of that transaction.
		 */
		if (ptid)
			*ptid = journal->j_committing_transaction->t_tid;
		ret = 1;
	}
	write_unlock(&journal->j_state_lock);
	return ret;
}

/*
 * Return 1 if a given transaction has not yet sent barrier request
 * connected with a transaction commit. If 0 is returned, transaction
 * may or may not have sent the barrier. Used to avoid sending barrier
 * twice in common cases.
 *
 * Always returns 0 when JBD2_BARRIER is not set in j_flags.
 */
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
	int ret = 0;
	transaction_t *commit_trans, *running_trans;

	if (!(journal->j_flags & JBD2_BARRIER))
		return 0;
	read_lock(&journal->j_state_lock);
	/* Transaction already committed? */
	if (tid_geq(journal->j_commit_sequence, tid))
		goto out;
	commit_trans = journal->j_committing_transaction;
	if (!commit_trans || commit_trans->t_tid != tid) {
		running_trans = journal->j_running_transaction;
		/*
		 * The query transaction hasn't started committing,
		 * it must still be running.
		 */
		if (WARN_ON_ONCE(!running_trans ||
				 running_trans->t_tid != tid))
			goto out;

		/* Record on the running transaction that a data flush is needed. */
		running_trans->t_need_data_flush = 1;
		ret = 1;
		goto out;
	}
	/*
	 * Transaction is being committed and we already proceeded to
	 * submitting a flush to fs partition?
	 */
	if (journal->j_fs_dev != journal->j_dev) {
		/* Journal device differs from the filesystem device. */
		if (!commit_trans->t_need_data_flush ||
		    commit_trans->t_state >= T_COMMIT_DFLUSH)
			goto out;
	} else {
		if (commit_trans->t_state >= T_COMMIT_JFLUSH)
			goto out;
	}
	ret = 1;
out:
	read_unlock(&journal->j_state_lock);
	return ret;
}
EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);

/*
 * Wait for a specified commit to complete.
 * The caller may not hold the journal lock.
 *
 * Returns 0, or -EIO if the journal is found aborted after the wait.
 */
int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
{
	int err = 0;

	read_lock(&journal->j_state_lock);
#ifdef CONFIG_PROVE_LOCKING
	/*
	 * Some callers make sure transaction is already committing and in that
	 * case we cannot block on open handles anymore. So don't warn in that
	 * case.
	 */
	if (tid_gt(tid, journal->j_commit_sequence) &&
	    (!journal->j_committing_transaction ||
	     journal->j_committing_transaction->t_tid != tid)) {
		read_unlock(&journal->j_state_lock);
		jbd2_might_wait_for_commit(journal);
		read_lock(&journal->j_state_lock);
	}
#endif
#ifdef CONFIG_JBD2_DEBUG
	/* Debug builds report waiting on a tid whose commit was not requested. */
	if (!tid_geq(journal->j_commit_request, tid)) {
		printk(KERN_ERR
		       "%s: error: j_commit_request=%u, tid=%u\n",
		       __func__, journal->j_commit_request, tid);
	}
#endif
	while (tid_gt(tid, journal->j_commit_sequence)) {
		jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
				  tid, journal->j_commit_sequence);
		read_unlock(&journal->j_state_lock);
		/* Wake the commit thread, then sleep until the sequence catches up. */
		wake_up(&journal->j_wait_commit);
		wait_event(journal->j_wait_done_commit,
				!tid_gt(tid, journal->j_commit_sequence));
		read_lock(&journal->j_state_lock);
	}
	read_unlock(&journal->j_state_lock);

	if (unlikely(is_journal_aborted(journal)))
		err = -EIO;
	return err;
}

/*
 * Start a fast commit. If there's an ongoing fast or full commit wait for
 * it to complete. Returns 0 if a new fast commit was started. Returns -EALREADY
 * if a fast commit is not needed, either because there's already a commit
 * going on or this tid has already been committed. Returns -EINVAL if no jbd2
 * commit has yet been performed. Returns -EIO if the journal is aborted.
 */
int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
{
	if (unlikely(is_journal_aborted(journal)))
		return -EIO;
	/*
	 * Fast commits only allowed if at least one full commit has
	 * been processed.
	 */
	if (!journal->j_stats.ts_tid)
		return -EINVAL;

	write_lock(&journal->j_state_lock);
	if (tid_geq(journal->j_commit_sequence, tid)) {
		write_unlock(&journal->j_state_lock);
		return -EALREADY;
	}

	if (journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
	    (journal->j_flags & JBD2_FAST_COMMIT_ONGOING)) {
		DEFINE_WAIT(wait);

		/*
		 * Queue on j_fc_wait before dropping the lock, then sleep
		 * until woken.
		 */
		prepare_to_wait(&journal->j_fc_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		write_unlock(&journal->j_state_lock);
		schedule();
		finish_wait(&journal->j_fc_wait, &wait);
		return -EALREADY;
	}
	journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
	write_unlock(&journal->j_state_lock);

	return 0;
}
EXPORT_SYMBOL(jbd2_fc_begin_commit);

/*
 * Stop a fast commit. If fallback is set, this function starts commit of
 * TID tid before any other fast commit can start.
 *
 * Returns 0, or the result of jbd2_complete_transaction() when @fallback
 * is set.
 */
static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
	/* The optional cleanup callback runs before the flags are updated. */
	if (journal->j_fc_cleanup_callback)
		journal->j_fc_cleanup_callback(journal, 0, tid);
	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
	if (fallback)
		journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
	write_unlock(&journal->j_state_lock);
	/* Release anyone sleeping in jbd2_fc_begin_commit(). */
	wake_up(&journal->j_fc_wait);
	if (fallback)
		return jbd2_complete_transaction(journal, tid);
	return 0;
}

/* End a fast commit without falling back to a full commit (tid 0). */
int jbd2_fc_end_commit(journal_t *journal)
{
	return __jbd2_fc_end_commit(journal, 0, false);
}
EXPORT_SYMBOL(jbd2_fc_end_commit);

/*
 * End a fast commit and fall back to a full commit of the running
 * transaction; tid 0 is passed when no transaction is running.
 */
int jbd2_fc_end_commit_fallback(journal_t *journal)
{
	tid_t tid;

	read_lock(&journal->j_state_lock);
	tid = journal->j_running_transaction ?
		journal->j_running_transaction->t_tid : 0;
	read_unlock(&journal->j_state_lock);
	return __jbd2_fc_end_commit(journal, tid, true);
}
EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);

/* Return 1 when transaction with given tid has already committed.
*/ 772 int jbd2_transaction_committed(journal_t *journal, tid_t tid) 773 { 774 return tid_geq(READ_ONCE(journal->j_commit_sequence), tid); 775 } 776 EXPORT_SYMBOL(jbd2_transaction_committed); 777 778 /* 779 * When this function returns the transaction corresponding to tid 780 * will be completed. If the transaction has currently running, start 781 * committing that transaction before waiting for it to complete. If 782 * the transaction id is stale, it is by definition already completed, 783 * so just return SUCCESS. 784 */ 785 int jbd2_complete_transaction(journal_t *journal, tid_t tid) 786 { 787 int need_to_wait = 1; 788 789 read_lock(&journal->j_state_lock); 790 if (journal->j_running_transaction && 791 journal->j_running_transaction->t_tid == tid) { 792 if (journal->j_commit_request != tid) { 793 /* transaction not yet started, so request it */ 794 read_unlock(&journal->j_state_lock); 795 jbd2_log_start_commit(journal, tid); 796 goto wait_commit; 797 } 798 } else if (!(journal->j_committing_transaction && 799 journal->j_committing_transaction->t_tid == tid)) 800 need_to_wait = 0; 801 read_unlock(&journal->j_state_lock); 802 if (!need_to_wait) 803 return 0; 804 wait_commit: 805 return jbd2_log_wait_commit(journal, tid); 806 } 807 EXPORT_SYMBOL(jbd2_complete_transaction); 808 809 /* 810 * Log buffer allocation routines: 811 */ 812 813 int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) 814 { 815 unsigned long blocknr; 816 817 write_lock(&journal->j_state_lock); 818 J_ASSERT(journal->j_free > 1); 819 820 blocknr = journal->j_head; 821 journal->j_head++; 822 journal->j_free--; 823 if (journal->j_head == journal->j_last) 824 journal->j_head = journal->j_first; 825 write_unlock(&journal->j_state_lock); 826 return jbd2_journal_bmap(journal, blocknr, retp); 827 } 828 829 /* Map one fast commit buffer for use by the file system */ 830 int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) 831 { 832 unsigned long long pblock; 833 
unsigned long blocknr; 834 int ret = 0; 835 struct buffer_head *bh; 836 int fc_off; 837 838 *bh_out = NULL; 839 840 if (journal->j_fc_off + journal->j_fc_first >= journal->j_fc_last) 841 return -EINVAL; 842 843 fc_off = journal->j_fc_off; 844 blocknr = journal->j_fc_first + fc_off; 845 journal->j_fc_off++; 846 ret = jbd2_journal_bmap(journal, blocknr, &pblock); 847 if (ret) 848 return ret; 849 850 bh = __getblk(journal->j_dev, pblock, journal->j_blocksize); 851 if (!bh) 852 return -ENOMEM; 853 854 journal->j_fc_wbuf[fc_off] = bh; 855 856 *bh_out = bh; 857 858 return 0; 859 } 860 EXPORT_SYMBOL(jbd2_fc_get_buf); 861 862 /* 863 * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf 864 * for completion. 865 */ 866 int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) 867 { 868 struct buffer_head *bh; 869 int i, j_fc_off; 870 871 j_fc_off = journal->j_fc_off; 872 873 /* 874 * Wait in reverse order to minimize chances of us being woken up before 875 * all IOs have completed 876 */ 877 for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) { 878 bh = journal->j_fc_wbuf[i]; 879 wait_on_buffer(bh); 880 /* 881 * Update j_fc_off so jbd2_fc_release_bufs can release remain 882 * buffer head. 883 */ 884 if (unlikely(!buffer_uptodate(bh))) { 885 journal->j_fc_off = i + 1; 886 return -EIO; 887 } 888 put_bh(bh); 889 journal->j_fc_wbuf[i] = NULL; 890 } 891 892 return 0; 893 } 894 EXPORT_SYMBOL(jbd2_fc_wait_bufs); 895 896 void jbd2_fc_release_bufs(journal_t *journal) 897 { 898 struct buffer_head *bh; 899 int i, j_fc_off; 900 901 j_fc_off = journal->j_fc_off; 902 903 for (i = j_fc_off - 1; i >= 0; i--) { 904 bh = journal->j_fc_wbuf[i]; 905 if (!bh) 906 break; 907 put_bh(bh); 908 journal->j_fc_wbuf[i] = NULL; 909 } 910 } 911 EXPORT_SYMBOL(jbd2_fc_release_bufs); 912 913 /* 914 * Conversion of logical to physical block numbers for the journal 915 * 916 * On external journals the journal blocks are identity-mapped, so 917 * this is a no-op. 
If needed, we can use j_blk_offset - everything is 918 * ready. 919 */ 920 int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, 921 unsigned long long *retp) 922 { 923 int err = 0; 924 unsigned long long ret; 925 sector_t block = blocknr; 926 927 if (journal->j_bmap) { 928 err = journal->j_bmap(journal, &block); 929 if (err == 0) 930 *retp = block; 931 } else if (journal->j_inode) { 932 ret = bmap(journal->j_inode, &block); 933 934 if (ret || !block) { 935 printk(KERN_ALERT "%s: journal block not found " 936 "at offset %lu on %s\n", 937 __func__, blocknr, journal->j_devname); 938 jbd2_journal_abort(journal, ret ? ret : -EFSCORRUPTED); 939 err = -EIO; 940 } else { 941 *retp = block; 942 } 943 944 } else { 945 *retp = blocknr; /* +journal->j_blk_offset */ 946 } 947 return err; 948 } 949 950 /* 951 * We play buffer_head aliasing tricks to write data/metadata blocks to 952 * the journal without copying their contents, but for journal 953 * descriptor blocks we do need to generate bona fide buffers. 954 * 955 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying 956 * the buffer's contents they really should run flush_dcache_folio(bh->b_folio). 957 * But we don't bother doing that, so there will be coherency problems with 958 * mmaps of blockdevs which hold live JBD-controlled filesystems. 
959 */ 960 struct buffer_head * 961 jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) 962 { 963 journal_t *journal = transaction->t_journal; 964 struct buffer_head *bh; 965 unsigned long long blocknr; 966 journal_header_t *header; 967 int err; 968 969 err = jbd2_journal_next_log_block(journal, &blocknr); 970 971 if (err) 972 return NULL; 973 974 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 975 if (!bh) 976 return NULL; 977 atomic_dec(&transaction->t_outstanding_credits); 978 lock_buffer(bh); 979 memset(bh->b_data, 0, journal->j_blocksize); 980 header = (journal_header_t *)bh->b_data; 981 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 982 header->h_blocktype = cpu_to_be32(type); 983 header->h_sequence = cpu_to_be32(transaction->t_tid); 984 set_buffer_uptodate(bh); 985 unlock_buffer(bh); 986 BUFFER_TRACE(bh, "return this buffer"); 987 return bh; 988 } 989 990 void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) 991 { 992 struct jbd2_journal_block_tail *tail; 993 __u32 csum; 994 995 if (!jbd2_journal_has_csum_v2or3(j)) 996 return; 997 998 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - 999 sizeof(struct jbd2_journal_block_tail)); 1000 tail->t_checksum = 0; 1001 csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); 1002 tail->t_checksum = cpu_to_be32(csum); 1003 } 1004 1005 /* 1006 * Return tid of the oldest transaction in the journal and block in the journal 1007 * where the transaction starts. 1008 * 1009 * If the journal is now empty, return which will be the next transaction ID 1010 * we will write and where will that transaction start. 1011 * 1012 * The return value is 0 if journal tail cannot be pushed any further, 1 if 1013 * it can. 
1014 */ 1015 int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, 1016 unsigned long *block) 1017 { 1018 transaction_t *transaction; 1019 int ret; 1020 1021 read_lock(&journal->j_state_lock); 1022 spin_lock(&journal->j_list_lock); 1023 transaction = journal->j_checkpoint_transactions; 1024 if (transaction) { 1025 *tid = transaction->t_tid; 1026 *block = transaction->t_log_start; 1027 } else if ((transaction = journal->j_committing_transaction) != NULL) { 1028 *tid = transaction->t_tid; 1029 *block = transaction->t_log_start; 1030 } else if ((transaction = journal->j_running_transaction) != NULL) { 1031 *tid = transaction->t_tid; 1032 *block = journal->j_head; 1033 } else { 1034 *tid = journal->j_transaction_sequence; 1035 *block = journal->j_head; 1036 } 1037 ret = tid_gt(*tid, journal->j_tail_sequence); 1038 spin_unlock(&journal->j_list_lock); 1039 read_unlock(&journal->j_state_lock); 1040 1041 return ret; 1042 } 1043 1044 /* 1045 * Update information in journal structure and in on disk journal superblock 1046 * about log tail. This function does not check whether information passed in 1047 * really pushes log tail further. It's responsibility of the caller to make 1048 * sure provided log tail information is valid (e.g. by holding 1049 * j_checkpoint_mutex all the time between computing log tail and calling this 1050 * function as is the case with jbd2_cleanup_journal_tail()). 1051 * 1052 * Requires j_checkpoint_mutex 1053 */ 1054 int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) 1055 { 1056 unsigned long freed; 1057 int ret; 1058 1059 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); 1060 1061 /* 1062 * We cannot afford for write to remain in drive's caches since as 1063 * soon as we update j_tail, next transaction can start reusing journal 1064 * space and if we lose sb update during power failure we'd replay 1065 * old transaction with possibly newly overwritten data. 
1066 */ 1067 ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); 1068 if (ret) 1069 goto out; 1070 1071 write_lock(&journal->j_state_lock); 1072 freed = block - journal->j_tail; 1073 if (block < journal->j_tail) 1074 freed += journal->j_last - journal->j_first; 1075 1076 trace_jbd2_update_log_tail(journal, tid, block, freed); 1077 jbd2_debug(1, 1078 "Cleaning journal tail from %u to %u (offset %lu), " 1079 "freeing %lu\n", 1080 journal->j_tail_sequence, tid, block, freed); 1081 1082 journal->j_free += freed; 1083 journal->j_tail_sequence = tid; 1084 journal->j_tail = block; 1085 write_unlock(&journal->j_state_lock); 1086 1087 out: 1088 return ret; 1089 } 1090 1091 /* 1092 * This is a variation of __jbd2_update_log_tail which checks for validity of 1093 * provided log tail and locks j_checkpoint_mutex. So it is safe against races 1094 * with other threads updating log tail. 1095 */ 1096 void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) 1097 { 1098 mutex_lock_io(&journal->j_checkpoint_mutex); 1099 if (tid_gt(tid, journal->j_tail_sequence)) 1100 __jbd2_update_log_tail(journal, tid, block); 1101 mutex_unlock(&journal->j_checkpoint_mutex); 1102 } 1103 1104 struct jbd2_stats_proc_session { 1105 journal_t *journal; 1106 struct transaction_stats_s *stats; 1107 int start; 1108 int max; 1109 }; 1110 1111 static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) 1112 { 1113 return *pos ? 
NULL : SEQ_START_TOKEN; 1114 } 1115 1116 static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) 1117 { 1118 (*pos)++; 1119 return NULL; 1120 } 1121 1122 static int jbd2_seq_info_show(struct seq_file *seq, void *v) 1123 { 1124 struct jbd2_stats_proc_session *s = seq->private; 1125 1126 if (v != SEQ_START_TOKEN) 1127 return 0; 1128 seq_printf(seq, "%lu transactions (%lu requested), " 1129 "each up to %u blocks\n", 1130 s->stats->ts_tid, s->stats->ts_requested, 1131 s->journal->j_max_transaction_buffers); 1132 if (s->stats->ts_tid == 0) 1133 return 0; 1134 seq_printf(seq, "average: \n %ums waiting for transaction\n", 1135 jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); 1136 seq_printf(seq, " %ums request delay\n", 1137 (s->stats->ts_requested == 0) ? 0 : 1138 jiffies_to_msecs(s->stats->run.rs_request_delay / 1139 s->stats->ts_requested)); 1140 seq_printf(seq, " %ums running transaction\n", 1141 jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); 1142 seq_printf(seq, " %ums transaction was being locked\n", 1143 jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid)); 1144 seq_printf(seq, " %ums flushing data (in ordered mode)\n", 1145 jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid)); 1146 seq_printf(seq, " %ums logging transaction\n", 1147 jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid)); 1148 seq_printf(seq, " %lluus average transaction commit time\n", 1149 div_u64(s->journal->j_average_commit_time, 1000)); 1150 seq_printf(seq, " %lu handles per transaction\n", 1151 s->stats->run.rs_handle_count / s->stats->ts_tid); 1152 seq_printf(seq, " %lu blocks per transaction\n", 1153 s->stats->run.rs_blocks / s->stats->ts_tid); 1154 seq_printf(seq, " %lu logged blocks per transaction\n", 1155 s->stats->run.rs_blocks_logged / s->stats->ts_tid); 1156 return 0; 1157 } 1158 1159 static void jbd2_seq_info_stop(struct seq_file *seq, void *v) 1160 { 1161 } 1162 1163 static const struct seq_operations 
jbd2_seq_info_ops = { 1164 .start = jbd2_seq_info_start, 1165 .next = jbd2_seq_info_next, 1166 .stop = jbd2_seq_info_stop, 1167 .show = jbd2_seq_info_show, 1168 }; 1169 1170 static int jbd2_seq_info_open(struct inode *inode, struct file *file) 1171 { 1172 journal_t *journal = pde_data(inode); 1173 struct jbd2_stats_proc_session *s; 1174 int rc, size; 1175 1176 s = kmalloc_obj(*s); 1177 if (s == NULL) 1178 return -ENOMEM; 1179 size = sizeof(struct transaction_stats_s); 1180 s->stats = kmalloc(size, GFP_KERNEL); 1181 if (s->stats == NULL) { 1182 kfree(s); 1183 return -ENOMEM; 1184 } 1185 spin_lock(&journal->j_history_lock); 1186 memcpy(s->stats, &journal->j_stats, size); 1187 s->journal = journal; 1188 spin_unlock(&journal->j_history_lock); 1189 1190 rc = seq_open(file, &jbd2_seq_info_ops); 1191 if (rc == 0) { 1192 struct seq_file *m = file->private_data; 1193 m->private = s; 1194 } else { 1195 kfree(s->stats); 1196 kfree(s); 1197 } 1198 return rc; 1199 1200 } 1201 1202 static int jbd2_seq_info_release(struct inode *inode, struct file *file) 1203 { 1204 struct seq_file *seq = file->private_data; 1205 struct jbd2_stats_proc_session *s = seq->private; 1206 kfree(s->stats); 1207 kfree(s); 1208 return seq_release(inode, file); 1209 } 1210 1211 static const struct proc_ops jbd2_info_proc_ops = { 1212 .proc_open = jbd2_seq_info_open, 1213 .proc_read = seq_read, 1214 .proc_lseek = seq_lseek, 1215 .proc_release = jbd2_seq_info_release, 1216 }; 1217 1218 static struct proc_dir_entry *proc_jbd2_stats; 1219 1220 static void jbd2_stats_proc_init(journal_t *journal) 1221 { 1222 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); 1223 if (journal->j_proc_entry) { 1224 proc_create_data("info", S_IRUGO, journal->j_proc_entry, 1225 &jbd2_info_proc_ops, journal); 1226 } 1227 } 1228 1229 static void jbd2_stats_proc_exit(journal_t *journal) 1230 { 1231 remove_proc_entry("info", journal->j_proc_entry); 1232 remove_proc_entry(journal->j_devname, proc_jbd2_stats); 1233 
} 1234 1235 /* Minimum size of descriptor tag */ 1236 static int jbd2_min_tag_size(void) 1237 { 1238 /* 1239 * Tag with 32-bit block numbers does not use last four bytes of the 1240 * structure 1241 */ 1242 return sizeof(journal_block_tag_t) - 4; 1243 } 1244 1245 /** 1246 * jbd2_journal_shrink_scan() 1247 * @shrink: shrinker to work on 1248 * @sc: reclaim request to process 1249 * 1250 * Scan the checkpointed buffer on the checkpoint list and release the 1251 * journal_head. 1252 */ 1253 static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, 1254 struct shrink_control *sc) 1255 { 1256 journal_t *journal = shrink->private_data; 1257 unsigned long nr_to_scan = sc->nr_to_scan; 1258 unsigned long nr_shrunk; 1259 unsigned long count; 1260 1261 count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); 1262 trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); 1263 1264 nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); 1265 1266 count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); 1267 trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); 1268 1269 return nr_shrunk; 1270 } 1271 1272 /** 1273 * jbd2_journal_shrink_count() 1274 * @shrink: shrinker to work on 1275 * @sc: reclaim request to process 1276 * 1277 * Count the number of checkpoint buffers on the checkpoint list. 1278 */ 1279 static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, 1280 struct shrink_control *sc) 1281 { 1282 journal_t *journal = shrink->private_data; 1283 unsigned long count; 1284 1285 count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); 1286 trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); 1287 1288 return count; 1289 } 1290 1291 /* 1292 * If the journal init or create aborts, we need to mark the journal 1293 * superblock as being NULL to prevent the journal destroy from writing 1294 * back a bogus superblock. 
1295 */ 1296 static void journal_fail_superblock(journal_t *journal) 1297 { 1298 struct buffer_head *bh = journal->j_sb_buffer; 1299 brelse(bh); 1300 journal->j_sb_buffer = NULL; 1301 } 1302 1303 /* 1304 * Check the superblock for a given journal, performing initial 1305 * validation of the format. 1306 */ 1307 static int journal_check_superblock(journal_t *journal) 1308 { 1309 journal_superblock_t *sb = journal->j_superblock; 1310 int num_fc_blks; 1311 int err = -EINVAL; 1312 1313 if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || 1314 sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { 1315 printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); 1316 return err; 1317 } 1318 1319 if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && 1320 be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { 1321 printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); 1322 return err; 1323 } 1324 1325 if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { 1326 printk(KERN_WARNING "JBD2: journal file too short\n"); 1327 return err; 1328 } 1329 1330 if (be32_to_cpu(sb->s_first) == 0 || 1331 be32_to_cpu(sb->s_first) >= journal->j_total_len) { 1332 printk(KERN_WARNING 1333 "JBD2: Invalid start block of journal: %u\n", 1334 be32_to_cpu(sb->s_first)); 1335 return err; 1336 } 1337 1338 /* 1339 * If this is a V2 superblock, then we have to check the 1340 * features flags on it. 1341 */ 1342 if (!jbd2_format_support_feature(journal)) 1343 return 0; 1344 1345 if ((sb->s_feature_ro_compat & 1346 ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || 1347 (sb->s_feature_incompat & 1348 ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { 1349 printk(KERN_WARNING "JBD2: Unrecognised features on journal\n"); 1350 return err; 1351 } 1352 1353 num_fc_blks = jbd2_has_feature_fast_commit(journal) ? 
1354 jbd2_journal_get_num_fc_blks(sb) : 0; 1355 if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS || 1356 be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) { 1357 printk(KERN_ERR "JBD2: journal file too short %u,%d\n", 1358 be32_to_cpu(sb->s_maxlen), num_fc_blks); 1359 return err; 1360 } 1361 1362 if (jbd2_has_feature_csum2(journal) && 1363 jbd2_has_feature_csum3(journal)) { 1364 /* Can't have checksum v2 and v3 at the same time! */ 1365 printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " 1366 "at the same time!\n"); 1367 return err; 1368 } 1369 1370 if (jbd2_journal_has_csum_v2or3(journal) && 1371 jbd2_has_feature_checksum(journal)) { 1372 /* Can't have checksum v1 and v2 on at the same time! */ 1373 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " 1374 "at the same time!\n"); 1375 return err; 1376 } 1377 1378 if (jbd2_journal_has_csum_v2or3(journal)) { 1379 if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { 1380 printk(KERN_ERR "JBD2: Unknown checksum type\n"); 1381 return err; 1382 } 1383 1384 /* Check superblock checksum */ 1385 if (sb->s_checksum != jbd2_superblock_csum(sb)) { 1386 printk(KERN_ERR "JBD2: journal checksum error\n"); 1387 err = -EFSBADCRC; 1388 return err; 1389 } 1390 } 1391 1392 return 0; 1393 } 1394 1395 static int journal_revoke_records_per_block(journal_t *journal) 1396 { 1397 int record_size; 1398 int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); 1399 1400 if (jbd2_has_feature_64bit(journal)) 1401 record_size = 8; 1402 else 1403 record_size = 4; 1404 1405 if (jbd2_journal_has_csum_v2or3(journal)) 1406 space -= sizeof(struct jbd2_journal_block_tail); 1407 return space / record_size; 1408 } 1409 1410 static int jbd2_journal_get_max_txn_bufs(journal_t *journal) 1411 { 1412 return (journal->j_total_len - journal->j_fc_wbufsize) / 3; 1413 } 1414 1415 /* 1416 * Base amount of descriptor blocks we reserve for each transaction. 
1417 */ 1418 static int jbd2_descriptor_blocks_per_trans(journal_t *journal) 1419 { 1420 int tag_space = journal->j_blocksize - sizeof(journal_header_t); 1421 int tags_per_block; 1422 1423 /* Subtract UUID */ 1424 tag_space -= 16; 1425 if (jbd2_journal_has_csum_v2or3(journal)) 1426 tag_space -= sizeof(struct jbd2_journal_block_tail); 1427 /* Commit code leaves a slack space of 16 bytes at the end of block */ 1428 tags_per_block = (tag_space - 16) / journal_tag_bytes(journal); 1429 /* 1430 * Revoke descriptors are accounted separately so we need to reserve 1431 * space for commit block and normal transaction descriptor blocks. 1432 */ 1433 return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal), 1434 tags_per_block); 1435 } 1436 1437 /* 1438 * Initialize number of blocks each transaction reserves for its bookkeeping 1439 * and maximum number of blocks a transaction can use. This needs to be called 1440 * after the journal size and the fastcommit area size are initialized. 1441 */ 1442 static void jbd2_journal_init_transaction_limits(journal_t *journal) 1443 { 1444 journal->j_revoke_records_per_block = 1445 journal_revoke_records_per_block(journal); 1446 journal->j_transaction_overhead_buffers = 1447 jbd2_descriptor_blocks_per_trans(journal); 1448 journal->j_max_transaction_buffers = 1449 jbd2_journal_get_max_txn_bufs(journal); 1450 } 1451 1452 /* 1453 * Load the on-disk journal superblock and read the key fields into the 1454 * journal_t. 
1455 */ 1456 static int journal_load_superblock(journal_t *journal) 1457 { 1458 int err; 1459 struct buffer_head *bh; 1460 journal_superblock_t *sb; 1461 1462 bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset, 1463 journal->j_blocksize); 1464 if (bh) 1465 err = bh_read(bh, 0); 1466 if (!bh || err < 0) { 1467 pr_err("%s: Cannot read journal superblock\n", __func__); 1468 brelse(bh); 1469 return -EIO; 1470 } 1471 1472 journal->j_sb_buffer = bh; 1473 sb = (journal_superblock_t *)bh->b_data; 1474 journal->j_superblock = sb; 1475 err = journal_check_superblock(journal); 1476 if (err) { 1477 journal_fail_superblock(journal); 1478 return err; 1479 } 1480 1481 journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); 1482 journal->j_tail = be32_to_cpu(sb->s_start); 1483 journal->j_first = be32_to_cpu(sb->s_first); 1484 journal->j_errno = be32_to_cpu(sb->s_errno); 1485 journal->j_last = be32_to_cpu(sb->s_maxlen); 1486 1487 if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) 1488 journal->j_total_len = be32_to_cpu(sb->s_maxlen); 1489 /* Precompute checksum seed for all metadata */ 1490 if (jbd2_journal_has_csum_v2or3(journal)) 1491 journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, 1492 sizeof(sb->s_uuid)); 1493 /* After journal features are set, we can compute transaction limits */ 1494 jbd2_journal_init_transaction_limits(journal); 1495 1496 if (jbd2_has_feature_fast_commit(journal)) { 1497 journal->j_fc_last = be32_to_cpu(sb->s_maxlen); 1498 journal->j_last = journal->j_fc_last - 1499 jbd2_journal_get_num_fc_blks(sb); 1500 journal->j_fc_first = journal->j_last + 1; 1501 journal->j_fc_off = 0; 1502 } 1503 1504 return 0; 1505 } 1506 1507 1508 /* 1509 * Management for journal control blocks: functions to create and 1510 * destroy journal_t structures, and to initialise and read existing 1511 * journal blocks from disk. */ 1512 1513 /* The journal_init_common() function creates and fills a journal_t object 1514 * in memory. 
It calls journal_load_superblock() to load the on-disk journal
 * superblock and initialize the journal_t object.
 */

static journal_t *journal_init_common(struct block_device *bdev,
			struct block_device *fs_dev,
			unsigned long long start, int len, int blocksize)
{
	journal_t *jnl;
	int nr_wbufs;
	int err;

	jnl = kzalloc_obj(*jnl);
	if (jnl == NULL)
		return ERR_PTR(-ENOMEM);

	lockdep_register_key(&jnl->jbd2_trans_commit_key);

	/* Record where the journal lives before reading its superblock. */
	jnl->j_blocksize = blocksize;
	jnl->j_dev = bdev;
	jnl->j_fs_dev = fs_dev;
	jnl->j_blk_offset = start;
	jnl->j_total_len = len;
	jbd2_init_fs_dev_write_error(jnl);

	err = journal_load_superblock(jnl);
	if (err)
		goto err_cleanup;

	/* Wait queues */
	init_waitqueue_head(&jnl->j_wait_transaction_locked);
	init_waitqueue_head(&jnl->j_wait_done_commit);
	init_waitqueue_head(&jnl->j_wait_commit);
	init_waitqueue_head(&jnl->j_wait_updates);
	init_waitqueue_head(&jnl->j_wait_reserved);
	init_waitqueue_head(&jnl->j_fc_wait);

	/* Locks */
	mutex_init(&jnl->j_abort_mutex);
	mutex_init(&jnl->j_barrier);
	mutex_init(&jnl->j_checkpoint_mutex);
	spin_lock_init(&jnl->j_revoke_lock);
	spin_lock_init(&jnl->j_list_lock);
	spin_lock_init(&jnl->j_history_lock);
	rwlock_init(&jnl->j_state_lock);

	/* Default commit tunables */
	jnl->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
	jnl->j_min_batch_time = 0;
	jnl->j_max_batch_time = 15000; /* 15ms */
	atomic_set(&jnl->j_reserved_credits, 0);
	lockdep_init_map(&jnl->j_trans_commit_map, "jbd2_handle",
			 &jnl->jbd2_trans_commit_key, 0);

	/* The journal is marked for error until we succeed with recovery! */
	jnl->j_flags = JBD2_ABORT;

	/* Set up a default-sized revoke table for the new mount. */
	err = jbd2_journal_init_revoke(jnl, JOURNAL_REVOKE_DEFAULT_HASH);
	if (err)
		goto err_cleanup;

	/*
	 * journal descriptor can store up to n blocks, we need enough
	 * buffers to write out full descriptor block.
	 */
	err = -ENOMEM;
	nr_wbufs = jnl->j_blocksize / jbd2_min_tag_size();
	jnl->j_wbufsize = nr_wbufs;
	jnl->j_fc_wbuf = NULL;
	jnl->j_wbuf = kmalloc_objs(struct buffer_head *, nr_wbufs);
	if (jnl->j_wbuf == NULL)
		goto err_cleanup;

	err = percpu_counter_init(&jnl->j_checkpoint_jh_count, 0,
				  GFP_KERNEL);
	if (err)
		goto err_cleanup;

	jnl->j_shrink_transaction = NULL;

	/* Shrinker backed by the two jbd2_journal_shrink_*() callbacks. */
	jnl->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)",
					 MAJOR(bdev->bd_dev),
					 MINOR(bdev->bd_dev));
	if (jnl->j_shrinker == NULL) {
		err = -ENOMEM;
		goto err_cleanup;
	}

	jnl->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
	jnl->j_shrinker->count_objects = jbd2_journal_shrink_count;
	jnl->j_shrinker->private_data = jnl;

	shrinker_register(jnl->j_shrinker);

	return jnl;

err_cleanup:
	/* One unwind path serves every failure point above. */
	percpu_counter_destroy(&jnl->j_checkpoint_jh_count);
	kfree(jnl->j_wbuf);
	jbd2_journal_destroy_revoke(jnl);
	journal_fail_superblock(jnl);
	lockdep_unregister_key(&jnl->jbd2_trans_commit_key);
	kfree(jnl);
	return ERR_PTR(err);
}

/* jbd2_journal_init_dev and jbd2_journal_init_inode:
 *
 * Create a journal structure assigned some fixed set of disk blocks to
 * the journal.  We don't actually touch those disk blocks yet, but we
 * need to set up all of the mapping information to tell the journaling
 * system where the journal blocks are.
1622 * 1623 */ 1624 1625 /** 1626 * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure 1627 * @bdev: Block device on which to create the journal 1628 * @fs_dev: Device which hold journalled filesystem for this journal. 1629 * @start: Block nr Start of journal. 1630 * @len: Length of the journal in blocks. 1631 * @blocksize: blocksize of journalling device 1632 * 1633 * Returns: a newly created journal_t * 1634 * 1635 * jbd2_journal_init_dev creates a journal which maps a fixed contiguous 1636 * range of blocks on an arbitrary block device. 1637 * 1638 */ 1639 journal_t *jbd2_journal_init_dev(struct block_device *bdev, 1640 struct block_device *fs_dev, 1641 unsigned long long start, int len, int blocksize) 1642 { 1643 journal_t *journal; 1644 1645 journal = journal_init_common(bdev, fs_dev, start, len, blocksize); 1646 if (IS_ERR(journal)) 1647 return ERR_CAST(journal); 1648 1649 snprintf(journal->j_devname, sizeof(journal->j_devname), 1650 "%pg", journal->j_dev); 1651 strreplace(journal->j_devname, '/', '!'); 1652 jbd2_stats_proc_init(journal); 1653 1654 return journal; 1655 } 1656 1657 /** 1658 * journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode. 1659 * @inode: An inode to create the journal in 1660 * 1661 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as 1662 * the journal. The inode must exist already, must support bmap() and 1663 * must have all data blocks preallocated. 1664 */ 1665 journal_t *jbd2_journal_init_inode(struct inode *inode) 1666 { 1667 journal_t *journal; 1668 sector_t blocknr; 1669 int err = 0; 1670 1671 blocknr = 0; 1672 err = bmap(inode, &blocknr); 1673 if (err || !blocknr) { 1674 pr_err("%s: Cannot locate journal superblock\n", __func__); 1675 return err ? 
ERR_PTR(err) : ERR_PTR(-EINVAL); 1676 } 1677 1678 jbd2_debug(1, "JBD2: inode %s/%llu, size %lld, bits %d, blksize %ld\n", 1679 inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, 1680 inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); 1681 1682 journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, 1683 blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, 1684 inode->i_sb->s_blocksize); 1685 if (IS_ERR(journal)) 1686 return ERR_CAST(journal); 1687 1688 journal->j_inode = inode; 1689 snprintf(journal->j_devname, sizeof(journal->j_devname), 1690 "%pg-%llu", journal->j_dev, journal->j_inode->i_ino); 1691 strreplace(journal->j_devname, '/', '!'); 1692 jbd2_stats_proc_init(journal); 1693 1694 return journal; 1695 } 1696 1697 /* 1698 * Given a journal_t structure, initialise the various fields for 1699 * startup of a new journaling session. We use this both when creating 1700 * a journal, and after recovering an old journal to reset it for 1701 * subsequent use. 1702 */ 1703 1704 static int journal_reset(journal_t *journal) 1705 { 1706 journal_superblock_t *sb = journal->j_superblock; 1707 unsigned long long first, last; 1708 1709 first = be32_to_cpu(sb->s_first); 1710 last = be32_to_cpu(sb->s_maxlen); 1711 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { 1712 printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n", 1713 first, last); 1714 journal_fail_superblock(journal); 1715 return -EINVAL; 1716 } 1717 1718 journal->j_first = first; 1719 journal->j_last = last; 1720 1721 if (journal->j_head != 0 && journal->j_flags & JBD2_CYCLE_RECORD) { 1722 /* 1723 * Disable the cycled recording mode if the journal head block 1724 * number is not correct. 
1725 */ 1726 if (journal->j_head < first || journal->j_head >= last) { 1727 printk(KERN_WARNING "JBD2: Incorrect Journal head block %lu, " 1728 "disable journal_cycle_record\n", 1729 journal->j_head); 1730 journal->j_head = journal->j_first; 1731 } 1732 } else { 1733 journal->j_head = journal->j_first; 1734 } 1735 journal->j_tail = journal->j_head; 1736 journal->j_free = journal->j_last - journal->j_first; 1737 1738 journal->j_tail_sequence = journal->j_transaction_sequence; 1739 journal->j_commit_sequence = journal->j_transaction_sequence - 1; 1740 journal->j_commit_request = journal->j_commit_sequence; 1741 1742 /* 1743 * Now that journal recovery is done, turn fast commits off here. This 1744 * way, if fast commit was enabled before the crash but if now FS has 1745 * disabled it, we don't enable fast commits. 1746 */ 1747 jbd2_clear_feature_fast_commit(journal); 1748 1749 /* 1750 * As a special case, if the on-disk copy is already marked as needing 1751 * no recovery (s_start == 0), then we can safely defer the superblock 1752 * update until the next commit by setting JBD2_FLUSHED. This avoids 1753 * attempting a write to a potential-readonly device. 1754 */ 1755 if (sb->s_start == 0) { 1756 jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb " 1757 "(start %ld, seq %u, errno %d)\n", 1758 journal->j_tail, journal->j_tail_sequence, 1759 journal->j_errno); 1760 journal->j_flags |= JBD2_FLUSHED; 1761 } else { 1762 /* Lock here to make assertions happy... */ 1763 mutex_lock_io(&journal->j_checkpoint_mutex); 1764 /* 1765 * Update log tail information. We use REQ_FUA since new 1766 * transaction will start reusing journal space and so we 1767 * must make sure information about current log tail is on 1768 * disk before that. 
1769 */ 1770 jbd2_journal_update_sb_log_tail(journal, 1771 journal->j_tail_sequence, 1772 journal->j_tail, REQ_FUA); 1773 mutex_unlock(&journal->j_checkpoint_mutex); 1774 } 1775 return jbd2_journal_start_thread(journal); 1776 } 1777 1778 /* 1779 * This function expects that the caller will have locked the journal 1780 * buffer head, and will return with it unlocked 1781 */ 1782 static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) 1783 { 1784 struct buffer_head *bh = journal->j_sb_buffer; 1785 journal_superblock_t *sb = journal->j_superblock; 1786 int ret = 0; 1787 1788 /* Buffer got discarded which means block device got invalidated */ 1789 if (!buffer_mapped(bh)) { 1790 unlock_buffer(bh); 1791 return -EIO; 1792 } 1793 1794 /* 1795 * Always set high priority flags to exempt from block layer's 1796 * QOS policies, e.g. writeback throttle. 1797 */ 1798 write_flags |= JBD2_JOURNAL_REQ_FLAGS; 1799 if (!(journal->j_flags & JBD2_BARRIER)) 1800 write_flags &= ~(REQ_FUA | REQ_PREFLUSH); 1801 1802 trace_jbd2_write_superblock(journal, write_flags); 1803 1804 if (buffer_write_io_error(bh)) { 1805 /* 1806 * Oh, dear. A previous attempt to write the journal 1807 * superblock failed. This could happen because the 1808 * USB device was yanked out. Or it could happen to 1809 * be a transient write error and maybe the block will 1810 * be remapped. Nothing we can do but to retry the 1811 * write and hope for the best. 
1812 */ 1813 printk(KERN_ERR "JBD2: previous I/O error detected " 1814 "for journal superblock update for %s.\n", 1815 journal->j_devname); 1816 clear_buffer_write_io_error(bh); 1817 set_buffer_uptodate(bh); 1818 } 1819 if (jbd2_journal_has_csum_v2or3(journal)) 1820 sb->s_checksum = jbd2_superblock_csum(sb); 1821 bh_submit(bh, REQ_OP_WRITE | write_flags, bh_end_write); 1822 wait_on_buffer(bh); 1823 if (buffer_write_io_error(bh)) { 1824 clear_buffer_write_io_error(bh); 1825 set_buffer_uptodate(bh); 1826 ret = -EIO; 1827 } 1828 if (ret) { 1829 printk(KERN_ERR "JBD2: I/O error when updating journal superblock for %s.\n", 1830 journal->j_devname); 1831 if (!is_journal_aborted(journal)) 1832 jbd2_journal_abort(journal, ret); 1833 } 1834 1835 return ret; 1836 } 1837 1838 /** 1839 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk. 1840 * @journal: The journal to update. 1841 * @tail_tid: TID of the new transaction at the tail of the log 1842 * @tail_block: The first block of the transaction at the tail of the log 1843 * @write_flags: Flags for the journal sb write operation 1844 * 1845 * Update a journal's superblock information about log tail and write it to 1846 * disk, waiting for the IO to complete. 
1847 */ 1848 int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, 1849 unsigned long tail_block, 1850 blk_opf_t write_flags) 1851 { 1852 journal_superblock_t *sb = journal->j_superblock; 1853 int ret; 1854 1855 if (is_journal_aborted(journal)) 1856 return -EIO; 1857 ret = jbd2_check_fs_dev_write_error(journal); 1858 if (ret) { 1859 jbd2_journal_abort(journal, ret); 1860 return -EIO; 1861 } 1862 1863 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); 1864 jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", 1865 tail_block, tail_tid); 1866 1867 lock_buffer(journal->j_sb_buffer); 1868 sb->s_sequence = cpu_to_be32(tail_tid); 1869 sb->s_start = cpu_to_be32(tail_block); 1870 1871 ret = jbd2_write_superblock(journal, write_flags); 1872 if (ret) 1873 goto out; 1874 1875 /* Log is no longer empty */ 1876 write_lock(&journal->j_state_lock); 1877 journal->j_flags &= ~JBD2_FLUSHED; 1878 write_unlock(&journal->j_state_lock); 1879 1880 out: 1881 return ret; 1882 } 1883 1884 /** 1885 * jbd2_mark_journal_empty() - Mark on disk journal as empty. 1886 * @journal: The journal to update. 1887 * @write_flags: Flags for the journal sb write operation 1888 * 1889 * Update a journal's dynamic superblock fields to show that journal is empty. 1890 * Write updated superblock to disk waiting for IO to complete. 1891 */ 1892 static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags) 1893 { 1894 journal_superblock_t *sb = journal->j_superblock; 1895 bool had_fast_commit = false; 1896 1897 BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); 1898 lock_buffer(journal->j_sb_buffer); 1899 if (sb->s_start == 0) { /* Is it already empty? 
*/ 1900 unlock_buffer(journal->j_sb_buffer); 1901 return; 1902 } 1903 1904 jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n", 1905 journal->j_tail_sequence); 1906 1907 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 1908 sb->s_start = cpu_to_be32(0); 1909 sb->s_head = cpu_to_be32(journal->j_head); 1910 if (jbd2_has_feature_fast_commit(journal)) { 1911 /* 1912 * When journal is clean, no need to commit fast commit flag and 1913 * make file system incompatible with older kernels. 1914 */ 1915 jbd2_clear_feature_fast_commit(journal); 1916 had_fast_commit = true; 1917 } 1918 1919 jbd2_write_superblock(journal, write_flags); 1920 1921 if (had_fast_commit) 1922 jbd2_set_feature_fast_commit(journal); 1923 1924 /* Log is empty */ 1925 write_lock(&journal->j_state_lock); 1926 journal->j_flags |= JBD2_FLUSHED; 1927 write_unlock(&journal->j_state_lock); 1928 } 1929 1930 /** 1931 * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock) 1932 * @journal: The journal to erase. 1933 * @flags: A discard/zeroout request is sent for each physically contigous 1934 * region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or 1935 * JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation 1936 * to perform. 1937 * 1938 * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes 1939 * will be explicitly written if no hardware offload is available, see 1940 * blkdev_issue_zeroout for more details. 
1941 */ 1942 static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) 1943 { 1944 int err = 0; 1945 unsigned long block, log_offset; /* logical */ 1946 unsigned long long phys_block, block_start, block_stop; /* physical */ 1947 loff_t byte_start, byte_stop, byte_count; 1948 1949 /* flags must be set to either discard or zeroout */ 1950 if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || 1951 ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && 1952 (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) 1953 return -EINVAL; 1954 1955 if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && 1956 !bdev_max_discard_sectors(journal->j_dev)) 1957 return -EOPNOTSUPP; 1958 1959 /* 1960 * lookup block mapping and issue discard/zeroout for each 1961 * contiguous region 1962 */ 1963 log_offset = be32_to_cpu(journal->j_superblock->s_first); 1964 block_start = ~0ULL; 1965 for (block = log_offset; block < journal->j_total_len; block++) { 1966 err = jbd2_journal_bmap(journal, block, &phys_block); 1967 if (err) { 1968 pr_err("JBD2: bad block at offset %lu", block); 1969 return err; 1970 } 1971 1972 if (block_start == ~0ULL) 1973 block_stop = block_start = phys_block; 1974 1975 /* 1976 * last block not contiguous with current block, 1977 * process last contiguous region and return to this block on 1978 * next loop 1979 */ 1980 if (phys_block != block_stop) { 1981 block--; 1982 } else { 1983 block_stop++; 1984 /* 1985 * if this isn't the last block of journal, 1986 * no need to process now because next block may also 1987 * be part of this contiguous region 1988 */ 1989 if (block != journal->j_total_len - 1) 1990 continue; 1991 } 1992 1993 /* 1994 * end of contiguous region or this is last block of journal, 1995 * take care of the region 1996 */ 1997 byte_start = block_start * journal->j_blocksize; 1998 byte_stop = block_stop * journal->j_blocksize; 1999 byte_count = (block_stop - block_start) * journal->j_blocksize; 2000 2001 truncate_inode_pages_range(journal->j_dev->bd_mapping, 2002 byte_start, byte_stop 
- 1); 2003 2004 if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { 2005 err = blkdev_issue_discard(journal->j_dev, 2006 byte_start >> SECTOR_SHIFT, 2007 byte_count >> SECTOR_SHIFT, 2008 GFP_NOFS); 2009 } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { 2010 err = blkdev_issue_zeroout(journal->j_dev, 2011 byte_start >> SECTOR_SHIFT, 2012 byte_count >> SECTOR_SHIFT, 2013 GFP_NOFS, 0); 2014 } 2015 2016 if (unlikely(err != 0)) { 2017 pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)", 2018 err, block_start, block_stop); 2019 return err; 2020 } 2021 2022 /* reset start and stop after processing a region */ 2023 block_start = ~0ULL; 2024 } 2025 2026 return blkdev_issue_flush(journal->j_dev); 2027 } 2028 2029 /** 2030 * jbd2_journal_update_sb_errno() - Update error in the journal. 2031 * @journal: The journal to update. 2032 * 2033 * Update a journal's errno. Write updated superblock to disk waiting for IO 2034 * to complete. 2035 */ 2036 void jbd2_journal_update_sb_errno(journal_t *journal) 2037 { 2038 journal_superblock_t *sb = journal->j_superblock; 2039 int errcode; 2040 2041 lock_buffer(journal->j_sb_buffer); 2042 errcode = journal->j_errno; 2043 if (errcode == -ESHUTDOWN) 2044 errcode = 0; 2045 jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); 2046 sb->s_errno = cpu_to_be32(errcode); 2047 2048 jbd2_write_superblock(journal, REQ_FUA); 2049 } 2050 EXPORT_SYMBOL(jbd2_journal_update_sb_errno); 2051 2052 /** 2053 * jbd2_journal_load() - Read journal from disk. 2054 * @journal: Journal to act on. 2055 * 2056 * Given a journal_t structure which tells us which disk blocks contain 2057 * a journal, read the journal from disk to initialise the in-memory 2058 * structures. 2059 */ 2060 int jbd2_journal_load(journal_t *journal) 2061 { 2062 int err; 2063 2064 /* Let the recovery code check whether it needs to recover any 2065 * data from the journal. 
*/ 2066 err = jbd2_journal_recover(journal); 2067 if (err) { 2068 pr_warn("JBD2: journal recovery failed\n"); 2069 return err; 2070 } 2071 2072 if (journal->j_failed_commit) { 2073 printk(KERN_ERR "JBD2: journal transaction %u on %s " 2074 "is corrupt.\n", journal->j_failed_commit, 2075 journal->j_devname); 2076 return -EFSCORRUPTED; 2077 } 2078 /* 2079 * clear JBD2_ABORT flag initialized in journal_init_common 2080 * here to update log tail information with the newest seq. 2081 */ 2082 journal->j_flags &= ~JBD2_ABORT; 2083 2084 /* OK, we've finished with the dynamic journal bits: 2085 * reinitialise the dynamic contents of the superblock in memory 2086 * and reset them on disk. */ 2087 err = journal_reset(journal); 2088 if (err) { 2089 pr_warn("JBD2: journal reset failed\n"); 2090 return err; 2091 } 2092 2093 journal->j_flags |= JBD2_LOADED; 2094 return 0; 2095 } 2096 2097 /** 2098 * jbd2_journal_destroy() - Release a journal_t structure. 2099 * @journal: Journal to act on. 2100 * 2101 * Release a journal_t structure once it is no longer in use by the 2102 * journaled object. 2103 * Return <0 if we couldn't clean up the journal. 2104 */ 2105 int jbd2_journal_destroy(journal_t *journal) 2106 { 2107 int err = 0; 2108 2109 /* Wait for the commit thread to wake up and die. */ 2110 journal_kill_thread(journal); 2111 2112 /* Force a final log commit */ 2113 if (journal->j_running_transaction) 2114 jbd2_journal_commit_transaction(journal); 2115 2116 /* Force any old transactions to disk */ 2117 2118 /* Totally anal locking here... 
*/ 2119 spin_lock(&journal->j_list_lock); 2120 while (journal->j_checkpoint_transactions != NULL) { 2121 spin_unlock(&journal->j_list_lock); 2122 mutex_lock_io(&journal->j_checkpoint_mutex); 2123 err = jbd2_log_do_checkpoint(journal); 2124 mutex_unlock(&journal->j_checkpoint_mutex); 2125 /* 2126 * If checkpointing failed, just free the buffers to avoid 2127 * looping forever 2128 */ 2129 if (err) { 2130 jbd2_journal_destroy_checkpoint(journal); 2131 spin_lock(&journal->j_list_lock); 2132 break; 2133 } 2134 spin_lock(&journal->j_list_lock); 2135 } 2136 2137 J_ASSERT(journal->j_running_transaction == NULL); 2138 J_ASSERT(journal->j_committing_transaction == NULL); 2139 J_ASSERT(journal->j_checkpoint_transactions == NULL); 2140 spin_unlock(&journal->j_list_lock); 2141 2142 /* 2143 * OK, all checkpoint transactions have been checked, now check the 2144 * writeback errseq of fs dev and abort the journal if some buffer 2145 * failed to write back to the original location, otherwise the 2146 * filesystem may become inconsistent. 
2147 */ 2148 if (!is_journal_aborted(journal)) { 2149 int ret = jbd2_check_fs_dev_write_error(journal); 2150 if (ret) 2151 jbd2_journal_abort(journal, ret); 2152 } 2153 2154 if (journal->j_sb_buffer) { 2155 if (!is_journal_aborted(journal)) { 2156 mutex_lock_io(&journal->j_checkpoint_mutex); 2157 2158 write_lock(&journal->j_state_lock); 2159 journal->j_tail_sequence = 2160 ++journal->j_transaction_sequence; 2161 write_unlock(&journal->j_state_lock); 2162 2163 jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA); 2164 mutex_unlock(&journal->j_checkpoint_mutex); 2165 } else 2166 err = -EIO; 2167 brelse(journal->j_sb_buffer); 2168 } 2169 2170 if (journal->j_shrinker) { 2171 percpu_counter_destroy(&journal->j_checkpoint_jh_count); 2172 shrinker_free(journal->j_shrinker); 2173 } 2174 if (journal->j_proc_entry) 2175 jbd2_stats_proc_exit(journal); 2176 iput(journal->j_inode); 2177 if (journal->j_revoke) 2178 jbd2_journal_destroy_revoke(journal); 2179 kfree(journal->j_fc_wbuf); 2180 kfree(journal->j_wbuf); 2181 lockdep_unregister_key(&journal->jbd2_trans_commit_key); 2182 kfree(journal); 2183 2184 return err; 2185 } 2186 2187 2188 /** 2189 * jbd2_journal_check_used_features() - Check if features specified are used. 2190 * @journal: Journal to check. 2191 * @compat: bitmask of compatible features 2192 * @ro: bitmask of features that force read-only mount 2193 * @incompat: bitmask of incompatible features 2194 * 2195 * Check whether the journal uses all of a given set of 2196 * features. Return true (non-zero) if it does. 
2197 **/ 2198 2199 int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, 2200 unsigned long ro, unsigned long incompat) 2201 { 2202 journal_superblock_t *sb; 2203 2204 if (!compat && !ro && !incompat) 2205 return 1; 2206 if (!jbd2_format_support_feature(journal)) 2207 return 0; 2208 2209 sb = journal->j_superblock; 2210 2211 if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && 2212 ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && 2213 ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) 2214 return 1; 2215 2216 return 0; 2217 } 2218 2219 /** 2220 * jbd2_journal_check_available_features() - Check feature set in journalling layer 2221 * @journal: Journal to check. 2222 * @compat: bitmask of compatible features 2223 * @ro: bitmask of features that force read-only mount 2224 * @incompat: bitmask of incompatible features 2225 * 2226 * Check whether the journaling code supports the use of 2227 * all of a given set of features on this journal. Return true 2228 * (non-zero) if it can. */ 2229 2230 int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat, 2231 unsigned long ro, unsigned long incompat) 2232 { 2233 if (!compat && !ro && !incompat) 2234 return 1; 2235 2236 if (!jbd2_format_support_feature(journal)) 2237 return 0; 2238 2239 if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat && 2240 (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro && 2241 (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat) 2242 return 1; 2243 2244 return 0; 2245 } 2246 2247 static int 2248 jbd2_journal_initialize_fast_commit(journal_t *journal) 2249 { 2250 journal_superblock_t *sb = journal->j_superblock; 2251 unsigned long long num_fc_blks; 2252 2253 num_fc_blks = jbd2_journal_get_num_fc_blks(sb); 2254 if (num_fc_blks > journal->j_last) 2255 return -EFSCORRUPTED; 2256 if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) 2257 return -ENOSPC; 2258 2259 /* Are we called twice? 
*/ 2260 WARN_ON(journal->j_fc_wbuf != NULL); 2261 journal->j_fc_wbuf = kmalloc_objs(struct buffer_head *, num_fc_blks); 2262 if (!journal->j_fc_wbuf) 2263 return -ENOMEM; 2264 2265 journal->j_fc_wbufsize = num_fc_blks; 2266 journal->j_fc_last = journal->j_last; 2267 journal->j_last = journal->j_fc_last - num_fc_blks; 2268 journal->j_fc_first = journal->j_last + 1; 2269 journal->j_fc_off = 0; 2270 journal->j_free = journal->j_last - journal->j_first; 2271 2272 return 0; 2273 } 2274 2275 /** 2276 * jbd2_journal_set_features() - Mark a given journal feature in the superblock 2277 * @journal: Journal to act on. 2278 * @compat: bitmask of compatible features 2279 * @ro: bitmask of features that force read-only mount 2280 * @incompat: bitmask of incompatible features 2281 * 2282 * Mark a given journal feature as present on the 2283 * superblock. Returns true if the requested features could be set. 2284 * 2285 */ 2286 2287 int jbd2_journal_set_features(journal_t *journal, unsigned long compat, 2288 unsigned long ro, unsigned long incompat) 2289 { 2290 #define INCOMPAT_FEATURE_ON(f) \ 2291 ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f))) 2292 #define COMPAT_FEATURE_ON(f) \ 2293 ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f))) 2294 journal_superblock_t *sb; 2295 2296 if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) 2297 return 1; 2298 2299 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) 2300 return 0; 2301 2302 /* If enabling v2 checksums, turn on v3 instead */ 2303 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) { 2304 incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2; 2305 incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3; 2306 } 2307 2308 /* Asking for checksumming v3 and v1? Only give them v3. 
*/ 2309 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 && 2310 compat & JBD2_FEATURE_COMPAT_CHECKSUM) 2311 compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; 2312 2313 jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", 2314 compat, ro, incompat); 2315 2316 sb = journal->j_superblock; 2317 2318 if (incompat & JBD2_FEATURE_INCOMPAT_FAST_COMMIT) { 2319 if (jbd2_journal_initialize_fast_commit(journal)) { 2320 pr_err("JBD2: Cannot enable fast commits.\n"); 2321 return 0; 2322 } 2323 } 2324 2325 lock_buffer(journal->j_sb_buffer); 2326 2327 /* If enabling v3 checksums, update superblock and precompute seed */ 2328 if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { 2329 sb->s_checksum_type = JBD2_CRC32C_CHKSUM; 2330 sb->s_feature_compat &= 2331 ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); 2332 journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, 2333 sizeof(sb->s_uuid)); 2334 } 2335 2336 /* If enabling v1 checksums, downgrade superblock */ 2337 if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) 2338 sb->s_feature_incompat &= 2339 ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 | 2340 JBD2_FEATURE_INCOMPAT_CSUM_V3); 2341 2342 sb->s_feature_compat |= cpu_to_be32(compat); 2343 sb->s_feature_ro_compat |= cpu_to_be32(ro); 2344 sb->s_feature_incompat |= cpu_to_be32(incompat); 2345 /* 2346 * Update the checksum now so that it is valid even for read-only 2347 * filesystems where jbd2_write_superblock() doesn't get called. 2348 */ 2349 if (jbd2_journal_has_csum_v2or3(journal)) 2350 sb->s_checksum = jbd2_superblock_csum(sb); 2351 unlock_buffer(journal->j_sb_buffer); 2352 jbd2_journal_init_transaction_limits(journal); 2353 2354 return 1; 2355 #undef COMPAT_FEATURE_ON 2356 #undef INCOMPAT_FEATURE_ON 2357 } 2358 2359 /* 2360 * jbd2_journal_clear_features() - Clear a given journal feature in the 2361 * superblock 2362 * @journal: Journal to act on. 
2363 * @compat: bitmask of compatible features 2364 * @ro: bitmask of features that force read-only mount 2365 * @incompat: bitmask of incompatible features 2366 * 2367 * Clear a given journal feature as present on the 2368 * superblock. 2369 */ 2370 void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, 2371 unsigned long ro, unsigned long incompat) 2372 { 2373 journal_superblock_t *sb; 2374 2375 jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", 2376 compat, ro, incompat); 2377 2378 sb = journal->j_superblock; 2379 2380 lock_buffer(journal->j_sb_buffer); 2381 sb->s_feature_compat &= ~cpu_to_be32(compat); 2382 sb->s_feature_ro_compat &= ~cpu_to_be32(ro); 2383 sb->s_feature_incompat &= ~cpu_to_be32(incompat); 2384 /* 2385 * Update the checksum now so that it is valid even for read-only 2386 * filesystems where jbd2_write_superblock() doesn't get called. 2387 */ 2388 if (jbd2_journal_has_csum_v2or3(journal)) 2389 sb->s_checksum = jbd2_superblock_csum(sb); 2390 unlock_buffer(journal->j_sb_buffer); 2391 jbd2_journal_init_transaction_limits(journal); 2392 } 2393 EXPORT_SYMBOL(jbd2_journal_clear_features); 2394 2395 /** 2396 * jbd2_journal_flush() - Flush journal 2397 * @journal: Journal to act on. 2398 * @flags: optional operation on the journal blocks after the flush (see below) 2399 * 2400 * Flush all data for a given journal to disk and empty the journal. 2401 * Filesystems can use this when remounting readonly to ensure that 2402 * recovery does not need to happen on remount. Optionally, a discard or zeroout 2403 * can be issued on the journal blocks after flushing. 
2404 * 2405 * flags: 2406 * JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks 2407 * JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks 2408 */ 2409 int jbd2_journal_flush(journal_t *journal, unsigned int flags) 2410 { 2411 int err = 0; 2412 transaction_t *transaction = NULL; 2413 2414 write_lock(&journal->j_state_lock); 2415 2416 /* Force everything buffered to the log... */ 2417 if (journal->j_running_transaction) { 2418 transaction = journal->j_running_transaction; 2419 __jbd2_log_start_commit(journal, transaction->t_tid); 2420 } else if (journal->j_committing_transaction) 2421 transaction = journal->j_committing_transaction; 2422 2423 /* Wait for the log commit to complete... */ 2424 if (transaction) { 2425 tid_t tid = transaction->t_tid; 2426 2427 write_unlock(&journal->j_state_lock); 2428 jbd2_log_wait_commit(journal, tid); 2429 } else { 2430 write_unlock(&journal->j_state_lock); 2431 } 2432 2433 /* ...and flush everything in the log out to disk. */ 2434 spin_lock(&journal->j_list_lock); 2435 while (!err && journal->j_checkpoint_transactions != NULL) { 2436 spin_unlock(&journal->j_list_lock); 2437 mutex_lock_io(&journal->j_checkpoint_mutex); 2438 err = jbd2_log_do_checkpoint(journal); 2439 mutex_unlock(&journal->j_checkpoint_mutex); 2440 spin_lock(&journal->j_list_lock); 2441 } 2442 spin_unlock(&journal->j_list_lock); 2443 2444 if (is_journal_aborted(journal)) 2445 return -EIO; 2446 2447 mutex_lock_io(&journal->j_checkpoint_mutex); 2448 if (!err) { 2449 err = jbd2_cleanup_journal_tail(journal); 2450 if (err < 0) { 2451 mutex_unlock(&journal->j_checkpoint_mutex); 2452 goto out; 2453 } 2454 err = 0; 2455 } 2456 2457 /* Finally, mark the journal as really needing no recovery. 2458 * This sets s_start==0 in the underlying superblock, which is 2459 * the magic code for a fully-recovered superblock. Any future 2460 * commits of data to the journal will restore the current 2461 * s_start value. 
*/ 2462 jbd2_mark_journal_empty(journal, REQ_FUA); 2463 2464 if (flags) 2465 err = __jbd2_journal_erase(journal, flags); 2466 2467 mutex_unlock(&journal->j_checkpoint_mutex); 2468 write_lock(&journal->j_state_lock); 2469 J_ASSERT(!journal->j_running_transaction); 2470 J_ASSERT(!journal->j_committing_transaction); 2471 J_ASSERT(!journal->j_checkpoint_transactions); 2472 J_ASSERT(journal->j_head == journal->j_tail); 2473 J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); 2474 write_unlock(&journal->j_state_lock); 2475 out: 2476 return err; 2477 } 2478 2479 /** 2480 * jbd2_journal_wipe() - Wipe journal contents 2481 * @journal: Journal to act on. 2482 * @write: flag (see below) 2483 * 2484 * Wipe out all of the contents of a journal, safely. This will produce 2485 * a warning if the journal contains any valid recovery information. 2486 * Must be called between journal_init_*() and jbd2_journal_load(). 2487 * 2488 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise 2489 * we merely suppress recovery. 2490 */ 2491 2492 int jbd2_journal_wipe(journal_t *journal, int write) 2493 { 2494 int err; 2495 2496 J_ASSERT (!(journal->j_flags & JBD2_LOADED)); 2497 2498 if (!journal->j_tail) 2499 return 0; 2500 2501 printk(KERN_WARNING "JBD2: %s recovery information on journal\n", 2502 write ? "Clearing" : "Ignoring"); 2503 2504 err = jbd2_journal_skip_recovery(journal); 2505 if (write) { 2506 /* Lock to make assertions happy... */ 2507 mutex_lock_io(&journal->j_checkpoint_mutex); 2508 jbd2_mark_journal_empty(journal, REQ_FUA); 2509 mutex_unlock(&journal->j_checkpoint_mutex); 2510 } 2511 2512 return err; 2513 } 2514 2515 /** 2516 * jbd2_journal_abort () - Shutdown the journal immediately. 2517 * @journal: the journal to shutdown. 2518 * @errno: an error number to record in the journal indicating 2519 * the reason for the shutdown. 2520 * 2521 * Perform a complete, immediate shutdown of the ENTIRE 2522 * journal (not of a single transaction). 
This operation cannot be 2523 * undone without closing and reopening the journal. 2524 * 2525 * The jbd2_journal_abort function is intended to support higher level error 2526 * recovery mechanisms such as the ext2/ext3 remount-readonly error 2527 * mode. 2528 * 2529 * Journal abort has very specific semantics. Any existing dirty, 2530 * unjournaled buffers in the main filesystem will still be written to 2531 * disk by bdflush, but the journaling mechanism will be suspended 2532 * immediately and no further transaction commits will be honoured. 2533 * 2534 * Any dirty, journaled buffers will be written back to disk without 2535 * hitting the journal. Atomicity cannot be guaranteed on an aborted 2536 * filesystem, but we _do_ attempt to leave as much data as possible 2537 * behind for fsck to use for cleanup. 2538 * 2539 * Any attempt to get a new transaction handle on a journal which is in 2540 * ABORT state will just result in an -EROFS error return. A 2541 * jbd2_journal_stop on an existing handle will return -EIO if we have 2542 * entered abort state during the update. 2543 * 2544 * Recursive transactions are not disturbed by journal abort until the 2545 * final jbd2_journal_stop, which will receive the -EIO error. 2546 * 2547 * Finally, the jbd2_journal_abort call allows the caller to supply an errno 2548 * which will be recorded (if possible) in the journal superblock. This 2549 * allows a client to record failure conditions in the middle of a 2550 * transaction without having to complete the transaction to record the 2551 * failure to disk. ext3_error, for example, now uses this 2552 * functionality. 2553 * 2554 */ 2555 2556 void jbd2_journal_abort(journal_t *journal, int errno) 2557 { 2558 transaction_t *transaction; 2559 2560 /* 2561 * Lock the aborting procedure until everything is done, this avoid 2562 * races between filesystem's error handling flow (e.g. ext4_abort()), 2563 * ensure panic after the error info is written into journal's 2564 * superblock. 
2565 */ 2566 mutex_lock(&journal->j_abort_mutex); 2567 /* 2568 * ESHUTDOWN always takes precedence because a file system check 2569 * caused by any other journal abort error is not required after 2570 * a shutdown triggered. 2571 */ 2572 write_lock(&journal->j_state_lock); 2573 if (journal->j_flags & JBD2_ABORT) { 2574 int old_errno = journal->j_errno; 2575 2576 write_unlock(&journal->j_state_lock); 2577 if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) { 2578 journal->j_errno = errno; 2579 jbd2_journal_update_sb_errno(journal); 2580 } 2581 mutex_unlock(&journal->j_abort_mutex); 2582 return; 2583 } 2584 2585 /* 2586 * Mark the abort as occurred and start current running transaction 2587 * to release all journaled buffer. 2588 */ 2589 pr_err("Aborting journal on device %s.\n", journal->j_devname); 2590 2591 journal->j_flags |= JBD2_ABORT; 2592 journal->j_errno = errno; 2593 transaction = journal->j_running_transaction; 2594 if (transaction) 2595 __jbd2_log_start_commit(journal, transaction->t_tid); 2596 write_unlock(&journal->j_state_lock); 2597 2598 /* 2599 * Record errno to the journal super block, so that fsck and jbd2 2600 * layer could realise that a filesystem check is needed. 2601 */ 2602 jbd2_journal_update_sb_errno(journal); 2603 mutex_unlock(&journal->j_abort_mutex); 2604 } 2605 2606 /** 2607 * jbd2_journal_errno() - returns the journal's error state. 2608 * @journal: journal to examine. 2609 * 2610 * This is the errno number set with jbd2_journal_abort(), the last 2611 * time the journal was mounted - if the journal was stopped 2612 * without calling abort this will be 0. 2613 * 2614 * If the journal has been aborted on this mount time -EROFS will 2615 * be returned. 
2616 */ 2617 int jbd2_journal_errno(journal_t *journal) 2618 { 2619 int err; 2620 2621 read_lock(&journal->j_state_lock); 2622 if (journal->j_flags & JBD2_ABORT) 2623 err = -EROFS; 2624 else 2625 err = journal->j_errno; 2626 read_unlock(&journal->j_state_lock); 2627 return err; 2628 } 2629 2630 /** 2631 * jbd2_journal_clear_err() - clears the journal's error state 2632 * @journal: journal to act on. 2633 * 2634 * An error must be cleared or acked to take a FS out of readonly 2635 * mode. 2636 */ 2637 int jbd2_journal_clear_err(journal_t *journal) 2638 { 2639 int err = 0; 2640 2641 write_lock(&journal->j_state_lock); 2642 if (journal->j_flags & JBD2_ABORT) 2643 err = -EROFS; 2644 else 2645 journal->j_errno = 0; 2646 write_unlock(&journal->j_state_lock); 2647 return err; 2648 } 2649 2650 /** 2651 * jbd2_journal_ack_err() - Ack journal err. 2652 * @journal: journal to act on. 2653 * 2654 * An error must be cleared or acked to take a FS out of readonly 2655 * mode. 2656 */ 2657 void jbd2_journal_ack_err(journal_t *journal) 2658 { 2659 write_lock(&journal->j_state_lock); 2660 if (journal->j_errno) 2661 journal->j_flags |= JBD2_ACK_ERR; 2662 write_unlock(&journal->j_state_lock); 2663 } 2664 2665 int jbd2_journal_blocks_per_folio(struct inode *inode) 2666 { 2667 return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) - 2668 inode->i_sb->s_blocksize_bits); 2669 } 2670 2671 /* 2672 * helper functions to deal with 32 or 64bit block numbers. 
2673 */ 2674 size_t journal_tag_bytes(journal_t *journal) 2675 { 2676 size_t sz; 2677 2678 if (jbd2_has_feature_csum3(journal)) 2679 return sizeof(journal_block_tag3_t); 2680 2681 sz = sizeof(journal_block_tag_t); 2682 2683 if (jbd2_has_feature_csum2(journal)) 2684 sz += sizeof(__u16); 2685 2686 if (jbd2_has_feature_64bit(journal)) 2687 return sz; 2688 else 2689 return sz - sizeof(__u32); 2690 } 2691 2692 /* 2693 * Journal_head storage management 2694 */ 2695 static struct kmem_cache *jbd2_journal_head_cache; 2696 #ifdef CONFIG_JBD2_DEBUG 2697 static atomic_t nr_journal_heads = ATOMIC_INIT(0); 2698 #endif 2699 2700 static int __init jbd2_journal_init_journal_head_cache(void) 2701 { 2702 J_ASSERT(!jbd2_journal_head_cache); 2703 jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", 2704 sizeof(struct journal_head), 2705 0, /* offset */ 2706 SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, 2707 NULL); /* ctor */ 2708 if (!jbd2_journal_head_cache) { 2709 printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); 2710 return -ENOMEM; 2711 } 2712 return 0; 2713 } 2714 2715 static void jbd2_journal_destroy_journal_head_cache(void) 2716 { 2717 kmem_cache_destroy(jbd2_journal_head_cache); 2718 jbd2_journal_head_cache = NULL; 2719 } 2720 2721 /* 2722 * journal_head splicing and dicing 2723 */ 2724 static struct journal_head *journal_alloc_journal_head(void) 2725 { 2726 struct journal_head *ret; 2727 2728 #ifdef CONFIG_JBD2_DEBUG 2729 atomic_inc(&nr_journal_heads); 2730 #endif 2731 ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); 2732 if (!ret) { 2733 jbd2_debug(1, "out of memory for journal_head\n"); 2734 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); 2735 ret = kmem_cache_zalloc(jbd2_journal_head_cache, 2736 GFP_NOFS | __GFP_NOFAIL); 2737 } 2738 spin_lock_init(&ret->b_state_lock); 2739 return ret; 2740 } 2741 2742 static void journal_free_journal_head(struct journal_head *jh) 2743 { 2744 #ifdef CONFIG_JBD2_DEBUG 2745 
atomic_dec(&nr_journal_heads); 2746 memset(jh, JBD2_POISON_FREE, sizeof(*jh)); 2747 #endif 2748 kmem_cache_free(jbd2_journal_head_cache, jh); 2749 } 2750 2751 /* 2752 * A journal_head is attached to a buffer_head whenever JBD has an 2753 * interest in the buffer. 2754 * 2755 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit 2756 * is set. This bit is tested in core kernel code where we need to take 2757 * JBD-specific actions. Testing the zeroness of ->b_private is not reliable 2758 * there. 2759 * 2760 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. 2761 * 2762 * When a buffer has its BH_JBD bit set it is immune from being released by 2763 * core kernel code, mainly via ->b_count. 2764 * 2765 * A journal_head is detached from its buffer_head when the journal_head's 2766 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint 2767 * transaction (b_cp_transaction) hold their references to b_jcount. 2768 * 2769 * Various places in the kernel want to attach a journal_head to a buffer_head 2770 * _before_ attaching the journal_head to a transaction. To protect the 2771 * journal_head in this situation, jbd2_journal_add_journal_head elevates the 2772 * journal_head's b_jcount refcount by one. The caller must call 2773 * jbd2_journal_put_journal_head() to undo this. 2774 * 2775 * So the typical usage would be: 2776 * 2777 * (Attach a journal_head if needed. Increments b_jcount) 2778 * struct journal_head *jh = jbd2_journal_add_journal_head(bh); 2779 * ... 2780 * (Get another reference for transaction) 2781 * jbd2_journal_grab_journal_head(bh); 2782 * jh->b_transaction = xxx; 2783 * (Put original reference) 2784 * jbd2_journal_put_journal_head(jh); 2785 */ 2786 2787 /* 2788 * Give a buffer_head a journal_head. 2789 * 2790 * May sleep. 
2791 */ 2792 struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) 2793 { 2794 struct journal_head *jh; 2795 struct journal_head *new_jh = NULL; 2796 2797 repeat: 2798 if (!buffer_jbd(bh)) 2799 new_jh = journal_alloc_journal_head(); 2800 2801 jbd_lock_bh_journal_head(bh); 2802 if (buffer_jbd(bh)) { 2803 jh = bh2jh(bh); 2804 } else { 2805 J_ASSERT_BH(bh, 2806 (atomic_read(&bh->b_count) > 0) || 2807 (bh->b_folio && bh->b_folio->mapping)); 2808 2809 if (!new_jh) { 2810 jbd_unlock_bh_journal_head(bh); 2811 goto repeat; 2812 } 2813 2814 jh = new_jh; 2815 new_jh = NULL; /* We consumed it */ 2816 set_buffer_jbd(bh); 2817 bh->b_private = jh; 2818 jh->b_bh = bh; 2819 get_bh(bh); 2820 BUFFER_TRACE(bh, "added journal_head"); 2821 } 2822 jh->b_jcount++; 2823 jbd_unlock_bh_journal_head(bh); 2824 if (new_jh) 2825 journal_free_journal_head(new_jh); 2826 return bh->b_private; 2827 } 2828 2829 /* 2830 * Grab a ref against this buffer_head's journal_head. If it ended up not 2831 * having a journal_head, return NULL 2832 */ 2833 struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh) 2834 { 2835 struct journal_head *jh = NULL; 2836 2837 jbd_lock_bh_journal_head(bh); 2838 if (buffer_jbd(bh)) { 2839 jh = bh2jh(bh); 2840 jh->b_jcount++; 2841 } 2842 jbd_unlock_bh_journal_head(bh); 2843 return jh; 2844 } 2845 EXPORT_SYMBOL(jbd2_journal_grab_journal_head); 2846 2847 static void __journal_remove_journal_head(struct buffer_head *bh) 2848 { 2849 struct journal_head *jh = bh2jh(bh); 2850 2851 J_ASSERT_JH(jh, jh->b_transaction == NULL); 2852 J_ASSERT_JH(jh, jh->b_next_transaction == NULL); 2853 J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); 2854 J_ASSERT_JH(jh, jh->b_jlist == BJ_None); 2855 J_ASSERT_BH(bh, buffer_jbd(bh)); 2856 J_ASSERT_BH(bh, jh2bh(jh) == bh); 2857 BUFFER_TRACE(bh, "remove journal_head"); 2858 2859 /* Unlink before dropping the lock */ 2860 bh->b_private = NULL; 2861 jh->b_bh = NULL; /* debug, really */ 2862 clear_buffer_jbd(bh); 2863 } 
2864 2865 static void journal_release_journal_head(struct journal_head *jh) 2866 { 2867 if (jh->b_frozen_data) { 2868 printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); 2869 kfree(jh->b_frozen_data); 2870 } 2871 if (jh->b_committed_data) { 2872 printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); 2873 kfree(jh->b_committed_data); 2874 } 2875 journal_free_journal_head(jh); 2876 } 2877 2878 /* 2879 * Drop a reference on the passed journal_head. If it fell to zero then 2880 * release the journal_head from the buffer_head. 2881 */ 2882 void jbd2_journal_put_journal_head(struct journal_head *jh) 2883 { 2884 struct buffer_head *bh = jh2bh(jh); 2885 2886 jbd_lock_bh_journal_head(bh); 2887 J_ASSERT_JH(jh, jh->b_jcount > 0); 2888 --jh->b_jcount; 2889 if (!jh->b_jcount) { 2890 __journal_remove_journal_head(bh); 2891 jbd_unlock_bh_journal_head(bh); 2892 journal_release_journal_head(jh); 2893 __brelse(bh); 2894 } else { 2895 jbd_unlock_bh_journal_head(bh); 2896 } 2897 } 2898 EXPORT_SYMBOL(jbd2_journal_put_journal_head); 2899 2900 /* 2901 * Initialize jbd inode head 2902 */ 2903 void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) 2904 { 2905 jinode->i_transaction = NULL; 2906 jinode->i_next_transaction = NULL; 2907 jinode->i_vfs_inode = inode; 2908 jinode->i_flags = 0; 2909 jinode->i_dirty_start_page = 0; 2910 jinode->i_dirty_end_page = 0; 2911 INIT_LIST_HEAD(&jinode->i_list); 2912 } 2913 2914 /* 2915 * Function to be called before we start removing inode from memory (i.e., 2916 * clear_inode() is a fine place to be called from). It removes inode from 2917 * transaction's lists. 
2918 */ 2919 void jbd2_journal_release_jbd_inode(journal_t *journal, 2920 struct jbd2_inode *jinode) 2921 { 2922 if (!journal) 2923 return; 2924 restart: 2925 spin_lock(&journal->j_list_lock); 2926 /* Is commit writing out inode - we have to wait */ 2927 if (jinode->i_flags & JI_COMMIT_RUNNING) { 2928 wait_queue_head_t *wq; 2929 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); 2930 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); 2931 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 2932 spin_unlock(&journal->j_list_lock); 2933 schedule(); 2934 finish_wait(wq, &wait.wq_entry); 2935 goto restart; 2936 } 2937 2938 if (jinode->i_transaction) { 2939 list_del(&jinode->i_list); 2940 jinode->i_transaction = NULL; 2941 } 2942 spin_unlock(&journal->j_list_lock); 2943 } 2944 2945 2946 #ifdef CONFIG_PROC_FS 2947 2948 #define JBD2_STATS_PROC_NAME "fs/jbd2" 2949 2950 static void __init jbd2_create_jbd_stats_proc_entry(void) 2951 { 2952 proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); 2953 } 2954 2955 static void __exit jbd2_remove_jbd_stats_proc_entry(void) 2956 { 2957 if (proc_jbd2_stats) 2958 remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); 2959 } 2960 2961 #else 2962 2963 #define jbd2_create_jbd_stats_proc_entry() do {} while (0) 2964 #define jbd2_remove_jbd_stats_proc_entry() do {} while (0) 2965 2966 #endif 2967 2968 struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; 2969 2970 static int __init jbd2_journal_init_inode_cache(void) 2971 { 2972 J_ASSERT(!jbd2_inode_cache); 2973 jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); 2974 if (!jbd2_inode_cache) { 2975 pr_emerg("JBD2: failed to create inode cache\n"); 2976 return -ENOMEM; 2977 } 2978 return 0; 2979 } 2980 2981 static int __init jbd2_journal_init_handle_cache(void) 2982 { 2983 J_ASSERT(!jbd2_handle_cache); 2984 jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); 2985 if (!jbd2_handle_cache) { 2986 printk(KERN_EMERG "JBD2: failed to create handle cache\n"); 
2987 return -ENOMEM; 2988 } 2989 return 0; 2990 } 2991 2992 static void jbd2_journal_destroy_inode_cache(void) 2993 { 2994 kmem_cache_destroy(jbd2_inode_cache); 2995 jbd2_inode_cache = NULL; 2996 } 2997 2998 static void jbd2_journal_destroy_handle_cache(void) 2999 { 3000 kmem_cache_destroy(jbd2_handle_cache); 3001 jbd2_handle_cache = NULL; 3002 } 3003 3004 /* 3005 * Module startup and shutdown 3006 */ 3007 3008 static int __init journal_init_caches(void) 3009 { 3010 int ret; 3011 3012 ret = jbd2_journal_init_revoke_record_cache(); 3013 if (ret == 0) 3014 ret = jbd2_journal_init_revoke_table_cache(); 3015 if (ret == 0) 3016 ret = jbd2_journal_init_journal_head_cache(); 3017 if (ret == 0) 3018 ret = jbd2_journal_init_handle_cache(); 3019 if (ret == 0) 3020 ret = jbd2_journal_init_inode_cache(); 3021 if (ret == 0) 3022 ret = jbd2_journal_init_transaction_cache(); 3023 return ret; 3024 } 3025 3026 static void jbd2_journal_destroy_caches(void) 3027 { 3028 jbd2_journal_destroy_revoke_record_cache(); 3029 jbd2_journal_destroy_revoke_table_cache(); 3030 jbd2_journal_destroy_journal_head_cache(); 3031 jbd2_journal_destroy_handle_cache(); 3032 jbd2_journal_destroy_inode_cache(); 3033 jbd2_journal_destroy_transaction_cache(); 3034 } 3035 3036 static int __init journal_init(void) 3037 { 3038 int ret; 3039 3040 BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); 3041 3042 ret = journal_init_caches(); 3043 if (ret == 0) { 3044 jbd2_create_jbd_stats_proc_entry(); 3045 } else { 3046 jbd2_journal_destroy_caches(); 3047 } 3048 return ret; 3049 } 3050 3051 static void __exit journal_exit(void) 3052 { 3053 #ifdef CONFIG_JBD2_DEBUG 3054 int n = atomic_read(&nr_journal_heads); 3055 if (n) 3056 printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n); 3057 #endif 3058 jbd2_remove_jbd_stats_proc_entry(); 3059 jbd2_journal_destroy_caches(); 3060 } 3061 3062 MODULE_DESCRIPTION("Generic filesystem journal-writing module"); 3063 MODULE_LICENSE("GPL"); 3064 module_init(journal_init); 3065 
module_exit(journal_exit); 3066