1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_errortag.h" 14 #include "xfs_error.h" 15 #include "xfs_trans.h" 16 #include "xfs_trans_priv.h" 17 #include "xfs_log.h" 18 #include "xfs_log_priv.h" 19 #include "xfs_trace.h" 20 #include "xfs_sysfs.h" 21 #include "xfs_sb.h" 22 #include "xfs_health.h" 23 24 kmem_zone_t *xfs_log_ticket_zone; 25 26 /* Local miscellaneous function prototypes */ 27 STATIC int 28 xlog_commit_record( 29 struct xlog *log, 30 struct xlog_ticket *ticket, 31 struct xlog_in_core **iclog, 32 xfs_lsn_t *commitlsnp); 33 34 STATIC struct xlog * 35 xlog_alloc_log( 36 struct xfs_mount *mp, 37 struct xfs_buftarg *log_target, 38 xfs_daddr_t blk_offset, 39 int num_bblks); 40 STATIC int 41 xlog_space_left( 42 struct xlog *log, 43 atomic64_t *head); 44 STATIC void 45 xlog_dealloc_log( 46 struct xlog *log); 47 48 /* local state machine functions */ 49 STATIC void xlog_state_done_syncing( 50 struct xlog_in_core *iclog); 51 STATIC int 52 xlog_state_get_iclog_space( 53 struct xlog *log, 54 int len, 55 struct xlog_in_core **iclog, 56 struct xlog_ticket *ticket, 57 int *continued_write, 58 int *logoffsetp); 59 STATIC void 60 xlog_state_switch_iclogs( 61 struct xlog *log, 62 struct xlog_in_core *iclog, 63 int eventual_size); 64 STATIC void 65 xlog_grant_push_ail( 66 struct xlog *log, 67 int need_bytes); 68 STATIC void 69 xlog_regrant_reserve_log_space( 70 struct xlog *log, 71 struct xlog_ticket *ticket); 72 STATIC void 73 xlog_ungrant_log_space( 74 struct xlog *log, 75 struct xlog_ticket *ticket); 76 STATIC void 77 xlog_sync( 78 struct xlog *log, 79 struct xlog_in_core *iclog); 80 #if defined(DEBUG) 81 STATIC void 82 xlog_verify_dest_ptr( 83 struct xlog *log, 84 void *ptr); 85 STATIC void 86 xlog_verify_grant_tail( 87 struct xlog *log); 88 STATIC void 89 xlog_verify_iclog( 90 struct xlog *log, 91 struct xlog_in_core *iclog, 92 int count); 93 STATIC void 94 xlog_verify_tail_lsn( 95 struct xlog *log, 96 struct xlog_in_core *iclog, 97 xfs_lsn_t tail_lsn); 98 #else 99 #define xlog_verify_dest_ptr(a,b) 100 #define xlog_verify_grant_tail(a) 101 #define xlog_verify_iclog(a,b,c) 102 #define xlog_verify_tail_lsn(a,b,c) 103 #endif 104 105 STATIC int 106 xlog_iclogs_empty( 107 struct xlog *log); 108 109 static void 110 xlog_grant_sub_space( 111 struct xlog *log, 112 atomic64_t *head, 113 int bytes) 114 { 115 int64_t head_val = atomic64_read(head); 116 int64_t new, old; 117 118 do { 119 int cycle, space; 120 121 xlog_crack_grant_head_val(head_val, &cycle, &space); 122 123 space -= bytes; 124 if (space < 0) { 125 space += log->l_logsize; 126 cycle--; 127 } 128 129 old = head_val; 130 new = xlog_assign_grant_head_val(cycle, space); 131 head_val = atomic64_cmpxchg(head, old, new); 132 } while (head_val != old); 133 } 134 135 static void 136 xlog_grant_add_space( 137 struct xlog *log, 138 atomic64_t *head, 139 int bytes) 140 { 141 int64_t head_val = atomic64_read(head); 142 int64_t new, old; 143 144 do { 145 int tmp; 146 int cycle, space; 147 148 xlog_crack_grant_head_val(head_val, &cycle, &space); 149 150 tmp = log->l_logsize - space; 151 if (tmp > bytes) 152 space += bytes; 153 else { 154 space = bytes - tmp; 155 cycle++; 156 } 157 158 old = head_val; 159 new = xlog_assign_grant_head_val(cycle, space); 160 head_val = 
atomic64_cmpxchg(head, old, new); 161 } while (head_val != old); 162 } 163 164 STATIC void 165 xlog_grant_head_init( 166 struct xlog_grant_head *head) 167 { 168 xlog_assign_grant_head(&head->grant, 1, 0); 169 INIT_LIST_HEAD(&head->waiters); 170 spin_lock_init(&head->lock); 171 } 172 173 STATIC void 174 xlog_grant_head_wake_all( 175 struct xlog_grant_head *head) 176 { 177 struct xlog_ticket *tic; 178 179 spin_lock(&head->lock); 180 list_for_each_entry(tic, &head->waiters, t_queue) 181 wake_up_process(tic->t_task); 182 spin_unlock(&head->lock); 183 } 184 185 static inline int 186 xlog_ticket_reservation( 187 struct xlog *log, 188 struct xlog_grant_head *head, 189 struct xlog_ticket *tic) 190 { 191 if (head == &log->l_write_head) { 192 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); 193 return tic->t_unit_res; 194 } else { 195 if (tic->t_flags & XLOG_TIC_PERM_RESERV) 196 return tic->t_unit_res * tic->t_cnt; 197 else 198 return tic->t_unit_res; 199 } 200 } 201 202 STATIC bool 203 xlog_grant_head_wake( 204 struct xlog *log, 205 struct xlog_grant_head *head, 206 int *free_bytes) 207 { 208 struct xlog_ticket *tic; 209 int need_bytes; 210 bool woken_task = false; 211 212 list_for_each_entry(tic, &head->waiters, t_queue) { 213 214 /* 215 * There is a chance that the size of the CIL checkpoints in 216 * progress at the last AIL push target calculation resulted in 217 * limiting the target to the log head (l_last_sync_lsn) at the 218 * time. This may not reflect where the log head is now as the 219 * CIL checkpoints may have completed. 220 * 221 * Hence when we are woken here, it may be that the head of the 222 * log that has moved rather than the tail. As the tail didn't 223 * move, there still won't be space available for the 224 * reservation we require. However, if the AIL has already 225 * pushed to the target defined by the old log head location, we 226 * will hang here waiting for something else to update the AIL 227 * push target. 228 * 229 * Therefore, if there isn't space to wake the first waiter on 230 * the grant head, we need to push the AIL again to ensure the 231 * target reflects both the current log tail and log head 232 * position before we wait for the tail to move again. 233 */ 234 235 need_bytes = xlog_ticket_reservation(log, head, tic); 236 if (*free_bytes < need_bytes) { 237 if (!woken_task) 238 xlog_grant_push_ail(log, need_bytes); 239 return false; 240 } 241 242 *free_bytes -= need_bytes; 243 trace_xfs_log_grant_wake_up(log, tic); 244 wake_up_process(tic->t_task); 245 woken_task = true; 246 } 247 248 return true; 249 } 250 251 STATIC int 252 xlog_grant_head_wait( 253 struct xlog *log, 254 struct xlog_grant_head *head, 255 struct xlog_ticket *tic, 256 int need_bytes) __releases(&head->lock) 257 __acquires(&head->lock) 258 { 259 list_add_tail(&tic->t_queue, &head->waiters); 260 261 do { 262 if (XLOG_FORCED_SHUTDOWN(log)) 263 goto shutdown; 264 xlog_grant_push_ail(log, need_bytes); 265 266 __set_current_state(TASK_UNINTERRUPTIBLE); 267 spin_unlock(&head->lock); 268 269 XFS_STATS_INC(log->l_mp, xs_sleep_logspace); 270 271 trace_xfs_log_grant_sleep(log, tic); 272 schedule(); 273 trace_xfs_log_grant_wake(log, tic); 274 275 spin_lock(&head->lock); 276 if (XLOG_FORCED_SHUTDOWN(log)) 277 goto shutdown; 278 } while (xlog_space_left(log, &head->grant) < need_bytes); 279 280 list_del_init(&tic->t_queue); 281 return 0; 282 shutdown: 283 list_del_init(&tic->t_queue); 284 return -EIO; 285 } 286 287 /* 288 * Atomically get the log space required for a log ticket. 
289 * 290 * Once a ticket gets put onto head->waiters, it will only return after the 291 * needed reservation is satisfied. 292 * 293 * This function is structured so that it has a lock free fast path. This is 294 * necessary because every new transaction reservation will come through this 295 * path. Hence any lock will be globally hot if we take it unconditionally on 296 * every pass. 297 * 298 * As tickets are only ever moved on and off head->waiters under head->lock, we 299 * only need to take that lock if we are going to add the ticket to the queue 300 * and sleep. We can avoid taking the lock if the ticket was never added to 301 * head->waiters because the t_queue list head will be empty and we hold the 302 * only reference to it so it can safely be checked unlocked. 303 */ 304 STATIC int 305 xlog_grant_head_check( 306 struct xlog *log, 307 struct xlog_grant_head *head, 308 struct xlog_ticket *tic, 309 int *need_bytes) 310 { 311 int free_bytes; 312 int error = 0; 313 314 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); 315 316 /* 317 * If there are other waiters on the queue then give them a chance at 318 * logspace before us. Wake up the first waiters, if we do not wake 319 * up all the waiters then go to sleep waiting for more free space, 320 * otherwise try to get some space for this transaction. 321 */ 322 *need_bytes = xlog_ticket_reservation(log, head, tic); 323 free_bytes = xlog_space_left(log, &head->grant); 324 if (!list_empty_careful(&head->waiters)) { 325 spin_lock(&head->lock); 326 if (!xlog_grant_head_wake(log, head, &free_bytes) || 327 free_bytes < *need_bytes) { 328 error = xlog_grant_head_wait(log, head, tic, 329 *need_bytes); 330 } 331 spin_unlock(&head->lock); 332 } else if (free_bytes < *need_bytes) { 333 spin_lock(&head->lock); 334 error = xlog_grant_head_wait(log, head, tic, *need_bytes); 335 spin_unlock(&head->lock); 336 } 337 338 return error; 339 } 340 341 static void 342 xlog_tic_reset_res(xlog_ticket_t *tic) 343 { 344 tic->t_res_num = 0; 345 tic->t_res_arr_sum = 0; 346 tic->t_res_num_ophdrs = 0; 347 } 348 349 static void 350 xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) 351 { 352 if (tic->t_res_num == XLOG_TIC_LEN_MAX) { 353 /* add to overflow and start again */ 354 tic->t_res_o_flow += tic->t_res_arr_sum; 355 tic->t_res_num = 0; 356 tic->t_res_arr_sum = 0; 357 } 358 359 tic->t_res_arr[tic->t_res_num].r_len = len; 360 tic->t_res_arr[tic->t_res_num].r_type = type; 361 tic->t_res_arr_sum += len; 362 tic->t_res_num++; 363 } 364 365 /* 366 * Replenish the byte reservation required by moving the grant write head. 367 */ 368 int 369 xfs_log_regrant( 370 struct xfs_mount *mp, 371 struct xlog_ticket *tic) 372 { 373 struct xlog *log = mp->m_log; 374 int need_bytes; 375 int error = 0; 376 377 if (XLOG_FORCED_SHUTDOWN(log)) 378 return -EIO; 379 380 XFS_STATS_INC(mp, xs_try_logspace); 381 382 /* 383 * This is a new transaction on the ticket, so we need to change the 384 * transaction ID so that the next transaction has a different TID in 385 * the log. Just add one to the existing tid so that we can see chains 386 * of rolling transactions in the log easily. 
387 */ 388 tic->t_tid++; 389 390 xlog_grant_push_ail(log, tic->t_unit_res); 391 392 tic->t_curr_res = tic->t_unit_res; 393 xlog_tic_reset_res(tic); 394 395 if (tic->t_cnt > 0) 396 return 0; 397 398 trace_xfs_log_regrant(log, tic); 399 400 error = xlog_grant_head_check(log, &log->l_write_head, tic, 401 &need_bytes); 402 if (error) 403 goto out_error; 404 405 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); 406 trace_xfs_log_regrant_exit(log, tic); 407 xlog_verify_grant_tail(log); 408 return 0; 409 410 out_error: 411 /* 412 * If we are failing, make sure the ticket doesn't have any current 413 * reservations. We don't want to add this back when the ticket/ 414 * transaction gets cancelled. 415 */ 416 tic->t_curr_res = 0; 417 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 418 return error; 419 } 420 421 /* 422 * Reserve log space and return a ticket corresponding to the reservation. 423 * 424 * Each reservation is going to reserve extra space for a log record header. 425 * When writes happen to the on-disk log, we don't subtract the length of the 426 * log record header from any reservation. By wasting space in each 427 * reservation, we prevent over allocation problems. 428 */ 429 int 430 xfs_log_reserve( 431 struct xfs_mount *mp, 432 int unit_bytes, 433 int cnt, 434 struct xlog_ticket **ticp, 435 uint8_t client, 436 bool permanent) 437 { 438 struct xlog *log = mp->m_log; 439 struct xlog_ticket *tic; 440 int need_bytes; 441 int error = 0; 442 443 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); 444 445 if (XLOG_FORCED_SHUTDOWN(log)) 446 return -EIO; 447 448 XFS_STATS_INC(mp, xs_try_logspace); 449 450 ASSERT(*ticp == NULL); 451 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); 452 *ticp = tic; 453 454 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt 455 : tic->t_unit_res); 456 457 trace_xfs_log_reserve(log, tic); 458 459 error = xlog_grant_head_check(log, &log->l_reserve_head, tic, 460 &need_bytes); 461 if (error) 462 goto out_error; 463 464 xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); 465 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); 466 trace_xfs_log_reserve_exit(log, tic); 467 xlog_verify_grant_tail(log); 468 return 0; 469 470 out_error: 471 /* 472 * If we are failing, make sure the ticket doesn't have any current 473 * reservations. We don't want to add this back when the ticket/ 474 * transaction gets cancelled. 475 */ 476 tic->t_curr_res = 0; 477 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 478 return error; 479 } 480 481 482 /* 483 * NOTES: 484 * 485 * 1. currblock field gets updated at startup and after in-core logs 486 * marked as with WANT_SYNC. 487 */ 488 489 /* 490 * This routine is called when a user of a log manager ticket is done with 491 * the reservation. If the ticket was ever used, then a commit record for 492 * the associated transaction is written out as a log operation header with 493 * no data. The flag XLOG_TIC_INITED is set when the first write occurs with 494 * a given ticket. If the ticket was one with a permanent reservation, then 495 * a few operations are done differently. Permanent reservation tickets by 496 * default don't release the reservation. They just commit the current 497 * transaction with the belief that the reservation is still needed. A flag 498 * must be passed in before permanent reservations are actually released. 
499 * When these type of tickets are not released, they need to be set into 500 * the inited state again. By doing this, a start record will be written 501 * out when the next write occurs. 502 */ 503 xfs_lsn_t 504 xfs_log_done( 505 struct xfs_mount *mp, 506 struct xlog_ticket *ticket, 507 struct xlog_in_core **iclog, 508 bool regrant) 509 { 510 struct xlog *log = mp->m_log; 511 xfs_lsn_t lsn = 0; 512 513 if (XLOG_FORCED_SHUTDOWN(log) || 514 /* 515 * If nothing was ever written, don't write out commit record. 516 * If we get an error, just continue and give back the log ticket. 517 */ 518 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 519 (xlog_commit_record(log, ticket, iclog, &lsn)))) { 520 lsn = (xfs_lsn_t) -1; 521 regrant = false; 522 } 523 524 525 if (!regrant) { 526 trace_xfs_log_done_nonperm(log, ticket); 527 528 /* 529 * Release ticket if not permanent reservation or a specific 530 * request has been made to release a permanent reservation. 531 */ 532 xlog_ungrant_log_space(log, ticket); 533 } else { 534 trace_xfs_log_done_perm(log, ticket); 535 536 xlog_regrant_reserve_log_space(log, ticket); 537 /* If this ticket was a permanent reservation and we aren't 538 * trying to release it, reset the inited flags; so next time 539 * we write, a start record will be written out. 540 */ 541 ticket->t_flags |= XLOG_TIC_INITED; 542 } 543 544 xfs_log_ticket_put(ticket); 545 return lsn; 546 } 547 548 static bool 549 __xlog_state_release_iclog( 550 struct xlog *log, 551 struct xlog_in_core *iclog) 552 { 553 lockdep_assert_held(&log->l_icloglock); 554 555 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { 556 /* update tail before writing to iclog */ 557 xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); 558 559 iclog->ic_state = XLOG_STATE_SYNCING; 560 iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); 561 xlog_verify_tail_lsn(log, iclog, tail_lsn); 562 /* cycle incremented when incrementing curr_block */ 563 return true; 564 } 565 566 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); 567 return false; 568 } 569 570 /* 571 * Flush iclog to disk if this is the last reference to the given iclog and the 572 * it is in the WANT_SYNC state. 573 */ 574 static int 575 xlog_state_release_iclog( 576 struct xlog *log, 577 struct xlog_in_core *iclog) 578 { 579 lockdep_assert_held(&log->l_icloglock); 580 581 if (iclog->ic_state == XLOG_STATE_IOERROR) 582 return -EIO; 583 584 if (atomic_dec_and_test(&iclog->ic_refcnt) && 585 __xlog_state_release_iclog(log, iclog)) { 586 spin_unlock(&log->l_icloglock); 587 xlog_sync(log, iclog); 588 spin_lock(&log->l_icloglock); 589 } 590 591 return 0; 592 } 593 594 void 595 xfs_log_release_iclog( 596 struct xlog_in_core *iclog) 597 { 598 struct xlog *log = iclog->ic_log; 599 bool sync = false; 600 601 if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { 602 if (iclog->ic_state != XLOG_STATE_IOERROR) 603 sync = __xlog_state_release_iclog(log, iclog); 604 spin_unlock(&log->l_icloglock); 605 } 606 607 if (sync) 608 xlog_sync(log, iclog); 609 } 610 611 /* 612 * Mount a log filesystem 613 * 614 * mp - ubiquitous xfs mount point structure 615 * log_target - buftarg of on-disk log device 616 * blk_offset - Start block # where block size is 512 bytes (BBSIZE) 617 * num_bblocks - Number of BBSIZE blocks in on-disk log 618 * 619 * Return error or zero. 
620 */ 621 int 622 xfs_log_mount( 623 xfs_mount_t *mp, 624 xfs_buftarg_t *log_target, 625 xfs_daddr_t blk_offset, 626 int num_bblks) 627 { 628 bool fatal = xfs_sb_version_hascrc(&mp->m_sb); 629 int error = 0; 630 int min_logfsbs; 631 632 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { 633 xfs_notice(mp, "Mounting V%d Filesystem", 634 XFS_SB_VERSION_NUM(&mp->m_sb)); 635 } else { 636 xfs_notice(mp, 637 "Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", 638 XFS_SB_VERSION_NUM(&mp->m_sb)); 639 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 640 } 641 642 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); 643 if (IS_ERR(mp->m_log)) { 644 error = PTR_ERR(mp->m_log); 645 goto out; 646 } 647 648 /* 649 * Validate the given log space and drop a critical message via syslog 650 * if the log size is too small that would lead to some unexpected 651 * situations in transaction log space reservation stage. 652 * 653 * Note: we can't just reject the mount if the validation fails. This 654 * would mean that people would have to downgrade their kernel just to 655 * remedy the situation as there is no way to grow the log (short of 656 * black magic surgery with xfs_db). 657 * 658 * We can, however, reject mounts for CRC format filesystems, as the 659 * mkfs binary being used to make the filesystem should never create a 660 * filesystem with a log that is too small. 661 */ 662 min_logfsbs = xfs_log_calc_minimum_size(mp); 663 664 if (mp->m_sb.sb_logblocks < min_logfsbs) { 665 xfs_warn(mp, 666 "Log size %d blocks too small, minimum size is %d blocks", 667 mp->m_sb.sb_logblocks, min_logfsbs); 668 error = -EINVAL; 669 } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { 670 xfs_warn(mp, 671 "Log size %d blocks too large, maximum size is %lld blocks", 672 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); 673 error = -EINVAL; 674 } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { 675 xfs_warn(mp, 676 "log size %lld bytes too large, maximum size is %lld bytes", 677 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), 678 XFS_MAX_LOG_BYTES); 679 error = -EINVAL; 680 } else if (mp->m_sb.sb_logsunit > 1 && 681 mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { 682 xfs_warn(mp, 683 "log stripe unit %u bytes must be a multiple of block size", 684 mp->m_sb.sb_logsunit); 685 error = -EINVAL; 686 fatal = true; 687 } 688 if (error) { 689 /* 690 * Log check errors are always fatal on v5; or whenever bad 691 * metadata leads to a crash. 692 */ 693 if (fatal) { 694 xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); 695 ASSERT(0); 696 goto out_free_log; 697 } 698 xfs_crit(mp, "Log size out of supported range."); 699 xfs_crit(mp, 700 "Continuing onwards, but if log hangs are experienced then please report this message in the bug report."); 701 } 702 703 /* 704 * Initialize the AIL now we have a log. 705 */ 706 error = xfs_trans_ail_init(mp); 707 if (error) { 708 xfs_warn(mp, "AIL initialisation failed: error %d", error); 709 goto out_free_log; 710 } 711 mp->m_log->l_ailp = mp->m_ail; 712 713 /* 714 * skip log recovery on a norecovery mount. pretend it all 715 * just worked. 
716 */ 717 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { 718 int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); 719 720 if (readonly) 721 mp->m_flags &= ~XFS_MOUNT_RDONLY; 722 723 error = xlog_recover(mp->m_log); 724 725 if (readonly) 726 mp->m_flags |= XFS_MOUNT_RDONLY; 727 if (error) { 728 xfs_warn(mp, "log mount/recovery failed: error %d", 729 error); 730 xlog_recover_cancel(mp->m_log); 731 goto out_destroy_ail; 732 } 733 } 734 735 error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj, 736 "log"); 737 if (error) 738 goto out_destroy_ail; 739 740 /* Normal transactions can now occur */ 741 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 742 743 /* 744 * Now the log has been fully initialised and we know were our 745 * space grant counters are, we can initialise the permanent ticket 746 * needed for delayed logging to work. 747 */ 748 xlog_cil_init_post_recovery(mp->m_log); 749 750 return 0; 751 752 out_destroy_ail: 753 xfs_trans_ail_destroy(mp); 754 out_free_log: 755 xlog_dealloc_log(mp->m_log); 756 out: 757 return error; 758 } 759 760 /* 761 * Finish the recovery of the file system. This is separate from the 762 * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read 763 * in the root and real-time bitmap inodes between calling xfs_log_mount() and 764 * here. 765 * 766 * If we finish recovery successfully, start the background log work. If we are 767 * not doing recovery, then we have a RO filesystem and we don't need to start 768 * it. 769 */ 770 int 771 xfs_log_mount_finish( 772 struct xfs_mount *mp) 773 { 774 int error = 0; 775 bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); 776 bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED; 777 778 if (mp->m_flags & XFS_MOUNT_NORECOVERY) { 779 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 780 return 0; 781 } else if (readonly) { 782 /* Allow unlinked processing to proceed */ 783 mp->m_flags &= ~XFS_MOUNT_RDONLY; 784 } 785 786 /* 787 * During the second phase of log recovery, we need iget and 788 * iput to behave like they do for an active filesystem. 789 * xfs_fs_drop_inode needs to be able to prevent the deletion 790 * of inodes before we're done replaying log items on those 791 * inodes. Turn it off immediately after recovery finishes 792 * so that we don't leak the quota inodes if subsequent mount 793 * activities fail. 794 * 795 * We let all inodes involved in redo item processing end up on 796 * the LRU instead of being evicted immediately so that if we do 797 * something to an unlinked inode, the irele won't cause 798 * premature truncation and freeing of the inode, which results 799 * in log recovery failure. We have to evict the unreferenced 800 * lru inodes after clearing SB_ACTIVE because we don't 801 * otherwise clean up the lru if there's a subsequent failure in 802 * xfs_mountfs, which leads to us leaking the inodes if nothing 803 * else (e.g. quotacheck) references the inodes before the 804 * mount failure occurs. 805 */ 806 mp->m_super->s_flags |= SB_ACTIVE; 807 error = xlog_recover_finish(mp->m_log); 808 if (!error) 809 xfs_log_work_queue(mp); 810 mp->m_super->s_flags &= ~SB_ACTIVE; 811 evict_inodes(mp->m_super); 812 813 /* 814 * Drain the buffer LRU after log recovery. This is required for v4 815 * filesystems to avoid leaving around buffers with NULL verifier ops, 816 * but we do it unconditionally to make sure we're always in a clean 817 * cache state after mount. 
818 * 819 * Don't push in the error case because the AIL may have pending intents 820 * that aren't removed until recovery is cancelled. 821 */ 822 if (!error && recovered) { 823 xfs_log_force(mp, XFS_LOG_SYNC); 824 xfs_ail_push_all_sync(mp->m_ail); 825 } 826 xfs_wait_buftarg(mp->m_ddev_targp); 827 828 if (readonly) 829 mp->m_flags |= XFS_MOUNT_RDONLY; 830 831 return error; 832 } 833 834 /* 835 * The mount has failed. Cancel the recovery if it hasn't completed and destroy 836 * the log. 837 */ 838 void 839 xfs_log_mount_cancel( 840 struct xfs_mount *mp) 841 { 842 xlog_recover_cancel(mp->m_log); 843 xfs_log_unmount(mp); 844 } 845 846 /* 847 * Wait for the iclog to be written disk, or return an error if the log has been 848 * shut down. 849 */ 850 static int 851 xlog_wait_on_iclog( 852 struct xlog_in_core *iclog) 853 __releases(iclog->ic_log->l_icloglock) 854 { 855 struct xlog *log = iclog->ic_log; 856 857 if (!XLOG_FORCED_SHUTDOWN(log) && 858 iclog->ic_state != XLOG_STATE_ACTIVE && 859 iclog->ic_state != XLOG_STATE_DIRTY) { 860 XFS_STATS_INC(log->l_mp, xs_log_force_sleep); 861 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 862 } else { 863 spin_unlock(&log->l_icloglock); 864 } 865 866 if (XLOG_FORCED_SHUTDOWN(log)) 867 return -EIO; 868 return 0; 869 } 870 871 /* 872 * Final log writes as part of unmount. 873 * 874 * Mark the filesystem clean as unmount happens. Note that during relocation 875 * this routine needs to be executed as part of source-bag while the 876 * deallocation must not be done until source-end. 877 */ 878 879 /* Actually write the unmount record to disk. */ 880 static void 881 xfs_log_write_unmount_record( 882 struct xfs_mount *mp) 883 { 884 /* the data section must be 32 bit size aligned */ 885 struct xfs_unmount_log_format magic = { 886 .magic = XLOG_UNMOUNT_TYPE, 887 }; 888 struct xfs_log_iovec reg = { 889 .i_addr = &magic, 890 .i_len = sizeof(magic), 891 .i_type = XLOG_REG_TYPE_UNMOUNT, 892 }; 893 struct xfs_log_vec vec = { 894 .lv_niovecs = 1, 895 .lv_iovecp = ®, 896 }; 897 struct xlog *log = mp->m_log; 898 struct xlog_in_core *iclog; 899 struct xlog_ticket *tic = NULL; 900 xfs_lsn_t lsn; 901 uint flags = XLOG_UNMOUNT_TRANS; 902 int error; 903 904 error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); 905 if (error) 906 goto out_err; 907 908 /* 909 * If we think the summary counters are bad, clear the unmount header 910 * flag in the unmount record so that the summary counters will be 911 * recalculated during log recovery at next mount. Refer to 912 * xlog_check_unmount_rec for more details. 913 */ 914 if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, 915 XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { 916 xfs_alert(mp, "%s: will fix summary counters at next mount", 917 __func__); 918 flags &= ~XLOG_UNMOUNT_TRANS; 919 } 920 921 /* remove inited flag, and account for space used */ 922 tic->t_flags = 0; 923 tic->t_curr_res -= sizeof(magic); 924 error = xlog_write(log, &vec, tic, &lsn, NULL, flags); 925 /* 926 * At this point, we're umounting anyway, so there's no point in 927 * transitioning log state to IOERROR. Just continue... 
928 */ 929 out_err: 930 if (error) 931 xfs_alert(mp, "%s: unmount record failed", __func__); 932 933 spin_lock(&log->l_icloglock); 934 iclog = log->l_iclog; 935 atomic_inc(&iclog->ic_refcnt); 936 if (iclog->ic_state == XLOG_STATE_ACTIVE) 937 xlog_state_switch_iclogs(log, iclog, 0); 938 else 939 ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || 940 iclog->ic_state == XLOG_STATE_IOERROR); 941 error = xlog_state_release_iclog(log, iclog); 942 xlog_wait_on_iclog(iclog); 943 944 if (tic) { 945 trace_xfs_log_umount_write(log, tic); 946 xlog_ungrant_log_space(log, tic); 947 xfs_log_ticket_put(tic); 948 } 949 } 950 951 static void 952 xfs_log_unmount_verify_iclog( 953 struct xlog *log) 954 { 955 struct xlog_in_core *iclog = log->l_iclog; 956 957 do { 958 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); 959 ASSERT(iclog->ic_offset == 0); 960 } while ((iclog = iclog->ic_next) != log->l_iclog); 961 } 962 963 /* 964 * Unmount record used to have a string "Unmount filesystem--" in the 965 * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). 966 * We just write the magic number now since that particular field isn't 967 * currently architecture converted and "Unmount" is a bit foo. 968 * As far as I know, there weren't any dependencies on the old behaviour. 969 */ 970 static void 971 xfs_log_unmount_write( 972 struct xfs_mount *mp) 973 { 974 struct xlog *log = mp->m_log; 975 976 /* 977 * Don't write out unmount record on norecovery mounts or ro devices. 978 * Or, if we are doing a forced umount (typically because of IO errors). 979 */ 980 if (mp->m_flags & XFS_MOUNT_NORECOVERY || 981 xfs_readonly_buftarg(log->l_targ)) { 982 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 983 return; 984 } 985 986 xfs_log_force(mp, XFS_LOG_SYNC); 987 988 if (XLOG_FORCED_SHUTDOWN(log)) 989 return; 990 xfs_log_unmount_verify_iclog(log); 991 xfs_log_write_unmount_record(mp); 992 } 993 994 /* 995 * Empty the log for unmount/freeze. 996 * 997 * To do this, we first need to shut down the background log work so it is not 998 * trying to cover the log as we clean up. We then need to unpin all objects in 999 * the log so we can then flush them out. Once they have completed their IO and 1000 * run the callbacks removing themselves from the AIL, we can write the unmount 1001 * record. 1002 */ 1003 void 1004 xfs_log_quiesce( 1005 struct xfs_mount *mp) 1006 { 1007 cancel_delayed_work_sync(&mp->m_log->l_work); 1008 xfs_log_force(mp, XFS_LOG_SYNC); 1009 1010 /* 1011 * The superblock buffer is uncached and while xfs_ail_push_all_sync() 1012 * will push it, xfs_wait_buftarg() will not wait for it. Further, 1013 * xfs_buf_iowait() cannot be used because it was pushed with the 1014 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for 1015 * the IO to complete. 1016 */ 1017 xfs_ail_push_all_sync(mp->m_ail); 1018 xfs_wait_buftarg(mp->m_ddev_targp); 1019 xfs_buf_lock(mp->m_sb_bp); 1020 xfs_buf_unlock(mp->m_sb_bp); 1021 1022 xfs_log_unmount_write(mp); 1023 } 1024 1025 /* 1026 * Shut down and release the AIL and Log. 1027 * 1028 * During unmount, we need to ensure we flush all the dirty metadata objects 1029 * from the AIL so that the log is empty before we write the unmount record to 1030 * the log. Once this is done, we can tear down the AIL and the log. 
 */
void
xfs_log_unmount(
	struct xfs_mount	*mp)
{
	xfs_log_quiesce(mp);

	xfs_trans_ail_destroy(mp);

	xfs_sysfs_del(&mp->m_log->l_kobj);

	xlog_dealloc_log(mp->m_log);
}

void
xfs_log_item_init(
	struct xfs_mount	*mp,
	struct xfs_log_item	*item,
	int			type,
	const struct xfs_item_ops *ops)
{
	item->li_mountp = mp;
	item->li_ailp = mp->m_ail;
	item->li_type = type;
	item->li_ops = ops;
	item->li_lv = NULL;

	INIT_LIST_HEAD(&item->li_ail);
	INIT_LIST_HEAD(&item->li_cil);
	INIT_LIST_HEAD(&item->li_bio_list);
	INIT_LIST_HEAD(&item->li_trans);
}

/*
 * Wake up processes waiting for log space after we have moved the log tail.
 */
void
xfs_log_space_wake(
	struct xfs_mount	*mp)
{
	struct xlog		*log = mp->m_log;
	int			free_bytes;

	if (XLOG_FORCED_SHUTDOWN(log))
		return;

	if (!list_empty_careful(&log->l_write_head.waiters)) {
		ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));

		spin_lock(&log->l_write_head.lock);
		free_bytes = xlog_space_left(log, &log->l_write_head.grant);
		xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
		spin_unlock(&log->l_write_head.lock);
	}

	if (!list_empty_careful(&log->l_reserve_head.waiters)) {
		ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));

		spin_lock(&log->l_reserve_head.lock);
		free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
		xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
		spin_unlock(&log->l_reserve_head.lock);
	}
}

/*
 * Determine if we have a transaction that has gone to disk that needs to be
 * covered. To begin the transition to the idle state firstly the log needs to
 * be idle. That means the CIL, the AIL and the iclogs need to be empty before
 * we start attempting to cover the log.
 *
 * Only if we are then in a state where covering is needed, the caller is
 * informed that dummy transactions are required to move the log into the idle
 * state.
 *
 * If there are any items in the AIL or CIL, then we do not want to attempt to
 * cover the log as we may be in a situation where there isn't log space
 * available to run a dummy transaction and this can lead to deadlocks when the
 * tail of the log is pinned by an item that is modified in the CIL. Hence
 * there's no point in running a dummy transaction at this point because we
 * can't start trying to idle the log until both the CIL and AIL are empty.
 */
static int
xfs_log_need_covered(xfs_mount_t *mp)
{
	struct xlog	*log = mp->m_log;
	int		needed = 0;

	if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
		return 0;

	if (!xlog_cil_empty(log))
		return 0;

	spin_lock(&log->l_icloglock);
	switch (log->l_covered_state) {
	case XLOG_STATE_COVER_DONE:
	case XLOG_STATE_COVER_DONE2:
	case XLOG_STATE_COVER_IDLE:
		break;
	case XLOG_STATE_COVER_NEED:
	case XLOG_STATE_COVER_NEED2:
		if (xfs_ail_min_lsn(log->l_ailp))
			break;
		if (!xlog_iclogs_empty(log))
			break;

		needed = 1;
		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
			log->l_covered_state = XLOG_STATE_COVER_DONE;
		else
			log->l_covered_state = XLOG_STATE_COVER_DONE2;
		break;
	default:
		needed = 1;
		break;
	}
	spin_unlock(&log->l_icloglock);
	return needed;
}

/*
 * We may be holding the log iclog lock upon entering this routine.
 */
xfs_lsn_t
xlog_assign_tail_lsn_locked(
	struct xfs_mount	*mp)
{
	struct xlog		*log = mp->m_log;
	struct xfs_log_item	*lip;
	xfs_lsn_t		tail_lsn;

	assert_spin_locked(&mp->m_ail->ail_lock);

	/*
	 * To make sure we always have a valid LSN for the log tail we keep
	 * track of the last LSN which was committed in log->l_last_sync_lsn,
	 * and use that when the AIL was empty.
	 */
	lip = xfs_ail_min(mp->m_ail);
	if (lip)
		tail_lsn = lip->li_lsn;
	else
		tail_lsn = atomic64_read(&log->l_last_sync_lsn);
	trace_xfs_log_assign_tail_lsn(log, tail_lsn);
	atomic64_set(&log->l_tail_lsn, tail_lsn);
	return tail_lsn;
}

xfs_lsn_t
xlog_assign_tail_lsn(
	struct xfs_mount	*mp)
{
	xfs_lsn_t		tail_lsn;

	spin_lock(&mp->m_ail->ail_lock);
	tail_lsn = xlog_assign_tail_lsn_locked(mp);
	spin_unlock(&mp->m_ail->ail_lock);

	return tail_lsn;
}

/*
 * Return the space in the log between the tail and the head.  The head
 * is passed in the cycle/bytes formal parms.  In the special case where
 * the reserve head has wrapped past the tail, this calculation is no
 * longer valid.  In this case, just return 0 which means there is no space
 * in the log.  This works for all places where this function is called
 * with the reserve head.  Of course, if the write head were to ever
 * wrap the tail, we should blow up.  Rather than catch this case here,
 * we depend on other ASSERTions in other parts of the code.   XXXmiken
 *
 * This code also handles the case where the reservation head is behind
 * the tail.  The details of this case are described below, but the end
 * result is that we return the size of the log as the amount of space left.
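 *
 * As an illustrative example of the calculation below: with a 16 MB log, a
 * tail at cycle 4 / byte offset 12 MB and a reserve head at cycle 5 / byte
 * offset 2 MB, the head has wrapped once, so the space left is
 * tail_bytes - head_bytes = 10 MB.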
1206 */ 1207 STATIC int 1208 xlog_space_left( 1209 struct xlog *log, 1210 atomic64_t *head) 1211 { 1212 int free_bytes; 1213 int tail_bytes; 1214 int tail_cycle; 1215 int head_cycle; 1216 int head_bytes; 1217 1218 xlog_crack_grant_head(head, &head_cycle, &head_bytes); 1219 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); 1220 tail_bytes = BBTOB(tail_bytes); 1221 if (tail_cycle == head_cycle && head_bytes >= tail_bytes) 1222 free_bytes = log->l_logsize - (head_bytes - tail_bytes); 1223 else if (tail_cycle + 1 < head_cycle) 1224 return 0; 1225 else if (tail_cycle < head_cycle) { 1226 ASSERT(tail_cycle == (head_cycle - 1)); 1227 free_bytes = tail_bytes - head_bytes; 1228 } else { 1229 /* 1230 * The reservation head is behind the tail. 1231 * In this case we just want to return the size of the 1232 * log as the amount of space left. 1233 */ 1234 xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); 1235 xfs_alert(log->l_mp, 1236 " tail_cycle = %d, tail_bytes = %d", 1237 tail_cycle, tail_bytes); 1238 xfs_alert(log->l_mp, 1239 " GH cycle = %d, GH bytes = %d", 1240 head_cycle, head_bytes); 1241 ASSERT(0); 1242 free_bytes = log->l_logsize; 1243 } 1244 return free_bytes; 1245 } 1246 1247 1248 static void 1249 xlog_ioend_work( 1250 struct work_struct *work) 1251 { 1252 struct xlog_in_core *iclog = 1253 container_of(work, struct xlog_in_core, ic_end_io_work); 1254 struct xlog *log = iclog->ic_log; 1255 int error; 1256 1257 error = blk_status_to_errno(iclog->ic_bio.bi_status); 1258 #ifdef DEBUG 1259 /* treat writes with injected CRC errors as failed */ 1260 if (iclog->ic_fail_crc) 1261 error = -EIO; 1262 #endif 1263 1264 /* 1265 * Race to shutdown the filesystem if we see an error. 1266 */ 1267 if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { 1268 xfs_alert(log->l_mp, "log I/O error %d", error); 1269 xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); 1270 } 1271 1272 xlog_state_done_syncing(iclog); 1273 bio_uninit(&iclog->ic_bio); 1274 1275 /* 1276 * Drop the lock to signal that we are done. Nothing references the 1277 * iclog after this, so an unmount waiting on this lock can now tear it 1278 * down safely. As such, it is unsafe to reference the iclog after the 1279 * unlock as we could race with it being freed. 1280 */ 1281 up(&iclog->ic_sema); 1282 } 1283 1284 /* 1285 * Return size of each in-core log record buffer. 1286 * 1287 * All machines get 8 x 32kB buffers by default, unless tuned otherwise. 1288 * 1289 * If the filesystem blocksize is too large, we may need to choose a 1290 * larger size since the directory code currently logs entire blocks. 1291 */ 1292 STATIC void 1293 xlog_get_iclog_buffer_size( 1294 struct xfs_mount *mp, 1295 struct xlog *log) 1296 { 1297 if (mp->m_logbufs <= 0) 1298 mp->m_logbufs = XLOG_MAX_ICLOGS; 1299 if (mp->m_logbsize <= 0) 1300 mp->m_logbsize = XLOG_BIG_RECORD_BSIZE; 1301 1302 log->l_iclog_bufs = mp->m_logbufs; 1303 log->l_iclog_size = mp->m_logbsize; 1304 1305 /* 1306 * # headers = size / 32k - one header holds cycles from 32k of data. 1307 */ 1308 log->l_iclog_heads = 1309 DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE); 1310 log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT; 1311 } 1312 1313 void 1314 xfs_log_work_queue( 1315 struct xfs_mount *mp) 1316 { 1317 queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work, 1318 msecs_to_jiffies(xfs_syncd_centisecs * 10)); 1319 } 1320 1321 /* 1322 * Every sync period we need to unpin all items in the AIL and push them to 1323 * disk. 
If there is nothing dirty, then we might need to cover the log to 1324 * indicate that the filesystem is idle. 1325 */ 1326 static void 1327 xfs_log_worker( 1328 struct work_struct *work) 1329 { 1330 struct xlog *log = container_of(to_delayed_work(work), 1331 struct xlog, l_work); 1332 struct xfs_mount *mp = log->l_mp; 1333 1334 /* dgc: errors ignored - not fatal and nowhere to report them */ 1335 if (xfs_log_need_covered(mp)) { 1336 /* 1337 * Dump a transaction into the log that contains no real change. 1338 * This is needed to stamp the current tail LSN into the log 1339 * during the covering operation. 1340 * 1341 * We cannot use an inode here for this - that will push dirty 1342 * state back up into the VFS and then periodic inode flushing 1343 * will prevent log covering from making progress. Hence we 1344 * synchronously log the superblock instead to ensure the 1345 * superblock is immediately unpinned and can be written back. 1346 */ 1347 xfs_sync_sb(mp, true); 1348 } else 1349 xfs_log_force(mp, 0); 1350 1351 /* start pushing all the metadata that is currently dirty */ 1352 xfs_ail_push_all(mp->m_ail); 1353 1354 /* queue us up again */ 1355 xfs_log_work_queue(mp); 1356 } 1357 1358 /* 1359 * This routine initializes some of the log structure for a given mount point. 1360 * Its primary purpose is to fill in enough, so recovery can occur. However, 1361 * some other stuff may be filled in too. 1362 */ 1363 STATIC struct xlog * 1364 xlog_alloc_log( 1365 struct xfs_mount *mp, 1366 struct xfs_buftarg *log_target, 1367 xfs_daddr_t blk_offset, 1368 int num_bblks) 1369 { 1370 struct xlog *log; 1371 xlog_rec_header_t *head; 1372 xlog_in_core_t **iclogp; 1373 xlog_in_core_t *iclog, *prev_iclog=NULL; 1374 int i; 1375 int error = -ENOMEM; 1376 uint log2_size = 0; 1377 1378 log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); 1379 if (!log) { 1380 xfs_warn(mp, "Log allocation failed: No memory!"); 1381 goto out; 1382 } 1383 1384 log->l_mp = mp; 1385 log->l_targ = log_target; 1386 log->l_logsize = BBTOB(num_bblks); 1387 log->l_logBBstart = blk_offset; 1388 log->l_logBBsize = num_bblks; 1389 log->l_covered_state = XLOG_STATE_COVER_IDLE; 1390 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1391 INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); 1392 1393 log->l_prev_block = -1; 1394 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1395 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); 1396 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); 1397 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1398 1399 xlog_grant_head_init(&log->l_reserve_head); 1400 xlog_grant_head_init(&log->l_write_head); 1401 1402 error = -EFSCORRUPTED; 1403 if (xfs_sb_version_hassector(&mp->m_sb)) { 1404 log2_size = mp->m_sb.sb_logsectlog; 1405 if (log2_size < BBSHIFT) { 1406 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", 1407 log2_size, BBSHIFT); 1408 goto out_free_log; 1409 } 1410 1411 log2_size -= BBSHIFT; 1412 if (log2_size > mp->m_sectbb_log) { 1413 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)", 1414 log2_size, mp->m_sectbb_log); 1415 goto out_free_log; 1416 } 1417 1418 /* for larger sector sizes, must have v2 or external log */ 1419 if (log2_size && log->l_logBBstart > 0 && 1420 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1421 xfs_warn(mp, 1422 "log sector size (0x%x) invalid for configuration.", 1423 log2_size); 1424 goto out_free_log; 1425 } 1426 } 1427 log->l_sectBBsize = 1 << log2_size; 1428 1429 xlog_get_iclog_buffer_size(mp, log); 1430 1431 spin_lock_init(&log->l_icloglock); 1432 
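	/*
	 * Writers that cannot find an ACTIVE iclog in
	 * xlog_state_get_iclog_space() sleep on l_flush_wait and are woken
	 * when the state machine returns an iclog to the ACTIVE state (or
	 * the log is shut down).
	 */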
init_waitqueue_head(&log->l_flush_wait); 1433 1434 iclogp = &log->l_iclog; 1435 /* 1436 * The amount of memory to allocate for the iclog structure is 1437 * rather funky due to the way the structure is defined. It is 1438 * done this way so that we can use different sizes for machines 1439 * with different amounts of memory. See the definition of 1440 * xlog_in_core_t in xfs_log_priv.h for details. 1441 */ 1442 ASSERT(log->l_iclog_size >= 4096); 1443 for (i = 0; i < log->l_iclog_bufs; i++) { 1444 int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); 1445 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * 1446 sizeof(struct bio_vec); 1447 1448 iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL); 1449 if (!iclog) 1450 goto out_free_iclog; 1451 1452 *iclogp = iclog; 1453 iclog->ic_prev = prev_iclog; 1454 prev_iclog = iclog; 1455 1456 iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, 1457 KM_MAYFAIL | KM_ZERO); 1458 if (!iclog->ic_data) 1459 goto out_free_iclog; 1460 #ifdef DEBUG 1461 log->l_iclog_bak[i] = &iclog->ic_header; 1462 #endif 1463 head = &iclog->ic_header; 1464 memset(head, 0, sizeof(xlog_rec_header_t)); 1465 head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1466 head->h_version = cpu_to_be32( 1467 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); 1468 head->h_size = cpu_to_be32(log->l_iclog_size); 1469 /* new fields */ 1470 head->h_fmt = cpu_to_be32(XLOG_FMT); 1471 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); 1472 1473 iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; 1474 iclog->ic_state = XLOG_STATE_ACTIVE; 1475 iclog->ic_log = log; 1476 atomic_set(&iclog->ic_refcnt, 0); 1477 spin_lock_init(&iclog->ic_callback_lock); 1478 INIT_LIST_HEAD(&iclog->ic_callbacks); 1479 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; 1480 1481 init_waitqueue_head(&iclog->ic_force_wait); 1482 init_waitqueue_head(&iclog->ic_write_wait); 1483 INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work); 1484 sema_init(&iclog->ic_sema, 1); 1485 1486 iclogp = &iclog->ic_next; 1487 } 1488 *iclogp = log->l_iclog; /* complete ring */ 1489 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1490 1491 log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", 1492 WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, 1493 mp->m_super->s_id); 1494 if (!log->l_ioend_workqueue) 1495 goto out_free_iclog; 1496 1497 error = xlog_cil_init(log); 1498 if (error) 1499 goto out_destroy_workqueue; 1500 return log; 1501 1502 out_destroy_workqueue: 1503 destroy_workqueue(log->l_ioend_workqueue); 1504 out_free_iclog: 1505 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1506 prev_iclog = iclog->ic_next; 1507 kmem_free(iclog->ic_data); 1508 kmem_free(iclog); 1509 if (prev_iclog == log->l_iclog) 1510 break; 1511 } 1512 out_free_log: 1513 kmem_free(log); 1514 out: 1515 return ERR_PTR(error); 1516 } /* xlog_alloc_log */ 1517 1518 1519 /* 1520 * Write out the commit record of a transaction associated with the given 1521 * ticket. Return the lsn of the commit record. 
1522 */ 1523 STATIC int 1524 xlog_commit_record( 1525 struct xlog *log, 1526 struct xlog_ticket *ticket, 1527 struct xlog_in_core **iclog, 1528 xfs_lsn_t *commitlsnp) 1529 { 1530 struct xfs_mount *mp = log->l_mp; 1531 int error; 1532 struct xfs_log_iovec reg = { 1533 .i_addr = NULL, 1534 .i_len = 0, 1535 .i_type = XLOG_REG_TYPE_COMMIT, 1536 }; 1537 struct xfs_log_vec vec = { 1538 .lv_niovecs = 1, 1539 .lv_iovecp = ®, 1540 }; 1541 1542 ASSERT_ALWAYS(iclog); 1543 error = xlog_write(log, &vec, ticket, commitlsnp, iclog, 1544 XLOG_COMMIT_TRANS); 1545 if (error) 1546 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1547 return error; 1548 } 1549 1550 /* 1551 * Push on the buffer cache code if we ever use more than 75% of the on-disk 1552 * log space. This code pushes on the lsn which would supposedly free up 1553 * the 25% which we want to leave free. We may need to adopt a policy which 1554 * pushes on an lsn which is further along in the log once we reach the high 1555 * water mark. In this manner, we would be creating a low water mark. 1556 */ 1557 STATIC void 1558 xlog_grant_push_ail( 1559 struct xlog *log, 1560 int need_bytes) 1561 { 1562 xfs_lsn_t threshold_lsn = 0; 1563 xfs_lsn_t last_sync_lsn; 1564 int free_blocks; 1565 int free_bytes; 1566 int threshold_block; 1567 int threshold_cycle; 1568 int free_threshold; 1569 1570 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1571 1572 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); 1573 free_blocks = BTOBBT(free_bytes); 1574 1575 /* 1576 * Set the threshold for the minimum number of free blocks in the 1577 * log to the maximum of what the caller needs, one quarter of the 1578 * log, and 256 blocks. 1579 */ 1580 free_threshold = BTOBB(need_bytes); 1581 free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); 1582 free_threshold = max(free_threshold, 256); 1583 if (free_blocks >= free_threshold) 1584 return; 1585 1586 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, 1587 &threshold_block); 1588 threshold_block += free_threshold; 1589 if (threshold_block >= log->l_logBBsize) { 1590 threshold_block -= log->l_logBBsize; 1591 threshold_cycle += 1; 1592 } 1593 threshold_lsn = xlog_assign_lsn(threshold_cycle, 1594 threshold_block); 1595 /* 1596 * Don't pass in an lsn greater than the lsn of the last 1597 * log record known to be on disk. Use a snapshot of the last sync lsn 1598 * so that it doesn't change between the compare and the set. 1599 */ 1600 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); 1601 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) 1602 threshold_lsn = last_sync_lsn; 1603 1604 /* 1605 * Get the transaction layer to kick the dirty buffers out to 1606 * disk asynchronously. No point in trying to do this if 1607 * the filesystem is shutting down. 
1608 */ 1609 if (!XLOG_FORCED_SHUTDOWN(log)) 1610 xfs_ail_push(log->l_ailp, threshold_lsn); 1611 } 1612 1613 /* 1614 * Stamp cycle number in every block 1615 */ 1616 STATIC void 1617 xlog_pack_data( 1618 struct xlog *log, 1619 struct xlog_in_core *iclog, 1620 int roundoff) 1621 { 1622 int i, j, k; 1623 int size = iclog->ic_offset + roundoff; 1624 __be32 cycle_lsn; 1625 char *dp; 1626 1627 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 1628 1629 dp = iclog->ic_datap; 1630 for (i = 0; i < BTOBB(size); i++) { 1631 if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) 1632 break; 1633 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; 1634 *(__be32 *)dp = cycle_lsn; 1635 dp += BBSIZE; 1636 } 1637 1638 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 1639 xlog_in_core_2_t *xhdr = iclog->ic_data; 1640 1641 for ( ; i < BTOBB(size); i++) { 1642 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 1643 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 1644 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; 1645 *(__be32 *)dp = cycle_lsn; 1646 dp += BBSIZE; 1647 } 1648 1649 for (i = 1; i < log->l_iclog_heads; i++) 1650 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 1651 } 1652 } 1653 1654 /* 1655 * Calculate the checksum for a log buffer. 1656 * 1657 * This is a little more complicated than it should be because the various 1658 * headers and the actual data are non-contiguous. 1659 */ 1660 __le32 1661 xlog_cksum( 1662 struct xlog *log, 1663 struct xlog_rec_header *rhead, 1664 char *dp, 1665 int size) 1666 { 1667 uint32_t crc; 1668 1669 /* first generate the crc for the record header ... */ 1670 crc = xfs_start_cksum_update((char *)rhead, 1671 sizeof(struct xlog_rec_header), 1672 offsetof(struct xlog_rec_header, h_crc)); 1673 1674 /* ... then for additional cycle data for v2 logs ... */ 1675 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 1676 union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; 1677 int i; 1678 int xheads; 1679 1680 xheads = size / XLOG_HEADER_CYCLE_SIZE; 1681 if (size % XLOG_HEADER_CYCLE_SIZE) 1682 xheads++; 1683 1684 for (i = 1; i < xheads; i++) { 1685 crc = crc32c(crc, &xhdr[i].hic_xheader, 1686 sizeof(struct xlog_rec_ext_header)); 1687 } 1688 } 1689 1690 /* ... and finally for the payload */ 1691 crc = crc32c(crc, dp, size); 1692 1693 return xfs_end_cksum(crc); 1694 } 1695 1696 static void 1697 xlog_bio_end_io( 1698 struct bio *bio) 1699 { 1700 struct xlog_in_core *iclog = bio->bi_private; 1701 1702 queue_work(iclog->ic_log->l_ioend_workqueue, 1703 &iclog->ic_end_io_work); 1704 } 1705 1706 static int 1707 xlog_map_iclog_data( 1708 struct bio *bio, 1709 void *data, 1710 size_t count) 1711 { 1712 do { 1713 struct page *page = kmem_to_page(data); 1714 unsigned int off = offset_in_page(data); 1715 size_t len = min_t(size_t, count, PAGE_SIZE - off); 1716 1717 if (bio_add_page(bio, page, len, off) != len) 1718 return -EIO; 1719 1720 data += len; 1721 count -= len; 1722 } while (count); 1723 1724 return 0; 1725 } 1726 1727 STATIC void 1728 xlog_write_iclog( 1729 struct xlog *log, 1730 struct xlog_in_core *iclog, 1731 uint64_t bno, 1732 unsigned int count, 1733 bool need_flush) 1734 { 1735 ASSERT(bno < log->l_logBBsize); 1736 1737 /* 1738 * We lock the iclogbufs here so that we can serialise against I/O 1739 * completion during unmount. We might be processing a shutdown 1740 * triggered during unmount, and that can occur asynchronously to the 1741 * unmount thread, and hence we need to ensure that completes before 1742 * tearing down the iclogbufs. 
 * Hence we need to hold the buffer lock
 * across the log IO to achieve that.
 */
	down(&iclog->ic_sema);
	if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) {
		/*
		 * It would seem logical to return EIO here, but we rely on
		 * the log state machine to propagate I/O errors instead of
		 * doing it here.  We kick off the state machine and unlock
		 * the buffer manually, the code needs to be kept in sync
		 * with the I/O completion path.
		 */
		xlog_state_done_syncing(iclog);
		up(&iclog->ic_sema);
		return;
	}

	bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
	bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
	iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
	iclog->ic_bio.bi_end_io = xlog_bio_end_io;
	iclog->ic_bio.bi_private = iclog;
	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
	if (need_flush)
		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;

	if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
		return;
	}
	if (is_vmalloc_addr(iclog->ic_data))
		flush_kernel_vmap_range(iclog->ic_data, count);

	/*
	 * If this log buffer would straddle the end of the log we will have
	 * to split it up into two bios, so that we can continue at the start.
	 */
	if (bno + BTOBB(count) > log->l_logBBsize) {
		struct bio *split;

		split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
				  GFP_NOIO, &fs_bio_set);
		bio_chain(split, &iclog->ic_bio);
		submit_bio(split);

		/* restart at logical offset zero for the remainder */
		iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
	}

	submit_bio(&iclog->ic_bio);
}

/*
 * We need to bump cycle number for the part of the iclog that is
 * written to the start of the log. Watch out for the header magic
 * number case, though.
 */
static void
xlog_split_iclog(
	struct xlog		*log,
	void			*data,
	uint64_t		bno,
	unsigned int		count)
{
	unsigned int		split_offset = BBTOB(log->l_logBBsize - bno);
	unsigned int		i;

	for (i = split_offset; i < count; i += BBSIZE) {
		uint32_t cycle = get_unaligned_be32(data + i);

		if (++cycle == XLOG_HEADER_MAGIC_NUM)
			cycle++;
		put_unaligned_be32(cycle, data + i);
	}
}

static int
xlog_calc_iclog_size(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	uint32_t		*roundoff)
{
	uint32_t		count_init, count;
	bool			use_lsunit;

	use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
			log->l_mp->m_sb.sb_logsunit > 1;

	/* Add for LR header */
	count_init = log->l_iclog_hsize + iclog->ic_offset;

	/* Round out the log write size */
	if (use_lsunit) {
		/* we have a v2 stripe unit to use */
		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
	} else {
		count = BBTOB(BTOBB(count_init));
	}

	ASSERT(count >= count_init);
	*roundoff = count - count_init;

	if (use_lsunit)
		ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
	else
		ASSERT(*roundoff < BBTOB(1));
	return count;
}

/*
 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
 * fashion.  Previously, we should have moved the current iclog
 * ptr in the log to point to the next available iclog.  This allows further
 * write to continue while this code syncs out an iclog ready to go.
 * Before an in-core log can be written out, the data section must be scanned
 * to save away the 1st word of each BBSIZE block into the header.  We replace
 * it with the current cycle count.  Each BBSIZE block is tagged with the
 * cycle count because there is an implicit assumption that drives will
 * guarantee that entire 512 byte blocks get written at once.  In other words,
 * we can't have part of a 512 byte block written and part not written.  By
 * tagging each block, we will know which blocks are valid when recovering
 * after an unclean shutdown.
 *
 * This routine is single threaded on the iclog.  No other thread can be in
 * this routine with the same iclog.  Changing contents of iclog can there-
 * fore be done without grabbing the state machine lock.  Updating the global
 * log will require grabbing the lock though.
 *
 * The entire log manager uses a logical block numbering scheme.  Only
 * xlog_write_iclog knows about the fact that the log may not start with
 * block zero on a given device.
 */
STATIC void
xlog_sync(
	struct xlog		*log,
	struct xlog_in_core	*iclog)
{
	unsigned int		count;		/* byte count of bwrite */
	unsigned int		roundoff;	/* roundoff to BB or stripe */
	uint64_t		bno;
	unsigned int		size;
	bool			need_flush = true, split = false;

	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);

	count = xlog_calc_iclog_size(log, iclog, &roundoff);

	/* move grant heads by roundoff in sync */
	xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
	xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);

	/* put cycle number in every block */
	xlog_pack_data(log, iclog, roundoff);

	/* real byte length */
	size = iclog->ic_offset;
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
		size += roundoff;
	iclog->ic_header.h_len = cpu_to_be32(size);

	XFS_STATS_INC(log->l_mp, xs_log_writes);
	XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));

	bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));

	/* Do we need to split this write into 2 parts? */
	if (bno + BTOBB(count) > log->l_logBBsize) {
		xlog_split_iclog(log, &iclog->ic_header, bno, count);
		split = true;
	}

	/* calculate the checksum */
	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
					    iclog->ic_datap, size);
	/*
	 * Intentionally corrupt the log record CRC based on the error injection
	 * frequency, if defined. This facilitates testing log recovery in the
	 * event of torn writes. Hence, set the IOABORT state to abort the log
	 * write on I/O completion and shutdown the fs. The subsequent mount
	 * detects the bad CRC and attempts to recover.
	 */
#ifdef DEBUG
	if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
		iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
		iclog->ic_fail_crc = true;
		xfs_warn(log->l_mp,
	"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
			 be64_to_cpu(iclog->ic_header.h_lsn));
	}
#endif

	/*
	 * Flush the data device before flushing the log to make sure all meta
	 * data written back from the AIL actually made it to disk before
	 * stamping the new log tail LSN into the log buffer.
For an external 1937 * log we need to issue the flush explicitly, and unfortunately 1938 * synchronously here; for an internal log we can simply use the block 1939 * layer state machine for preflushes. 1940 */ 1941 if (log->l_targ != log->l_mp->m_ddev_targp || split) { 1942 xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); 1943 need_flush = false; 1944 } 1945 1946 xlog_verify_iclog(log, iclog, count); 1947 xlog_write_iclog(log, iclog, bno, count, need_flush); 1948 } 1949 1950 /* 1951 * Deallocate a log structure 1952 */ 1953 STATIC void 1954 xlog_dealloc_log( 1955 struct xlog *log) 1956 { 1957 xlog_in_core_t *iclog, *next_iclog; 1958 int i; 1959 1960 xlog_cil_destroy(log); 1961 1962 /* 1963 * Cycle all the iclogbuf locks to make sure all log IO completion 1964 * is done before we tear down these buffers. 1965 */ 1966 iclog = log->l_iclog; 1967 for (i = 0; i < log->l_iclog_bufs; i++) { 1968 down(&iclog->ic_sema); 1969 up(&iclog->ic_sema); 1970 iclog = iclog->ic_next; 1971 } 1972 1973 iclog = log->l_iclog; 1974 for (i = 0; i < log->l_iclog_bufs; i++) { 1975 next_iclog = iclog->ic_next; 1976 kmem_free(iclog->ic_data); 1977 kmem_free(iclog); 1978 iclog = next_iclog; 1979 } 1980 1981 log->l_mp->m_log = NULL; 1982 destroy_workqueue(log->l_ioend_workqueue); 1983 kmem_free(log); 1984 } /* xlog_dealloc_log */ 1985 1986 /* 1987 * Update counters atomically now that memcpy is done. 1988 */ 1989 static inline void 1990 xlog_state_finish_copy( 1991 struct xlog *log, 1992 struct xlog_in_core *iclog, 1993 int record_cnt, 1994 int copy_bytes) 1995 { 1996 lockdep_assert_held(&log->l_icloglock); 1997 1998 be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); 1999 iclog->ic_offset += copy_bytes; 2000 } 2001 2002 /* 2003 * print out info relating to regions written which consume 2004 * the reservation 2005 */ 2006 void 2007 xlog_print_tic_res( 2008 struct xfs_mount *mp, 2009 struct xlog_ticket *ticket) 2010 { 2011 uint i; 2012 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 2013 2014 /* match with XLOG_REG_TYPE_* in xfs_log.h */ 2015 #define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str 2016 static char *res_type_str[] = { 2017 REG_TYPE_STR(BFORMAT, "bformat"), 2018 REG_TYPE_STR(BCHUNK, "bchunk"), 2019 REG_TYPE_STR(EFI_FORMAT, "efi_format"), 2020 REG_TYPE_STR(EFD_FORMAT, "efd_format"), 2021 REG_TYPE_STR(IFORMAT, "iformat"), 2022 REG_TYPE_STR(ICORE, "icore"), 2023 REG_TYPE_STR(IEXT, "iext"), 2024 REG_TYPE_STR(IBROOT, "ibroot"), 2025 REG_TYPE_STR(ILOCAL, "ilocal"), 2026 REG_TYPE_STR(IATTR_EXT, "iattr_ext"), 2027 REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), 2028 REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), 2029 REG_TYPE_STR(QFORMAT, "qformat"), 2030 REG_TYPE_STR(DQUOT, "dquot"), 2031 REG_TYPE_STR(QUOTAOFF, "quotaoff"), 2032 REG_TYPE_STR(LRHEADER, "LR header"), 2033 REG_TYPE_STR(UNMOUNT, "unmount"), 2034 REG_TYPE_STR(COMMIT, "commit"), 2035 REG_TYPE_STR(TRANSHDR, "trans header"), 2036 REG_TYPE_STR(ICREATE, "inode create"), 2037 REG_TYPE_STR(RUI_FORMAT, "rui_format"), 2038 REG_TYPE_STR(RUD_FORMAT, "rud_format"), 2039 REG_TYPE_STR(CUI_FORMAT, "cui_format"), 2040 REG_TYPE_STR(CUD_FORMAT, "cud_format"), 2041 REG_TYPE_STR(BUI_FORMAT, "bui_format"), 2042 REG_TYPE_STR(BUD_FORMAT, "bud_format"), 2043 }; 2044 BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); 2045 #undef REG_TYPE_STR 2046 2047 xfs_warn(mp, "ticket reservation summary:"); 2048 xfs_warn(mp, " unit res = %d bytes", 2049 ticket->t_unit_res); 2050 xfs_warn(mp, " current res = %d bytes", 2051 ticket->t_curr_res); 
2052 xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", 2053 ticket->t_res_arr_sum, ticket->t_res_o_flow); 2054 xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", 2055 ticket->t_res_num_ophdrs, ophdr_spc); 2056 xfs_warn(mp, " ophdr + reg = %u bytes", 2057 ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); 2058 xfs_warn(mp, " num regions = %u", 2059 ticket->t_res_num); 2060 2061 for (i = 0; i < ticket->t_res_num; i++) { 2062 uint r_type = ticket->t_res_arr[i].r_type; 2063 xfs_warn(mp, "region[%u]: %s - %u bytes", i, 2064 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 2065 "bad-rtype" : res_type_str[r_type]), 2066 ticket->t_res_arr[i].r_len); 2067 } 2068 } 2069 2070 /* 2071 * Print a summary of the transaction. 2072 */ 2073 void 2074 xlog_print_trans( 2075 struct xfs_trans *tp) 2076 { 2077 struct xfs_mount *mp = tp->t_mountp; 2078 struct xfs_log_item *lip; 2079 2080 /* dump core transaction and ticket info */ 2081 xfs_warn(mp, "transaction summary:"); 2082 xfs_warn(mp, " log res = %d", tp->t_log_res); 2083 xfs_warn(mp, " log count = %d", tp->t_log_count); 2084 xfs_warn(mp, " flags = 0x%x", tp->t_flags); 2085 2086 xlog_print_tic_res(mp, tp->t_ticket); 2087 2088 /* dump each log item */ 2089 list_for_each_entry(lip, &tp->t_items, li_trans) { 2090 struct xfs_log_vec *lv = lip->li_lv; 2091 struct xfs_log_iovec *vec; 2092 int i; 2093 2094 xfs_warn(mp, "log item: "); 2095 xfs_warn(mp, " type = 0x%x", lip->li_type); 2096 xfs_warn(mp, " flags = 0x%lx", lip->li_flags); 2097 if (!lv) 2098 continue; 2099 xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); 2100 xfs_warn(mp, " size = %d", lv->lv_size); 2101 xfs_warn(mp, " bytes = %d", lv->lv_bytes); 2102 xfs_warn(mp, " buf len = %d", lv->lv_buf_len); 2103 2104 /* dump each iovec for the log item */ 2105 vec = lv->lv_iovecp; 2106 for (i = 0; i < lv->lv_niovecs; i++) { 2107 int dumplen = min(vec->i_len, 32); 2108 2109 xfs_warn(mp, " iovec[%d]", i); 2110 xfs_warn(mp, " type = 0x%x", vec->i_type); 2111 xfs_warn(mp, " len = %d", vec->i_len); 2112 xfs_warn(mp, " first %d bytes of iovec[%d]:", dumplen, i); 2113 xfs_hex_dump(vec->i_addr, dumplen); 2114 2115 vec++; 2116 } 2117 } 2118 } 2119 2120 /* 2121 * Calculate the potential space needed by the log vector. Each region gets 2122 * its own xlog_op_header_t and may need to be double word aligned. 2123 */ 2124 static int 2125 xlog_write_calc_vec_length( 2126 struct xlog_ticket *ticket, 2127 struct xfs_log_vec *log_vector) 2128 { 2129 struct xfs_log_vec *lv; 2130 int headers = 0; 2131 int len = 0; 2132 int i; 2133 2134 /* acct for start rec of xact */ 2135 if (ticket->t_flags & XLOG_TIC_INITED) 2136 headers++; 2137 2138 for (lv = log_vector; lv; lv = lv->lv_next) { 2139 /* we don't write ordered log vectors */ 2140 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) 2141 continue; 2142 2143 headers += lv->lv_niovecs; 2144 2145 for (i = 0; i < lv->lv_niovecs; i++) { 2146 struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; 2147 2148 len += vecp->i_len; 2149 xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); 2150 } 2151 } 2152 2153 ticket->t_res_num_ophdrs += headers; 2154 len += headers * sizeof(struct xlog_op_header); 2155 2156 return len; 2157 } 2158 2159 /* 2160 * If first write for transaction, insert start record We can't be trying to 2161 * commit if we are inited. We can't have any "partial_copy" if we are inited. 
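* In other words, XLOG_TIC_INITED is only set on a ticket that has not yet written anything; the start record is emitted exactly once, on the ticket's first write, and the flag is cleared below so later writes skip it.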
2162 */ 2163 static int 2164 xlog_write_start_rec( 2165 struct xlog_op_header *ophdr, 2166 struct xlog_ticket *ticket) 2167 { 2168 if (!(ticket->t_flags & XLOG_TIC_INITED)) 2169 return 0; 2170 2171 ophdr->oh_tid = cpu_to_be32(ticket->t_tid); 2172 ophdr->oh_clientid = ticket->t_clientid; 2173 ophdr->oh_len = 0; 2174 ophdr->oh_flags = XLOG_START_TRANS; 2175 ophdr->oh_res2 = 0; 2176 2177 ticket->t_flags &= ~XLOG_TIC_INITED; 2178 2179 return sizeof(struct xlog_op_header); 2180 } 2181 2182 static xlog_op_header_t * 2183 xlog_write_setup_ophdr( 2184 struct xlog *log, 2185 struct xlog_op_header *ophdr, 2186 struct xlog_ticket *ticket, 2187 uint flags) 2188 { 2189 ophdr->oh_tid = cpu_to_be32(ticket->t_tid); 2190 ophdr->oh_clientid = ticket->t_clientid; 2191 ophdr->oh_res2 = 0; 2192 2193 /* are we copying a commit or unmount record? */ 2194 ophdr->oh_flags = flags; 2195 2196 /* 2197 * We've seen logs corrupted with bad transaction client ids. This 2198 * makes sure that XFS doesn't generate them on. Turn this into an EIO 2199 * and shut down the filesystem. 2200 */ 2201 switch (ophdr->oh_clientid) { 2202 case XFS_TRANSACTION: 2203 case XFS_VOLUME: 2204 case XFS_LOG: 2205 break; 2206 default: 2207 xfs_warn(log->l_mp, 2208 "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, 2209 ophdr->oh_clientid, ticket); 2210 return NULL; 2211 } 2212 2213 return ophdr; 2214 } 2215 2216 /* 2217 * Set up the parameters of the region copy into the log. This has 2218 * to handle region write split across multiple log buffers - this 2219 * state is kept external to this function so that this code can 2220 * be written in an obvious, self documenting manner. 2221 */ 2222 static int 2223 xlog_write_setup_copy( 2224 struct xlog_ticket *ticket, 2225 struct xlog_op_header *ophdr, 2226 int space_available, 2227 int space_required, 2228 int *copy_off, 2229 int *copy_len, 2230 int *last_was_partial_copy, 2231 int *bytes_consumed) 2232 { 2233 int still_to_copy; 2234 2235 still_to_copy = space_required - *bytes_consumed; 2236 *copy_off = *bytes_consumed; 2237 2238 if (still_to_copy <= space_available) { 2239 /* write of region completes here */ 2240 *copy_len = still_to_copy; 2241 ophdr->oh_len = cpu_to_be32(*copy_len); 2242 if (*last_was_partial_copy) 2243 ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); 2244 *last_was_partial_copy = 0; 2245 *bytes_consumed = 0; 2246 return 0; 2247 } 2248 2249 /* partial write of region, needs extra log op header reservation */ 2250 *copy_len = space_available; 2251 ophdr->oh_len = cpu_to_be32(*copy_len); 2252 ophdr->oh_flags |= XLOG_CONTINUE_TRANS; 2253 if (*last_was_partial_copy) 2254 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; 2255 *bytes_consumed += *copy_len; 2256 (*last_was_partial_copy)++; 2257 2258 /* account for new log op header */ 2259 ticket->t_curr_res -= sizeof(struct xlog_op_header); 2260 ticket->t_res_num_ophdrs++; 2261 2262 return sizeof(struct xlog_op_header); 2263 } 2264 2265 static int 2266 xlog_write_copy_finish( 2267 struct xlog *log, 2268 struct xlog_in_core *iclog, 2269 uint flags, 2270 int *record_cnt, 2271 int *data_cnt, 2272 int *partial_copy, 2273 int *partial_copy_len, 2274 int log_offset, 2275 struct xlog_in_core **commit_iclog) 2276 { 2277 int error; 2278 2279 if (*partial_copy) { 2280 /* 2281 * This iclog has already been marked WANT_SYNC by 2282 * xlog_state_get_iclog_space. 
2283 */ 2284 spin_lock(&log->l_icloglock); 2285 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); 2286 *record_cnt = 0; 2287 *data_cnt = 0; 2288 goto release_iclog; 2289 } 2290 2291 *partial_copy = 0; 2292 *partial_copy_len = 0; 2293 2294 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { 2295 /* no more space in this iclog - push it. */ 2296 spin_lock(&log->l_icloglock); 2297 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); 2298 *record_cnt = 0; 2299 *data_cnt = 0; 2300 2301 if (iclog->ic_state == XLOG_STATE_ACTIVE) 2302 xlog_state_switch_iclogs(log, iclog, 0); 2303 else 2304 ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || 2305 iclog->ic_state == XLOG_STATE_IOERROR); 2306 if (!commit_iclog) 2307 goto release_iclog; 2308 spin_unlock(&log->l_icloglock); 2309 ASSERT(flags & XLOG_COMMIT_TRANS); 2310 *commit_iclog = iclog; 2311 } 2312 2313 return 0; 2314 2315 release_iclog: 2316 error = xlog_state_release_iclog(log, iclog); 2317 spin_unlock(&log->l_icloglock); 2318 return error; 2319 } 2320 2321 /* 2322 * Write some region out to in-core log 2323 * 2324 * This will be called when writing externally provided regions or when 2325 * writing out a commit record for a given transaction. 2326 * 2327 * General algorithm: 2328 * 1. Find total length of this write. This may include adding to the 2329 * lengths passed in. 2330 * 2. Check whether we violate the tickets reservation. 2331 * 3. While writing to this iclog 2332 * A. Reserve as much space in this iclog as can get 2333 * B. If this is first write, save away start lsn 2334 * C. While writing this region: 2335 * 1. If first write of transaction, write start record 2336 * 2. Write log operation header (header per region) 2337 * 3. Find out if we can fit entire region into this iclog 2338 * 4. Potentially, verify destination memcpy ptr 2339 * 5. Memcpy (partial) region 2340 * 6. If partial copy, release iclog; otherwise, continue 2341 * copying more regions into current iclog 2342 * 4. Mark want sync bit (in simulation mode) 2343 * 5. Release iclog for potential flush to on-disk log. 2344 * 2345 * ERRORS: 2346 * 1. Panic if reservation is overrun. This should never happen since 2347 * reservation amounts are generated internal to the filesystem. 2348 * NOTES: 2349 * 1. Tickets are single threaded data structures. 2350 * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the 2351 * syncing routine. When a single log_write region needs to span 2352 * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set 2353 * on all log operation writes which don't contain the end of the 2354 * region. The XLOG_END_TRANS bit is used for the in-core log 2355 * operation which contains the end of the continued log_write region. 2356 * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog, 2357 * we don't really know exactly how much space will be used. As a result, 2358 * we don't update ic_offset until the end when we know exactly how many 2359 * bytes have been written out. 
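* As a worked illustration (hypothetical sizes, not taken from the code): writing a single 280 byte region when only 100 bytes of payload space remain in the current iclog emits an op header flagged XLOG_CONTINUE_TRANS with the first 100 bytes, releases that iclog, then writes the remaining 180 bytes into the next iclog under a second op header flagged XLOG_WAS_CONT_TRANS | XLOG_END_TRANS.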
2360 */ 2361 int 2362 xlog_write( 2363 struct xlog *log, 2364 struct xfs_log_vec *log_vector, 2365 struct xlog_ticket *ticket, 2366 xfs_lsn_t *start_lsn, 2367 struct xlog_in_core **commit_iclog, 2368 uint flags) 2369 { 2370 struct xlog_in_core *iclog = NULL; 2371 struct xfs_log_iovec *vecp; 2372 struct xfs_log_vec *lv; 2373 int len; 2374 int index; 2375 int partial_copy = 0; 2376 int partial_copy_len = 0; 2377 int contwr = 0; 2378 int record_cnt = 0; 2379 int data_cnt = 0; 2380 int error = 0; 2381 2382 *start_lsn = 0; 2383 2384 len = xlog_write_calc_vec_length(ticket, log_vector); 2385 2386 /* 2387 * Region headers and bytes are already accounted for. 2388 * We only need to take into account start records and 2389 * split regions in this function. 2390 */ 2391 if (ticket->t_flags & XLOG_TIC_INITED) 2392 ticket->t_curr_res -= sizeof(xlog_op_header_t); 2393 2394 /* 2395 * Commit record headers need to be accounted for. These 2396 * come in as separate writes so are easy to detect. 2397 */ 2398 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) 2399 ticket->t_curr_res -= sizeof(xlog_op_header_t); 2400 2401 if (ticket->t_curr_res < 0) { 2402 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, 2403 "ctx ticket reservation ran out. Need to up reservation"); 2404 xlog_print_tic_res(log->l_mp, ticket); 2405 xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); 2406 } 2407 2408 index = 0; 2409 lv = log_vector; 2410 vecp = lv->lv_iovecp; 2411 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { 2412 void *ptr; 2413 int log_offset; 2414 2415 error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 2416 &contwr, &log_offset); 2417 if (error) 2418 return error; 2419 2420 ASSERT(log_offset <= iclog->ic_size - 1); 2421 ptr = iclog->ic_datap + log_offset; 2422 2423 /* start_lsn is the first lsn written to. That's all we need. */ 2424 if (!*start_lsn) 2425 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2426 2427 /* 2428 * This loop writes out as many regions as can fit in the amount 2429 * of space which was allocated by xlog_state_get_iclog_space(). 2430 */ 2431 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { 2432 struct xfs_log_iovec *reg; 2433 struct xlog_op_header *ophdr; 2434 int start_rec_copy; 2435 int copy_len; 2436 int copy_off; 2437 bool ordered = false; 2438 2439 /* ordered log vectors have no regions to write */ 2440 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { 2441 ASSERT(lv->lv_niovecs == 0); 2442 ordered = true; 2443 goto next_lv; 2444 } 2445 2446 reg = &vecp[index]; 2447 ASSERT(reg->i_len % sizeof(int32_t) == 0); 2448 ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); 2449 2450 start_rec_copy = xlog_write_start_rec(ptr, ticket); 2451 if (start_rec_copy) { 2452 record_cnt++; 2453 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2454 start_rec_copy); 2455 } 2456 2457 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); 2458 if (!ophdr) 2459 return -EIO; 2460 2461 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2462 sizeof(struct xlog_op_header)); 2463 2464 len += xlog_write_setup_copy(ticket, ophdr, 2465 iclog->ic_size-log_offset, 2466 reg->i_len, 2467 &copy_off, &copy_len, 2468 &partial_copy, 2469 &partial_copy_len); 2470 xlog_verify_dest_ptr(log, ptr); 2471 2472 /* 2473 * Copy region. 2474 * 2475 * Unmount records just log an opheader, so can have 2476 * empty payloads with no data region to copy. Hence we 2477 * only copy the payload if the vector says it has data 2478 * to copy.
2479 */ 2480 ASSERT(copy_len >= 0); 2481 if (copy_len > 0) { 2482 memcpy(ptr, reg->i_addr + copy_off, copy_len); 2483 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2484 copy_len); 2485 } 2486 copy_len += start_rec_copy + sizeof(xlog_op_header_t); 2487 record_cnt++; 2488 data_cnt += contwr ? copy_len : 0; 2489 2490 error = xlog_write_copy_finish(log, iclog, flags, 2491 &record_cnt, &data_cnt, 2492 &partial_copy, 2493 &partial_copy_len, 2494 log_offset, 2495 commit_iclog); 2496 if (error) 2497 return error; 2498 2499 /* 2500 * if we had a partial copy, we need to get more iclog 2501 * space but we don't want to increment the region 2502 * index because there is still more is this region to 2503 * write. 2504 * 2505 * If we completed writing this region, and we flushed 2506 * the iclog (indicated by resetting of the record 2507 * count), then we also need to get more log space. If 2508 * this was the last record, though, we are done and 2509 * can just return. 2510 */ 2511 if (partial_copy) 2512 break; 2513 2514 if (++index == lv->lv_niovecs) { 2515 next_lv: 2516 lv = lv->lv_next; 2517 index = 0; 2518 if (lv) 2519 vecp = lv->lv_iovecp; 2520 } 2521 if (record_cnt == 0 && !ordered) { 2522 if (!lv) 2523 return 0; 2524 break; 2525 } 2526 } 2527 } 2528 2529 ASSERT(len == 0); 2530 2531 spin_lock(&log->l_icloglock); 2532 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 2533 if (commit_iclog) { 2534 ASSERT(flags & XLOG_COMMIT_TRANS); 2535 *commit_iclog = iclog; 2536 } else { 2537 error = xlog_state_release_iclog(log, iclog); 2538 } 2539 spin_unlock(&log->l_icloglock); 2540 2541 return error; 2542 } 2543 2544 2545 /***************************************************************************** 2546 * 2547 * State Machine functions 2548 * 2549 ***************************************************************************** 2550 */ 2551 2552 static void 2553 xlog_state_activate_iclog( 2554 struct xlog_in_core *iclog, 2555 int *iclogs_changed) 2556 { 2557 ASSERT(list_empty_careful(&iclog->ic_callbacks)); 2558 2559 /* 2560 * If the number of ops in this iclog indicate it just contains the 2561 * dummy transaction, we can change state into IDLE (the second time 2562 * around). Otherwise we should change the state into NEED a dummy. 2563 * We don't need to cover the dummy. 2564 */ 2565 if (*iclogs_changed == 0 && 2566 iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { 2567 *iclogs_changed = 1; 2568 } else { 2569 /* 2570 * We have two dirty iclogs so start over. This could also be 2571 * num of ops indicating this is not the dummy going out. 2572 */ 2573 *iclogs_changed = 2; 2574 } 2575 2576 iclog->ic_state = XLOG_STATE_ACTIVE; 2577 iclog->ic_offset = 0; 2578 iclog->ic_header.h_num_logops = 0; 2579 memset(iclog->ic_header.h_cycle_data, 0, 2580 sizeof(iclog->ic_header.h_cycle_data)); 2581 iclog->ic_header.h_lsn = 0; 2582 } 2583 2584 /* 2585 * Loop through all iclogs and mark all iclogs currently marked DIRTY as 2586 * ACTIVE after iclog I/O has completed. 2587 */ 2588 static void 2589 xlog_state_activate_iclogs( 2590 struct xlog *log, 2591 int *iclogs_changed) 2592 { 2593 struct xlog_in_core *iclog = log->l_iclog; 2594 2595 do { 2596 if (iclog->ic_state == XLOG_STATE_DIRTY) 2597 xlog_state_activate_iclog(iclog, iclogs_changed); 2598 /* 2599 * The ordering of marking iclogs ACTIVE must be maintained, so 2600 * an iclog doesn't become ACTIVE beyond one that is SYNCING. 
2601 */ 2602 else if (iclog->ic_state != XLOG_STATE_ACTIVE) 2603 break; 2604 } while ((iclog = iclog->ic_next) != log->l_iclog); 2605 } 2606 2607 static int 2608 xlog_covered_state( 2609 int prev_state, 2610 int iclogs_changed) 2611 { 2612 /* 2613 * We usually go to NEED. But we go to NEED2 if the changed count indicates we 2614 * are done writing the dummy record. If we are done with the second 2615 * dummy record (DONE2), then we go to IDLE. 2616 */ 2617 switch (prev_state) { 2618 case XLOG_STATE_COVER_IDLE: 2619 case XLOG_STATE_COVER_NEED: 2620 case XLOG_STATE_COVER_NEED2: 2621 break; 2622 case XLOG_STATE_COVER_DONE: 2623 if (iclogs_changed == 1) 2624 return XLOG_STATE_COVER_NEED2; 2625 break; 2626 case XLOG_STATE_COVER_DONE2: 2627 if (iclogs_changed == 1) 2628 return XLOG_STATE_COVER_IDLE; 2629 break; 2630 default: 2631 ASSERT(0); 2632 } 2633 2634 return XLOG_STATE_COVER_NEED; 2635 } 2636 2637 STATIC void 2638 xlog_state_clean_iclog( 2639 struct xlog *log, 2640 struct xlog_in_core *dirty_iclog) 2641 { 2642 int iclogs_changed = 0; 2643 2644 dirty_iclog->ic_state = XLOG_STATE_DIRTY; 2645 2646 xlog_state_activate_iclogs(log, &iclogs_changed); 2647 wake_up_all(&dirty_iclog->ic_force_wait); 2648 2649 if (iclogs_changed) { 2650 log->l_covered_state = xlog_covered_state(log->l_covered_state, 2651 iclogs_changed); 2652 } 2653 } 2654 2655 STATIC xfs_lsn_t 2656 xlog_get_lowest_lsn( 2657 struct xlog *log) 2658 { 2659 struct xlog_in_core *iclog = log->l_iclog; 2660 xfs_lsn_t lowest_lsn = 0, lsn; 2661 2662 do { 2663 if (iclog->ic_state == XLOG_STATE_ACTIVE || 2664 iclog->ic_state == XLOG_STATE_DIRTY) 2665 continue; 2666 2667 lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2668 if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) 2669 lowest_lsn = lsn; 2670 } while ((iclog = iclog->ic_next) != log->l_iclog); 2671 2672 return lowest_lsn; 2673 } 2674 2675 /* 2676 * Completion of an iclog IO does not imply that a transaction has completed, as 2677 * transactions can be large enough to span many iclogs. We cannot change the 2678 * tail of the log half way through a transaction as this may be the only 2679 * transaction in the log and moving the tail to point to the middle of it 2680 * will prevent recovery from finding the start of the transaction. Hence we 2681 * should only update the last_sync_lsn if this iclog contains transaction 2682 * completion callbacks on it. 2683 * 2684 * We have to do this before we drop the icloglock to ensure we are the only one 2685 * that can update it. 2686 * 2687 * If we are moving the last_sync_lsn forwards, we also need to ensure we kick 2688 * the reservation grant head pushing. This is due to the fact that the push 2689 * target is bound by the current last_sync_lsn value. Hence if we have a large 2690 * amount of log space bound up in this committing transaction then the 2691 * last_sync_lsn value may be the limiting factor preventing tail pushing from 2692 * freeing space in the log. Hence once we've updated the last_sync_lsn we 2693 * should push the AIL to ensure the push target (and hence the grant head) is 2694 * no longer bound by the old log head location and can move forwards and make 2695 * progress again.
2696 */ 2697 static void 2698 xlog_state_set_callback( 2699 struct xlog *log, 2700 struct xlog_in_core *iclog, 2701 xfs_lsn_t header_lsn) 2702 { 2703 iclog->ic_state = XLOG_STATE_CALLBACK; 2704 2705 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2706 header_lsn) <= 0); 2707 2708 if (list_empty_careful(&iclog->ic_callbacks)) 2709 return; 2710 2711 atomic64_set(&log->l_last_sync_lsn, header_lsn); 2712 xlog_grant_push_ail(log, 0); 2713 } 2714 2715 /* 2716 * Return true if we need to stop processing, false to continue to the next 2717 * iclog. The caller will need to run callbacks if the iclog is returned in the 2718 * XLOG_STATE_CALLBACK state. 2719 */ 2720 static bool 2721 xlog_state_iodone_process_iclog( 2722 struct xlog *log, 2723 struct xlog_in_core *iclog, 2724 bool *ioerror) 2725 { 2726 xfs_lsn_t lowest_lsn; 2727 xfs_lsn_t header_lsn; 2728 2729 switch (iclog->ic_state) { 2730 case XLOG_STATE_ACTIVE: 2731 case XLOG_STATE_DIRTY: 2732 /* 2733 * Skip all iclogs in the ACTIVE & DIRTY states: 2734 */ 2735 return false; 2736 case XLOG_STATE_IOERROR: 2737 /* 2738 * Between marking a filesystem SHUTDOWN and stopping the log, 2739 * we do flush all iclogs to disk (if there wasn't a log I/O 2740 * error). So, we do want things to go smoothly in case of just 2741 * a SHUTDOWN w/o a LOG_IO_ERROR. 2742 */ 2743 *ioerror = true; 2744 return false; 2745 case XLOG_STATE_DONE_SYNC: 2746 /* 2747 * Now that we have an iclog that is in the DONE_SYNC state, do 2748 * one more check here to see if we have chased our tail around. 2749 * If this is not the lowest lsn iclog, then we will leave it 2750 * for another completion to process. 2751 */ 2752 header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2753 lowest_lsn = xlog_get_lowest_lsn(log); 2754 if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) 2755 return false; 2756 xlog_state_set_callback(log, iclog, header_lsn); 2757 return false; 2758 default: 2759 /* 2760 * Can only perform callbacks in order. Since this iclog is not 2761 * in the DONE_SYNC state, we skip the rest and just try to 2762 * clean up. 2763 */ 2764 return true; 2765 } 2766 } 2767 2768 /* 2769 * Keep processing entries in the iclog callback list until we come around and 2770 * it is empty. We need to atomically see that the list is empty and change the 2771 * state to DIRTY so that we don't miss any more callbacks being added. 2772 * 2773 * This function is called with the icloglock held and returns with it held. We 2774 * drop it while running callbacks, however, as holding it over thousands of 2775 * callbacks is unnecessary and causes excessive contention if we do. 2776 */ 2777 static void 2778 xlog_state_do_iclog_callbacks( 2779 struct xlog *log, 2780 struct xlog_in_core *iclog) 2781 __releases(&log->l_icloglock) 2782 __acquires(&log->l_icloglock) 2783 { 2784 spin_unlock(&log->l_icloglock); 2785 spin_lock(&iclog->ic_callback_lock); 2786 while (!list_empty(&iclog->ic_callbacks)) { 2787 LIST_HEAD(tmp); 2788 2789 list_splice_init(&iclog->ic_callbacks, &tmp); 2790 2791 spin_unlock(&iclog->ic_callback_lock); 2792 xlog_cil_process_committed(&tmp); 2793 spin_lock(&iclog->ic_callback_lock); 2794 } 2795 2796 /* 2797 * Pick up the icloglock while still holding the callback lock so we 2798 * serialise against anyone trying to add more callbacks to this iclog 2799 * now we've finished processing. 
2800 */ 2801 spin_lock(&log->l_icloglock); 2802 spin_unlock(&iclog->ic_callback_lock); 2803 } 2804 2805 STATIC void 2806 xlog_state_do_callback( 2807 struct xlog *log) 2808 { 2809 struct xlog_in_core *iclog; 2810 struct xlog_in_core *first_iclog; 2811 bool cycled_icloglock; 2812 bool ioerror; 2813 int flushcnt = 0; 2814 int repeats = 0; 2815 2816 spin_lock(&log->l_icloglock); 2817 do { 2818 /* 2819 * Scan all iclogs starting with the one pointed to by the 2820 * log. Reset this starting point each time the log is 2821 * unlocked (during callbacks). 2822 * 2823 * Keep looping through iclogs until one full pass is made 2824 * without running any callbacks. 2825 */ 2826 first_iclog = log->l_iclog; 2827 iclog = log->l_iclog; 2828 cycled_icloglock = false; 2829 ioerror = false; 2830 repeats++; 2831 2832 do { 2833 if (xlog_state_iodone_process_iclog(log, iclog, 2834 &ioerror)) 2835 break; 2836 2837 if (iclog->ic_state != XLOG_STATE_CALLBACK && 2838 iclog->ic_state != XLOG_STATE_IOERROR) { 2839 iclog = iclog->ic_next; 2840 continue; 2841 } 2842 2843 /* 2844 * Running callbacks will drop the icloglock which means 2845 * we'll have to run at least one more complete loop. 2846 */ 2847 cycled_icloglock = true; 2848 xlog_state_do_iclog_callbacks(log, iclog); 2849 if (XLOG_FORCED_SHUTDOWN(log)) 2850 wake_up_all(&iclog->ic_force_wait); 2851 else 2852 xlog_state_clean_iclog(log, iclog); 2853 iclog = iclog->ic_next; 2854 } while (first_iclog != iclog); 2855 2856 if (repeats > 5000) { 2857 flushcnt += repeats; 2858 repeats = 0; 2859 xfs_warn(log->l_mp, 2860 "%s: possible infinite loop (%d iterations)", 2861 __func__, flushcnt); 2862 } 2863 } while (!ioerror && cycled_icloglock); 2864 2865 if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || 2866 log->l_iclog->ic_state == XLOG_STATE_IOERROR) 2867 wake_up_all(&log->l_flush_wait); 2868 2869 spin_unlock(&log->l_icloglock); 2870 } 2871 2872 2873 /* 2874 * Finish transitioning this iclog to the dirty state. 2875 * 2876 * Make sure that we completely execute this routine only when this is 2877 * the last call to the iclog. There is a good chance that iclog flushes, 2878 * when we reach the end of the physical log, get turned into 2 separate 2879 * calls to bwrite. Hence, one iclog flush could generate two calls to this 2880 * routine. By using the reference count bwritecnt, we guarantee that only 2881 * the second completion goes through. 2882 * 2883 * Callbacks could take time, so they are done outside the scope of the 2884 * global state machine log lock. 2885 */ 2886 STATIC void 2887 xlog_state_done_syncing( 2888 struct xlog_in_core *iclog) 2889 { 2890 struct xlog *log = iclog->ic_log; 2891 2892 spin_lock(&log->l_icloglock); 2893 ASSERT(atomic_read(&iclog->ic_refcnt) == 0); 2894 2895 /* 2896 * If we got an error, either on the first buffer, or in the case of 2897 * split log writes, on the second, we shut down the file system and 2898 * no iclogs should ever be attempted to be written to disk again. 2899 */ 2900 if (!XLOG_FORCED_SHUTDOWN(log)) { 2901 ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); 2902 iclog->ic_state = XLOG_STATE_DONE_SYNC; 2903 } 2904 2905 /* 2906 * Someone could be sleeping prior to writing out the next 2907 * iclog buffer, we wake them all, one will get to do the 2908 * I/O, the others get to wait for the result. 
2909 */ 2910 wake_up_all(&iclog->ic_write_wait); 2911 spin_unlock(&log->l_icloglock); 2912 xlog_state_do_callback(log); /* also cleans log */ 2913 } 2914 2915 /* 2916 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must 2917 * sleep. We wait on the flush queue on the head iclog as that should be 2918 * the first iclog to complete flushing. Hence if all iclogs are syncing, 2919 * we will wait here and all new writes will sleep until a sync completes. 2920 * 2921 * The in-core logs are used in a circular fashion. They are not used 2922 * out-of-order even when an iclog past the head is free. 2923 * 2924 * return: 2925 * * log_offset where xlog_write() can start writing into the in-core 2926 * log's data space. 2927 * * in-core log pointer to which xlog_write() should write. 2928 * * boolean indicating this is a continued write to an in-core log. 2929 * If this is the last write, then the in-core log's offset field 2930 * needs to be incremented, depending on the amount of data which 2931 * is copied. 2932 */ 2933 STATIC int 2934 xlog_state_get_iclog_space( 2935 struct xlog *log, 2936 int len, 2937 struct xlog_in_core **iclogp, 2938 struct xlog_ticket *ticket, 2939 int *continued_write, 2940 int *logoffsetp) 2941 { 2942 int log_offset; 2943 xlog_rec_header_t *head; 2944 xlog_in_core_t *iclog; 2945 2946 restart: 2947 spin_lock(&log->l_icloglock); 2948 if (XLOG_FORCED_SHUTDOWN(log)) { 2949 spin_unlock(&log->l_icloglock); 2950 return -EIO; 2951 } 2952 2953 iclog = log->l_iclog; 2954 if (iclog->ic_state != XLOG_STATE_ACTIVE) { 2955 XFS_STATS_INC(log->l_mp, xs_log_noiclogs); 2956 2957 /* Wait for log writes to have flushed */ 2958 xlog_wait(&log->l_flush_wait, &log->l_icloglock); 2959 goto restart; 2960 } 2961 2962 head = &iclog->ic_header; 2963 2964 atomic_inc(&iclog->ic_refcnt); /* prevents sync */ 2965 log_offset = iclog->ic_offset; 2966 2967 /* On the 1st write to an iclog, figure out lsn. This works 2968 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are 2969 * committing to. If the offset is set, that's how many blocks 2970 * must be written. 2971 */ 2972 if (log_offset == 0) { 2973 ticket->t_curr_res -= log->l_iclog_hsize; 2974 xlog_tic_add_region(ticket, 2975 log->l_iclog_hsize, 2976 XLOG_REG_TYPE_LRHEADER); 2977 head->h_cycle = cpu_to_be32(log->l_curr_cycle); 2978 head->h_lsn = cpu_to_be64( 2979 xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); 2980 ASSERT(log->l_curr_block >= 0); 2981 } 2982 2983 /* If there is enough room to write everything, then do it. Otherwise, 2984 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC 2985 * bit is on, so this will get flushed out. Don't update ic_offset 2986 * until you know exactly how many bytes get copied. Therefore, wait 2987 * until later to update ic_offset. 2988 * 2989 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's 2990 * can fit into remaining data section. 2991 */ 2992 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { 2993 int error = 0; 2994 2995 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 2996 2997 /* 2998 * If we are the only one writing to this iclog, sync it to 2999 * disk. We need to do an atomic compare and decrement here to 3000 * avoid racing with concurrent atomic_dec_and_lock() calls in 3001 * xlog_state_release_iclog() when there is more than one 3002 * reference to the iclog. 
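* Put differently: the atomic_add_unless() below only drops the reference when we are not the last holder; if ic_refcnt is 1 the decrement is left to xlog_state_release_iclog() so the final release happens with l_icloglock held.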
3003 */ 3004 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) 3005 error = xlog_state_release_iclog(log, iclog); 3006 spin_unlock(&log->l_icloglock); 3007 if (error) 3008 return error; 3009 goto restart; 3010 } 3011 3012 /* Do we have enough room to write the full amount in the remainder 3013 * of this iclog? Or must we continue a write on the next iclog and 3014 * mark this iclog as completely taken? In the case where we switch 3015 * iclogs (to mark it taken), this particular iclog will release/sync 3016 * to disk in xlog_write(). 3017 */ 3018 if (len <= iclog->ic_size - iclog->ic_offset) { 3019 *continued_write = 0; 3020 iclog->ic_offset += len; 3021 } else { 3022 *continued_write = 1; 3023 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 3024 } 3025 *iclogp = iclog; 3026 3027 ASSERT(iclog->ic_offset <= iclog->ic_size); 3028 spin_unlock(&log->l_icloglock); 3029 3030 *logoffsetp = log_offset; 3031 return 0; 3032 } /* xlog_state_get_iclog_space */ 3033 3034 /* The first cnt-1 times through here we don't need to 3035 * move the grant write head because the permanent 3036 * reservation has reserved cnt times the unit amount. 3037 * Release part of current permanent unit reservation and 3038 * reset current reservation to be one units worth. Also 3039 * move grant reservation head forward. 3040 */ 3041 STATIC void 3042 xlog_regrant_reserve_log_space( 3043 struct xlog *log, 3044 struct xlog_ticket *ticket) 3045 { 3046 trace_xfs_log_regrant_reserve_enter(log, ticket); 3047 3048 if (ticket->t_cnt > 0) 3049 ticket->t_cnt--; 3050 3051 xlog_grant_sub_space(log, &log->l_reserve_head.grant, 3052 ticket->t_curr_res); 3053 xlog_grant_sub_space(log, &log->l_write_head.grant, 3054 ticket->t_curr_res); 3055 ticket->t_curr_res = ticket->t_unit_res; 3056 xlog_tic_reset_res(ticket); 3057 3058 trace_xfs_log_regrant_reserve_sub(log, ticket); 3059 3060 /* just return if we still have some of the pre-reserved space */ 3061 if (ticket->t_cnt > 0) 3062 return; 3063 3064 xlog_grant_add_space(log, &log->l_reserve_head.grant, 3065 ticket->t_unit_res); 3066 3067 trace_xfs_log_regrant_reserve_exit(log, ticket); 3068 3069 ticket->t_curr_res = ticket->t_unit_res; 3070 xlog_tic_reset_res(ticket); 3071 } /* xlog_regrant_reserve_log_space */ 3072 3073 3074 /* 3075 * Give back the space left from a reservation. 3076 * 3077 * All the information we need to make a correct determination of space left 3078 * is present. For non-permanent reservations, things are quite easy. The 3079 * count should have been decremented to zero. We only need to deal with the 3080 * space remaining in the current reservation part of the ticket. If the 3081 * ticket contains a permanent reservation, there may be left over space which 3082 * needs to be released. A count of N means that N-1 refills of the current 3083 * reservation can be done before we need to ask for more space. The first 3084 * one goes to fill up the first current reservation. Once we run out of 3085 * space, the count will stay at zero and the only space remaining will be 3086 * in the current reservation field. 3087 */ 3088 STATIC void 3089 xlog_ungrant_log_space( 3090 struct xlog *log, 3091 struct xlog_ticket *ticket) 3092 { 3093 int bytes; 3094 3095 if (ticket->t_cnt > 0) 3096 ticket->t_cnt--; 3097 3098 trace_xfs_log_ungrant_enter(log, ticket); 3099 trace_xfs_log_ungrant_sub(log, ticket); 3100 3101 /* 3102 * If this is a permanent reservation ticket, we may be able to free 3103 * up more space based on the remaining count. 
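* For example (illustrative numbers only): a permanent ticket that entered this function with t_cnt == 3 had one count consumed above, so it hands back t_curr_res plus 2 * t_unit_res bytes to both grant heads.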
3104 */ 3105 bytes = ticket->t_curr_res; 3106 if (ticket->t_cnt > 0) { 3107 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 3108 bytes += ticket->t_unit_res*ticket->t_cnt; 3109 } 3110 3111 xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); 3112 xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); 3113 3114 trace_xfs_log_ungrant_exit(log, ticket); 3115 3116 xfs_log_space_wake(log->l_mp); 3117 } 3118 3119 /* 3120 * Mark the current iclog in the ring as WANT_SYNC and move the current iclog 3121 * pointer to the next iclog in the ring. 3122 * 3123 * When called from xlog_state_get_iclog_space(), the exact size of the iclog 3124 * has not yet been determined; all we know is that we have run out of space in 3125 * the current iclog. 3126 */ 3127 STATIC void 3128 xlog_state_switch_iclogs( 3129 struct xlog *log, 3130 struct xlog_in_core *iclog, 3131 int eventual_size) 3132 { 3133 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); 3134 assert_spin_locked(&log->l_icloglock); 3135 3136 if (!eventual_size) 3137 eventual_size = iclog->ic_offset; 3138 iclog->ic_state = XLOG_STATE_WANT_SYNC; 3139 iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); 3140 log->l_prev_block = log->l_curr_block; 3141 log->l_prev_cycle = log->l_curr_cycle; 3142 3143 /* roll log?: ic_offset changed later */ 3144 log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); 3145 3146 /* Round up to next log-sunit */ 3147 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3148 log->l_mp->m_sb.sb_logsunit > 1) { 3149 uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); 3150 log->l_curr_block = roundup(log->l_curr_block, sunit_bb); 3151 } 3152 3153 if (log->l_curr_block >= log->l_logBBsize) { 3154 /* 3155 * Rewind the current block before the cycle is bumped to make 3156 * sure that the combined LSN never transiently moves forward 3157 * when the log wraps to the next cycle. This is to support the 3158 * unlocked sample of these fields from xlog_valid_lsn(). Most 3159 * other cases should acquire l_icloglock. 3160 */ 3161 log->l_curr_block -= log->l_logBBsize; 3162 ASSERT(log->l_curr_block >= 0); 3163 smp_wmb(); 3164 log->l_curr_cycle++; 3165 if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) 3166 log->l_curr_cycle++; 3167 } 3168 ASSERT(iclog == log->l_iclog); 3169 log->l_iclog = iclog->ic_next; 3170 } /* xlog_state_switch_iclogs */ 3171 3172 /* 3173 * Write out all data in the in-core log as of this exact moment in time. 3174 * 3175 * Data may be written to the in-core log during this call. However, 3176 * we don't guarantee this data will be written out. A change from past 3177 * implementation means this routine will *not* write out zero length LRs. 3178 * 3179 * Basically, we try and perform an intelligent scan of the in-core logs. 3180 * If we determine there is no flushable data, we just return. There is no 3181 * flushable data if: 3182 * 3183 * 1. the current iclog is active and has no data; the previous iclog 3184 * is in the active or dirty state. 3185 * 2. the current iclog is dirty, and the previous iclog is in the 3186 * active or dirty state. 3187 * 3188 * We may sleep if: 3189 * 3190 * 1. the current iclog is not in the active nor dirty state. 3191 * 2. the current iclog is dirty, and the previous iclog is not in the 3192 * active nor dirty state. 3193 * 3. the current iclog is active, and there is another thread writing 3194 * to this particular iclog. 3195 * 4.
a) the current iclog is active and has no other writers 3196 * b) when we return from flushing out this iclog, it is still 3197 * not in the active nor dirty state. 3198 */ 3199 int 3200 xfs_log_force( 3201 struct xfs_mount *mp, 3202 uint flags) 3203 { 3204 struct xlog *log = mp->m_log; 3205 struct xlog_in_core *iclog; 3206 xfs_lsn_t lsn; 3207 3208 XFS_STATS_INC(mp, xs_log_force); 3209 trace_xfs_log_force(mp, 0, _RET_IP_); 3210 3211 xlog_cil_force(log); 3212 3213 spin_lock(&log->l_icloglock); 3214 iclog = log->l_iclog; 3215 if (iclog->ic_state == XLOG_STATE_IOERROR) 3216 goto out_error; 3217 3218 if (iclog->ic_state == XLOG_STATE_DIRTY || 3219 (iclog->ic_state == XLOG_STATE_ACTIVE && 3220 atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { 3221 /* 3222 * If the head is dirty or (active and empty), then we need to 3223 * look at the previous iclog. 3224 * 3225 * If the previous iclog is active or dirty we are done. There 3226 * is nothing to sync out. Otherwise, we attach ourselves to the 3227 * previous iclog and go to sleep. 3228 */ 3229 iclog = iclog->ic_prev; 3230 } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3231 if (atomic_read(&iclog->ic_refcnt) == 0) { 3232 /* 3233 * We are the only one with access to this iclog. 3234 * 3235 * Flush it out now. There should be a roundoff of zero 3236 * to show that someone has already taken care of the 3237 * roundoff from the previous sync. 3238 */ 3239 atomic_inc(&iclog->ic_refcnt); 3240 lsn = be64_to_cpu(iclog->ic_header.h_lsn); 3241 xlog_state_switch_iclogs(log, iclog, 0); 3242 if (xlog_state_release_iclog(log, iclog)) 3243 goto out_error; 3244 3245 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) 3246 goto out_unlock; 3247 } else { 3248 /* 3249 * Someone else is writing to this iclog. 3250 * 3251 * Use its call to flush out the data. However, the 3252 * other thread may not force out this LR, so we mark 3253 * it WANT_SYNC. 3254 */ 3255 xlog_state_switch_iclogs(log, iclog, 0); 3256 } 3257 } else { 3258 /* 3259 * If the head iclog is not active nor dirty, we just attach 3260 * ourselves to the head and go to sleep if necessary. 3261 */ 3262 ; 3263 } 3264 3265 if (flags & XFS_LOG_SYNC) 3266 return xlog_wait_on_iclog(iclog); 3267 out_unlock: 3268 spin_unlock(&log->l_icloglock); 3269 return 0; 3270 out_error: 3271 spin_unlock(&log->l_icloglock); 3272 return -EIO; 3273 } 3274 3275 static int 3276 __xfs_log_force_lsn( 3277 struct xfs_mount *mp, 3278 xfs_lsn_t lsn, 3279 uint flags, 3280 int *log_flushed, 3281 bool already_slept) 3282 { 3283 struct xlog *log = mp->m_log; 3284 struct xlog_in_core *iclog; 3285 3286 spin_lock(&log->l_icloglock); 3287 iclog = log->l_iclog; 3288 if (iclog->ic_state == XLOG_STATE_IOERROR) 3289 goto out_error; 3290 3291 while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { 3292 iclog = iclog->ic_next; 3293 if (iclog == log->l_iclog) 3294 goto out_unlock; 3295 } 3296 3297 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3298 /* 3299 * We sleep here if we haven't already slept (e.g. this is the 3300 * first time we've looked at the correct iclog buf) and the 3301 * buffer before us is going to be sync'ed. The reason for this 3302 * is that if we are doing sync transactions here, by waiting 3303 * for the previous I/O to complete, we can allow a few more 3304 * transactions into this iclog before we close it down. 3305 * 3306 * Otherwise, we mark the buffer WANT_SYNC, and bump up the 3307 * refcnt so we can release the log (which drops the ref count). 
3308 * The state switch keeps new transaction commits from using 3309 * this buffer. When the current commits finish writing into 3310 * the buffer, the refcount will drop to zero and the buffer 3311 * will go out then. 3312 */ 3313 if (!already_slept && 3314 (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || 3315 iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { 3316 XFS_STATS_INC(mp, xs_log_force_sleep); 3317 3318 xlog_wait(&iclog->ic_prev->ic_write_wait, 3319 &log->l_icloglock); 3320 return -EAGAIN; 3321 } 3322 atomic_inc(&iclog->ic_refcnt); 3323 xlog_state_switch_iclogs(log, iclog, 0); 3324 if (xlog_state_release_iclog(log, iclog)) 3325 goto out_error; 3326 if (log_flushed) 3327 *log_flushed = 1; 3328 } 3329 3330 if (flags & XFS_LOG_SYNC) 3331 return xlog_wait_on_iclog(iclog); 3332 out_unlock: 3333 spin_unlock(&log->l_icloglock); 3334 return 0; 3335 out_error: 3336 spin_unlock(&log->l_icloglock); 3337 return -EIO; 3338 } 3339 3340 /* 3341 * Force the in-core log to disk for a specific LSN. 3342 * 3343 * Find in-core log with lsn. 3344 * If it is in the DIRTY state, just return. 3345 * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC 3346 * state and go to sleep or return. 3347 * If it is in any other state, go to sleep or return. 3348 * 3349 * Synchronous forces are implemented with a wait queue. All callers trying 3350 * to force a given lsn to disk must wait on the queue attached to the 3351 * specific in-core log. When given in-core log finally completes its write 3352 * to disk, that thread will wake up all threads waiting on the queue. 3353 */ 3354 int 3355 xfs_log_force_lsn( 3356 struct xfs_mount *mp, 3357 xfs_lsn_t lsn, 3358 uint flags, 3359 int *log_flushed) 3360 { 3361 int ret; 3362 ASSERT(lsn != 0); 3363 3364 XFS_STATS_INC(mp, xs_log_force); 3365 trace_xfs_log_force(mp, lsn, _RET_IP_); 3366 3367 lsn = xlog_cil_force_lsn(mp->m_log, lsn); 3368 if (lsn == NULLCOMMITLSN) 3369 return 0; 3370 3371 ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); 3372 if (ret == -EAGAIN) 3373 ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); 3374 return ret; 3375 } 3376 3377 /***************************************************************************** 3378 * 3379 * TICKET functions 3380 * 3381 ***************************************************************************** 3382 */ 3383 3384 /* 3385 * Free a used ticket when its refcount falls to zero. 3386 */ 3387 void 3388 xfs_log_ticket_put( 3389 xlog_ticket_t *ticket) 3390 { 3391 ASSERT(atomic_read(&ticket->t_ref) > 0); 3392 if (atomic_dec_and_test(&ticket->t_ref)) 3393 kmem_cache_free(xfs_log_ticket_zone, ticket); 3394 } 3395 3396 xlog_ticket_t * 3397 xfs_log_ticket_get( 3398 xlog_ticket_t *ticket) 3399 { 3400 ASSERT(atomic_read(&ticket->t_ref) > 0); 3401 atomic_inc(&ticket->t_ref); 3402 return ticket; 3403 } 3404 3405 /* 3406 * Figure out the total log space unit (in bytes) that would be 3407 * required for a log ticket. 3408 */ 3409 int 3410 xfs_log_calc_unit_res( 3411 struct xfs_mount *mp, 3412 int unit_bytes) 3413 { 3414 struct xlog *log = mp->m_log; 3415 int iclog_space; 3416 uint num_headers; 3417 3418 /* 3419 * Permanent reservations have up to 'cnt'-1 active log operations 3420 * in the log. A unit in this case is the amount of space for one 3421 * of these log operations. Normal reservations have a cnt of 1 3422 * and their unit amount is the total amount of space required. 
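* As a rough illustration of the accounting below (hypothetical numbers): with 32k iclogs and a 512 byte iclog header, iclog_space is 32256 bytes, so a 100k transaction yields num_headers = 4; the reservation then grows by four op headers and four LR headers for the split records, one more LR header for the commit record, and two roundoffs (stripe unit or basic block sized).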
3423 * 3424 * The following lines of code account for non-transaction data 3425 * which occupy space in the on-disk log. 3426 * 3427 * Normal form of a transaction is: 3428 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph> 3429 * and then there are LR hdrs, split-recs and roundoff at end of syncs. 3430 * 3431 * We need to account for all the leadup data and trailer data 3432 * around the transaction data. 3433 * And then we need to account for the worst case in terms of using 3434 * more space. 3435 * The worst case will happen if: 3436 * - the placement of the transaction happens to be such that the 3437 * roundoff is at its maximum 3438 * - the transaction data is synced before the commit record is synced 3439 * i.e. <transaction-data><roundoff> | <commit-rec><roundoff> 3440 * Therefore the commit record is in its own Log Record. 3441 * This can happen as the commit record is called with its 3442 * own region to xlog_write(). 3443 * This then means that in the worst case, roundoff can happen for 3444 * the commit-rec as well. 3445 * The commit-rec is smaller than padding in this scenario and so it is 3446 * not added separately. 3447 */ 3448 3449 /* for trans header */ 3450 unit_bytes += sizeof(xlog_op_header_t); 3451 unit_bytes += sizeof(xfs_trans_header_t); 3452 3453 /* for start-rec */ 3454 unit_bytes += sizeof(xlog_op_header_t); 3455 3456 /* 3457 * for LR headers - the space for data in an iclog is the size minus 3458 * the space used for the headers. If we use the iclog size, then we 3459 * undercalculate the number of headers required. 3460 * 3461 * Furthermore - the addition of op headers for split-recs might 3462 * increase the space required enough to require more log and op 3463 * headers, so take that into account too. 3464 * 3465 * IMPORTANT: This reservation makes the assumption that if this 3466 * transaction is the first in an iclog and hence has the LR headers 3467 * accounted to it, then the remaining space in the iclog is 3468 * exclusively for this transaction. i.e. if the transaction is larger 3469 * than the iclog, it will be the only thing in that iclog. 3470 * Fundamentally, this means we must pass the entire log vector to 3471 * xlog_write to guarantee this. 3472 */ 3473 iclog_space = log->l_iclog_size - log->l_iclog_hsize; 3474 num_headers = howmany(unit_bytes, iclog_space); 3475 3476 /* for split-recs - ophdrs added when data split over LRs */ 3477 unit_bytes += sizeof(xlog_op_header_t) * num_headers; 3478 3479 /* add extra header reservations if we overrun */ 3480 while (!num_headers || 3481 howmany(unit_bytes, iclog_space) > num_headers) { 3482 unit_bytes += sizeof(xlog_op_header_t); 3483 num_headers++; 3484 } 3485 unit_bytes += log->l_iclog_hsize * num_headers; 3486 3487 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3488 unit_bytes += log->l_iclog_hsize; 3489 3490 /* for roundoff padding for transaction data and one for commit record */ 3491 if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { 3492 /* log su roundoff */ 3493 unit_bytes += 2 * mp->m_sb.sb_logsunit; 3494 } else { 3495 /* BB roundoff */ 3496 unit_bytes += 2 * BBSIZE; 3497 } 3498 3499 return unit_bytes; 3500 } 3501 3502 /* 3503 * Allocate and initialise a new log ticket. 
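* Both t_unit_res and t_curr_res start out as the unit reservation computed by xfs_log_calc_unit_res() for the caller's unit_bytes.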
3504 */ 3505 struct xlog_ticket * 3506 xlog_ticket_alloc( 3507 struct xlog *log, 3508 int unit_bytes, 3509 int cnt, 3510 char client, 3511 bool permanent, 3512 xfs_km_flags_t alloc_flags) 3513 { 3514 struct xlog_ticket *tic; 3515 int unit_res; 3516 3517 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); 3518 if (!tic) 3519 return NULL; 3520 3521 unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); 3522 3523 atomic_set(&tic->t_ref, 1); 3524 tic->t_task = current; 3525 INIT_LIST_HEAD(&tic->t_queue); 3526 tic->t_unit_res = unit_res; 3527 tic->t_curr_res = unit_res; 3528 tic->t_cnt = cnt; 3529 tic->t_ocnt = cnt; 3530 tic->t_tid = prandom_u32(); 3531 tic->t_clientid = client; 3532 tic->t_flags = XLOG_TIC_INITED; 3533 if (permanent) 3534 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3535 3536 xlog_tic_reset_res(tic); 3537 3538 return tic; 3539 } 3540 3541 3542 /****************************************************************************** 3543 * 3544 * Log debug routines 3545 * 3546 ****************************************************************************** 3547 */ 3548 #if defined(DEBUG) 3549 /* 3550 * Make sure that the destination ptr is within the valid data region of 3551 * one of the iclogs. This uses backup pointers stored in a different 3552 * part of the log in case we trash the log structure. 3553 */ 3554 STATIC void 3555 xlog_verify_dest_ptr( 3556 struct xlog *log, 3557 void *ptr) 3558 { 3559 int i; 3560 int good_ptr = 0; 3561 3562 for (i = 0; i < log->l_iclog_bufs; i++) { 3563 if (ptr >= log->l_iclog_bak[i] && 3564 ptr <= log->l_iclog_bak[i] + log->l_iclog_size) 3565 good_ptr++; 3566 } 3567 3568 if (!good_ptr) 3569 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); 3570 } 3571 3572 /* 3573 * Check to make sure the grant write head didn't just overlap the tail. If 3574 * the cycles are the same, we can't be overlapping. Otherwise, make sure that 3575 * the cycles differ by exactly one and check the byte count. 3576 * 3577 * This check is run unlocked, so can give false positives. Rather than assert 3578 * on failures, use a warn-once flag and a panic tag to allow the admin to 3579 * determine if they want to panic the machine when such an error occurs. For 3580 * debug kernels this will have the same effect as using an assert but, unlike 3581 * an assert, it can be turned off at runtime.
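* Concretely, the checks below only fire when the cycles differ: they must then differ by exactly one, and the write head's byte offset must not exceed BBTOB(tail_blocks), i.e. the head must still be physically behind the tail after wrapping.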
3582 */ 3583 STATIC void 3584 xlog_verify_grant_tail( 3585 struct xlog *log) 3586 { 3587 int tail_cycle, tail_blocks; 3588 int cycle, space; 3589 3590 xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); 3591 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); 3592 if (tail_cycle != cycle) { 3593 if (cycle - 1 != tail_cycle && 3594 !(log->l_flags & XLOG_TAIL_WARN)) { 3595 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, 3596 "%s: cycle - 1 != tail_cycle", __func__); 3597 log->l_flags |= XLOG_TAIL_WARN; 3598 } 3599 3600 if (space > BBTOB(tail_blocks) && 3601 !(log->l_flags & XLOG_TAIL_WARN)) { 3602 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, 3603 "%s: space > BBTOB(tail_blocks)", __func__); 3604 log->l_flags |= XLOG_TAIL_WARN; 3605 } 3606 } 3607 } 3608 3609 /* check if it will fit */ 3610 STATIC void 3611 xlog_verify_tail_lsn( 3612 struct xlog *log, 3613 struct xlog_in_core *iclog, 3614 xfs_lsn_t tail_lsn) 3615 { 3616 int blocks; 3617 3618 if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { 3619 blocks = 3620 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3621 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3622 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); 3623 } else { 3624 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3625 3626 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3627 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); 3628 3629 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3630 if (blocks < BTOBB(iclog->ic_offset) + 1) 3631 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); 3632 } 3633 } /* xlog_verify_tail_lsn */ 3634 3635 /* 3636 * Perform a number of checks on the iclog before writing to disk. 3637 * 3638 * 1. Make sure the iclogs are still circular 3639 * 2. Make sure we have a good magic number 3640 * 3. Make sure we don't have magic numbers in the data 3641 * 4. Check fields of each log operation header for: 3642 * A. Valid client identifier 3643 * B. tid ptr value falls in valid ptr space (user space code) 3644 * C. Length in log record header is correct according to the 3645 * individual operation headers within record. 3646 * 5. When a bwrite will occur within 5 blocks of the front of the physical 3647 * log, check the preceding blocks of the physical log to make sure all 3648 * the cycle numbers agree with the current cycle number. 
3649 */ 3650 STATIC void 3651 xlog_verify_iclog( 3652 struct xlog *log, 3653 struct xlog_in_core *iclog, 3654 int count) 3655 { 3656 xlog_op_header_t *ophead; 3657 xlog_in_core_t *icptr; 3658 xlog_in_core_2_t *xhdr; 3659 void *base_ptr, *ptr, *p; 3660 ptrdiff_t field_offset; 3661 uint8_t clientid; 3662 int len, i, j, k, op_len; 3663 int idx; 3664 3665 /* check validity of iclog pointers */ 3666 spin_lock(&log->l_icloglock); 3667 icptr = log->l_iclog; 3668 for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next) 3669 ASSERT(icptr); 3670 3671 if (icptr != log->l_iclog) 3672 xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); 3673 spin_unlock(&log->l_icloglock); 3674 3675 /* check log magic numbers */ 3676 if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) 3677 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); 3678 3679 base_ptr = ptr = &iclog->ic_header; 3680 p = &iclog->ic_header; 3681 for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { 3682 if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) 3683 xfs_emerg(log->l_mp, "%s: unexpected magic num", 3684 __func__); 3685 } 3686 3687 /* check fields */ 3688 len = be32_to_cpu(iclog->ic_header.h_num_logops); 3689 base_ptr = ptr = iclog->ic_datap; 3690 ophead = ptr; 3691 xhdr = iclog->ic_data; 3692 for (i = 0; i < len; i++) { 3693 ophead = ptr; 3694 3695 /* clientid is only 1 byte */ 3696 p = &ophead->oh_clientid; 3697 field_offset = p - base_ptr; 3698 if (field_offset & 0x1ff) { 3699 clientid = ophead->oh_clientid; 3700 } else { 3701 idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); 3702 if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { 3703 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3704 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3705 clientid = xlog_get_client_id( 3706 xhdr[j].hic_xheader.xh_cycle_data[k]); 3707 } else { 3708 clientid = xlog_get_client_id( 3709 iclog->ic_header.h_cycle_data[idx]); 3710 } 3711 } 3712 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) 3713 xfs_warn(log->l_mp, 3714 "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx", 3715 __func__, clientid, ophead, 3716 (unsigned long)field_offset); 3717 3718 /* check length */ 3719 p = &ophead->oh_len; 3720 field_offset = p - base_ptr; 3721 if (field_offset & 0x1ff) { 3722 op_len = be32_to_cpu(ophead->oh_len); 3723 } else { 3724 idx = BTOBBT((uintptr_t)&ophead->oh_len - 3725 (uintptr_t)iclog->ic_datap); 3726 if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { 3727 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3728 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3729 op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); 3730 } else { 3731 op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); 3732 } 3733 } 3734 ptr += sizeof(xlog_op_header_t) + op_len; 3735 } 3736 } /* xlog_verify_iclog */ 3737 #endif 3738 3739 /* 3740 * Mark all iclogs IOERROR. l_icloglock is held by the caller. 3741 */ 3742 STATIC int 3743 xlog_state_ioerror( 3744 struct xlog *log) 3745 { 3746 xlog_in_core_t *iclog, *ic; 3747 3748 iclog = log->l_iclog; 3749 if (iclog->ic_state != XLOG_STATE_IOERROR) { 3750 /* 3751 * Mark all the incore logs IOERROR. 3752 * From now on, no log flushes will result. 3753 */ 3754 ic = iclog; 3755 do { 3756 ic->ic_state = XLOG_STATE_IOERROR; 3757 ic = ic->ic_next; 3758 } while (ic != iclog); 3759 return 0; 3760 } 3761 /* 3762 * Return non-zero, if state transition has already happened. 

/*
 * Mark all iclogs IOERROR. l_icloglock is held by the caller.
 */
STATIC int
xlog_state_ioerror(
        struct xlog     *log)
{
        xlog_in_core_t  *iclog, *ic;

        iclog = log->l_iclog;
        if (iclog->ic_state != XLOG_STATE_IOERROR) {
                /*
                 * Mark all the incore logs IOERROR.
                 * From now on, no log flushes will result.
                 */
                ic = iclog;
                do {
                        ic->ic_state = XLOG_STATE_IOERROR;
                        ic = ic->ic_next;
                } while (ic != iclog);
                return 0;
        }
        /*
         * Return non-zero, if state transition has already happened.
         */
        return 1;
}

/*
 * This is called from xfs_force_shutdown, when we're forcibly
 * shutting down the filesystem, typically because of an IO error.
 * Our main objectives here are to make sure that:
 *	a. if !logerror, flush the logs to disk. Anything modified
 *	   after this is ignored.
 *	b. the filesystem gets marked 'SHUTDOWN' for all interested
 *	   parties to find out, 'atomically'.
 *	c. those who're sleeping on log reservations, pinned objects and
 *	   other resources get woken up and are told the bad news.
 *	d. nothing new gets queued up after (b) and (c) are done.
 *
 * Note: for the !logerror case we need to flush the regions held in memory out
 * to disk first. This needs to be done before the log is marked as shutdown,
 * otherwise the iclog writes will fail.
 */
int
xfs_log_force_umount(
        struct xfs_mount        *mp,
        int                     logerror)
{
        struct xlog     *log;
        int             retval;

        log = mp->m_log;

        /*
         * If this happens during log recovery, don't worry about
         * locking; the log isn't open for business yet.
         */
        if (!log ||
            log->l_flags & XLOG_ACTIVE_RECOVERY) {
                mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
                if (mp->m_sb_bp)
                        mp->m_sb_bp->b_flags |= XBF_DONE;
                return 0;
        }

        /*
         * Somebody could've already done the hard work for us.
         * No need to get locks for this.
         */
        if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) {
                ASSERT(XLOG_FORCED_SHUTDOWN(log));
                return 1;
        }

        /*
         * Flush all the completed transactions to disk before marking the log
         * as shut down. We need to do it in this order to ensure that
         * completed operations are safely on disk before we shut down, and so
         * that we don't have to issue any buffer IO once the shutdown flags
         * are set.
         */
        if (!logerror)
                xfs_log_force(mp, XFS_LOG_SYNC);

        /*
         * Mark the filesystem as being in a shutdown state and wake
         * everybody up to tell them the bad news.
         */
        spin_lock(&log->l_icloglock);
        mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
        if (mp->m_sb_bp)
                mp->m_sb_bp->b_flags |= XBF_DONE;

        /*
         * Mark the log and the iclogs with IO error flags to prevent any
         * further log IO from being issued or completed.
         */
        log->l_flags |= XLOG_IO_ERROR;
        retval = xlog_state_ioerror(log);
        spin_unlock(&log->l_icloglock);

        /*
         * We don't want anybody waiting for log reservations after this. That
         * means we have to wake up everybody queued up on reserveq as well as
         * writeq. In addition, we make sure in xlog_{re}grant_log_space that
         * we don't enqueue anything once the SHUTDOWN flag is set, and this
         * action is protected by the grant locks.
         */
        xlog_grant_head_wake_all(&log->l_reserve_head);
        xlog_grant_head_wake_all(&log->l_write_head);

        /*
         * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
         * as if the log writes were completed. The abort handling in the log
         * item committed callback functions will do this again under lock to
         * avoid races.
         */
        spin_lock(&log->l_cilp->xc_push_lock);
        wake_up_all(&log->l_cilp->xc_commit_wait);
        spin_unlock(&log->l_cilp->xc_push_lock);
        xlog_state_do_callback(log);

        /* return non-zero if log IOERROR transition had already happened */
        return retval;
}
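
/*
 * Illustration only: once xfs_log_force_umount() has run, the rest of the
 * code notices the shutdown through checks along these lines; both flags
 * are set above before any waiters are woken:
 *
 *      if (XFS_FORCED_SHUTDOWN(mp))    (tests XFS_MOUNT_FS_SHUTDOWN)
 *              return -EIO;
 *      if (XLOG_FORCED_SHUTDOWN(log))  (tests XLOG_IO_ERROR)
 *              return -EIO;
 *
 * Returning -EIO is only the conventional caller response; the macros
 * themselves do nothing more than test the flags.
 */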

STATIC int
xlog_iclogs_empty(
        struct xlog     *log)
{
        xlog_in_core_t  *iclog;

        iclog = log->l_iclog;
        do {
                /*
                 * Endianness does not matter here, zero is zero in
                 * any language.
                 */
                if (iclog->ic_header.h_num_logops)
                        return 0;
                iclog = iclog->ic_next;
        } while (iclog != log->l_iclog);
        return 1;
}

/*
 * Verify that an LSN stamped into a piece of metadata is valid. This is
 * intended for use in read verifiers on filesystems with v5 superblocks.
 */
bool
xfs_log_check_lsn(
        struct xfs_mount        *mp,
        xfs_lsn_t               lsn)
{
        struct xlog             *log = mp->m_log;
        bool                    valid;

        /*
         * norecovery mode skips mount-time log processing and unconditionally
         * resets the in-core LSN. We can't validate in this mode, but
         * modifications are not allowed anyway, so just return true.
         */
        if (mp->m_flags & XFS_MOUNT_NORECOVERY)
                return true;

        /*
         * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
         * handled by recovery and thus safe to ignore here.
         */
        if (lsn == NULLCOMMITLSN)
                return true;

        valid = xlog_valid_lsn(mp->m_log, lsn);

        /* warn the user about what's gone wrong before verifier failure */
        if (!valid) {
                spin_lock(&log->l_icloglock);
                xfs_warn(mp,
"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
"Please unmount and run xfs_repair (>= v4.3) to resolve.",
                         CYCLE_LSN(lsn), BLOCK_LSN(lsn),
                         log->l_curr_cycle, log->l_curr_block);
                spin_unlock(&log->l_icloglock);
        }

        return valid;
}

bool
xfs_log_in_recovery(
        struct xfs_mount        *mp)
{
        struct xlog             *log = mp->m_log;

        return log->l_flags & XLOG_ACTIVE_RECOVERY;
}
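
/*
 * Illustration only: the sort of use xfs_log_check_lsn() is intended for.
 * A buffer read verifier for v5 metadata would, roughly, pull the LSN out
 * of the on-disk header and fail the buffer if it lies beyond the current
 * head of the log.  The names below are hypothetical, not the real
 * verifiers:
 *
 *      static void
 *      example_read_verify(struct xfs_buf *bp)
 *      {
 *              struct xfs_mount        *mp = bp->b_mount;
 *              struct example_hdr      *hdr = bp->b_addr;
 *
 *              if (xfs_sb_version_hascrc(&mp->m_sb) &&
 *                  !xfs_log_check_lsn(mp, be64_to_cpu(hdr->lsn)))
 *                      xfs_verifier_error(bp, -EFSCORRUPTED,
 *                                      __this_address);
 *      }
 *
 * An LSN ahead of the current log head means the metadata claims to have
 * been written by log activity the log has no record of, i.e. corruption.
 */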