1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_errortag.h" 14 #include "xfs_error.h" 15 #include "xfs_trans.h" 16 #include "xfs_trans_priv.h" 17 #include "xfs_log.h" 18 #include "xfs_log_priv.h" 19 #include "xfs_trace.h" 20 #include "xfs_sysfs.h" 21 #include "xfs_sb.h" 22 #include "xfs_health.h" 23 24 kmem_zone_t *xfs_log_ticket_zone; 25 26 /* Local miscellaneous function prototypes */ 27 STATIC int 28 xlog_commit_record( 29 struct xlog *log, 30 struct xlog_ticket *ticket, 31 struct xlog_in_core **iclog, 32 xfs_lsn_t *commitlsnp); 33 34 STATIC struct xlog * 35 xlog_alloc_log( 36 struct xfs_mount *mp, 37 struct xfs_buftarg *log_target, 38 xfs_daddr_t blk_offset, 39 int num_bblks); 40 STATIC int 41 xlog_space_left( 42 struct xlog *log, 43 atomic64_t *head); 44 STATIC void 45 xlog_dealloc_log( 46 struct xlog *log); 47 48 /* local state machine functions */ 49 STATIC void xlog_state_done_syncing( 50 struct xlog_in_core *iclog, 51 bool aborted); 52 STATIC int 53 xlog_state_get_iclog_space( 54 struct xlog *log, 55 int len, 56 struct xlog_in_core **iclog, 57 struct xlog_ticket *ticket, 58 int *continued_write, 59 int *logoffsetp); 60 STATIC int 61 xlog_state_release_iclog( 62 struct xlog *log, 63 struct xlog_in_core *iclog); 64 STATIC void 65 xlog_state_switch_iclogs( 66 struct xlog *log, 67 struct xlog_in_core *iclog, 68 int eventual_size); 69 STATIC void 70 xlog_state_want_sync( 71 struct xlog *log, 72 struct xlog_in_core *iclog); 73 74 STATIC void 75 xlog_grant_push_ail( 76 struct xlog *log, 77 int need_bytes); 78 STATIC void 79 xlog_regrant_reserve_log_space( 80 struct xlog *log, 81 struct xlog_ticket *ticket); 82 STATIC void 83 xlog_ungrant_log_space( 84 struct xlog *log, 85 struct xlog_ticket *ticket); 86 87 #if defined(DEBUG) 88 STATIC void 89 xlog_verify_dest_ptr( 90 struct xlog *log, 91 void *ptr); 92 STATIC void 93 xlog_verify_grant_tail( 94 struct xlog *log); 95 STATIC void 96 xlog_verify_iclog( 97 struct xlog *log, 98 struct xlog_in_core *iclog, 99 int count); 100 STATIC void 101 xlog_verify_tail_lsn( 102 struct xlog *log, 103 struct xlog_in_core *iclog, 104 xfs_lsn_t tail_lsn); 105 #else 106 #define xlog_verify_dest_ptr(a,b) 107 #define xlog_verify_grant_tail(a) 108 #define xlog_verify_iclog(a,b,c) 109 #define xlog_verify_tail_lsn(a,b,c) 110 #endif 111 112 STATIC int 113 xlog_iclogs_empty( 114 struct xlog *log); 115 116 static void 117 xlog_grant_sub_space( 118 struct xlog *log, 119 atomic64_t *head, 120 int bytes) 121 { 122 int64_t head_val = atomic64_read(head); 123 int64_t new, old; 124 125 do { 126 int cycle, space; 127 128 xlog_crack_grant_head_val(head_val, &cycle, &space); 129 130 space -= bytes; 131 if (space < 0) { 132 space += log->l_logsize; 133 cycle--; 134 } 135 136 old = head_val; 137 new = xlog_assign_grant_head_val(cycle, space); 138 head_val = atomic64_cmpxchg(head, old, new); 139 } while (head_val != old); 140 } 141 142 static void 143 xlog_grant_add_space( 144 struct xlog *log, 145 atomic64_t *head, 146 int bytes) 147 { 148 int64_t head_val = atomic64_read(head); 149 int64_t new, old; 150 151 do { 152 int tmp; 153 int cycle, space; 154 155 xlog_crack_grant_head_val(head_val, &cycle, &space); 156 157 tmp = log->l_logsize - space; 158 if (tmp > bytes) 159 space += bytes; 160 else { 161 space = 
bytes - tmp;
                        cycle++;
                }

                old = head_val;
                new = xlog_assign_grant_head_val(cycle, space);
                head_val = atomic64_cmpxchg(head, old, new);
        } while (head_val != old);
}

STATIC void
xlog_grant_head_init(
        struct xlog_grant_head  *head)
{
        xlog_assign_grant_head(&head->grant, 1, 0);
        INIT_LIST_HEAD(&head->waiters);
        spin_lock_init(&head->lock);
}

STATIC void
xlog_grant_head_wake_all(
        struct xlog_grant_head  *head)
{
        struct xlog_ticket      *tic;

        spin_lock(&head->lock);
        list_for_each_entry(tic, &head->waiters, t_queue)
                wake_up_process(tic->t_task);
        spin_unlock(&head->lock);
}

static inline int
xlog_ticket_reservation(
        struct xlog             *log,
        struct xlog_grant_head  *head,
        struct xlog_ticket      *tic)
{
        if (head == &log->l_write_head) {
                ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
                return tic->t_unit_res;
        } else {
                if (tic->t_flags & XLOG_TIC_PERM_RESERV)
                        return tic->t_unit_res * tic->t_cnt;
                else
                        return tic->t_unit_res;
        }
}

STATIC bool
xlog_grant_head_wake(
        struct xlog             *log,
        struct xlog_grant_head  *head,
        int                     *free_bytes)
{
        struct xlog_ticket      *tic;
        int                     need_bytes;
        bool                    woken_task = false;

        list_for_each_entry(tic, &head->waiters, t_queue) {

                /*
                 * There is a chance that the size of the CIL checkpoints in
                 * progress at the last AIL push target calculation resulted in
                 * limiting the target to the log head (l_last_sync_lsn) at the
                 * time. This may not reflect where the log head is now as the
                 * CIL checkpoints may have completed.
                 *
                 * Hence when we are woken here, it may be the head of the log
                 * that has moved rather than the tail. As the tail didn't
                 * move, there still won't be space available for the
                 * reservation we require. However, if the AIL has already
                 * pushed to the target defined by the old log head location,
                 * we will hang here waiting for something else to update the
                 * AIL push target.
                 *
                 * Therefore, if there isn't space to wake the first waiter on
                 * the grant head, we need to push the AIL again to ensure the
                 * target reflects both the current log tail and log head
                 * position before we wait for the tail to move again.
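                 *
                 * As a rough illustration of the policy below (sizes assumed,
                 * not taken from a real trace): if the first waiter needs 8kB
                 * but only 4kB is free, we issue a single
                 * xlog_grant_push_ail() call for those 8kB and return false
                 * without waking anybody. Once at least 8kB is free we wake
                 * that waiter, subtract its reservation from the free count
                 * and move on to the next waiter without pushing the AIL
                 * again in this pass.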
240 */ 241 242 need_bytes = xlog_ticket_reservation(log, head, tic); 243 if (*free_bytes < need_bytes) { 244 if (!woken_task) 245 xlog_grant_push_ail(log, need_bytes); 246 return false; 247 } 248 249 *free_bytes -= need_bytes; 250 trace_xfs_log_grant_wake_up(log, tic); 251 wake_up_process(tic->t_task); 252 woken_task = true; 253 } 254 255 return true; 256 } 257 258 STATIC int 259 xlog_grant_head_wait( 260 struct xlog *log, 261 struct xlog_grant_head *head, 262 struct xlog_ticket *tic, 263 int need_bytes) __releases(&head->lock) 264 __acquires(&head->lock) 265 { 266 list_add_tail(&tic->t_queue, &head->waiters); 267 268 do { 269 if (XLOG_FORCED_SHUTDOWN(log)) 270 goto shutdown; 271 xlog_grant_push_ail(log, need_bytes); 272 273 __set_current_state(TASK_UNINTERRUPTIBLE); 274 spin_unlock(&head->lock); 275 276 XFS_STATS_INC(log->l_mp, xs_sleep_logspace); 277 278 trace_xfs_log_grant_sleep(log, tic); 279 schedule(); 280 trace_xfs_log_grant_wake(log, tic); 281 282 spin_lock(&head->lock); 283 if (XLOG_FORCED_SHUTDOWN(log)) 284 goto shutdown; 285 } while (xlog_space_left(log, &head->grant) < need_bytes); 286 287 list_del_init(&tic->t_queue); 288 return 0; 289 shutdown: 290 list_del_init(&tic->t_queue); 291 return -EIO; 292 } 293 294 /* 295 * Atomically get the log space required for a log ticket. 296 * 297 * Once a ticket gets put onto head->waiters, it will only return after the 298 * needed reservation is satisfied. 299 * 300 * This function is structured so that it has a lock free fast path. This is 301 * necessary because every new transaction reservation will come through this 302 * path. Hence any lock will be globally hot if we take it unconditionally on 303 * every pass. 304 * 305 * As tickets are only ever moved on and off head->waiters under head->lock, we 306 * only need to take that lock if we are going to add the ticket to the queue 307 * and sleep. We can avoid taking the lock if the ticket was never added to 308 * head->waiters because the t_queue list head will be empty and we hold the 309 * only reference to it so it can safely be checked unlocked. 310 */ 311 STATIC int 312 xlog_grant_head_check( 313 struct xlog *log, 314 struct xlog_grant_head *head, 315 struct xlog_ticket *tic, 316 int *need_bytes) 317 { 318 int free_bytes; 319 int error = 0; 320 321 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); 322 323 /* 324 * If there are other waiters on the queue then give them a chance at 325 * logspace before us. Wake up the first waiters, if we do not wake 326 * up all the waiters then go to sleep waiting for more free space, 327 * otherwise try to get some space for this transaction. 
328 */ 329 *need_bytes = xlog_ticket_reservation(log, head, tic); 330 free_bytes = xlog_space_left(log, &head->grant); 331 if (!list_empty_careful(&head->waiters)) { 332 spin_lock(&head->lock); 333 if (!xlog_grant_head_wake(log, head, &free_bytes) || 334 free_bytes < *need_bytes) { 335 error = xlog_grant_head_wait(log, head, tic, 336 *need_bytes); 337 } 338 spin_unlock(&head->lock); 339 } else if (free_bytes < *need_bytes) { 340 spin_lock(&head->lock); 341 error = xlog_grant_head_wait(log, head, tic, *need_bytes); 342 spin_unlock(&head->lock); 343 } 344 345 return error; 346 } 347 348 static void 349 xlog_tic_reset_res(xlog_ticket_t *tic) 350 { 351 tic->t_res_num = 0; 352 tic->t_res_arr_sum = 0; 353 tic->t_res_num_ophdrs = 0; 354 } 355 356 static void 357 xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) 358 { 359 if (tic->t_res_num == XLOG_TIC_LEN_MAX) { 360 /* add to overflow and start again */ 361 tic->t_res_o_flow += tic->t_res_arr_sum; 362 tic->t_res_num = 0; 363 tic->t_res_arr_sum = 0; 364 } 365 366 tic->t_res_arr[tic->t_res_num].r_len = len; 367 tic->t_res_arr[tic->t_res_num].r_type = type; 368 tic->t_res_arr_sum += len; 369 tic->t_res_num++; 370 } 371 372 /* 373 * Replenish the byte reservation required by moving the grant write head. 374 */ 375 int 376 xfs_log_regrant( 377 struct xfs_mount *mp, 378 struct xlog_ticket *tic) 379 { 380 struct xlog *log = mp->m_log; 381 int need_bytes; 382 int error = 0; 383 384 if (XLOG_FORCED_SHUTDOWN(log)) 385 return -EIO; 386 387 XFS_STATS_INC(mp, xs_try_logspace); 388 389 /* 390 * This is a new transaction on the ticket, so we need to change the 391 * transaction ID so that the next transaction has a different TID in 392 * the log. Just add one to the existing tid so that we can see chains 393 * of rolling transactions in the log easily. 394 */ 395 tic->t_tid++; 396 397 xlog_grant_push_ail(log, tic->t_unit_res); 398 399 tic->t_curr_res = tic->t_unit_res; 400 xlog_tic_reset_res(tic); 401 402 if (tic->t_cnt > 0) 403 return 0; 404 405 trace_xfs_log_regrant(log, tic); 406 407 error = xlog_grant_head_check(log, &log->l_write_head, tic, 408 &need_bytes); 409 if (error) 410 goto out_error; 411 412 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); 413 trace_xfs_log_regrant_exit(log, tic); 414 xlog_verify_grant_tail(log); 415 return 0; 416 417 out_error: 418 /* 419 * If we are failing, make sure the ticket doesn't have any current 420 * reservations. We don't want to add this back when the ticket/ 421 * transaction gets cancelled. 422 */ 423 tic->t_curr_res = 0; 424 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 425 return error; 426 } 427 428 /* 429 * Reserve log space and return a ticket corresponding to the reservation. 430 * 431 * Each reservation is going to reserve extra space for a log record header. 432 * When writes happen to the on-disk log, we don't subtract the length of the 433 * log record header from any reservation. By wasting space in each 434 * reservation, we prevent over allocation problems. 
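 *
 * Caller-side sketch (illustrative only; most callers reserve through
 * xfs_trans_reserve() rather than calling this directly, and the values
 * below are assumed):
 *
 *	struct xlog_ticket	*tic = NULL;
 *	int			error;
 *
 *	error = xfs_log_reserve(mp, unit_bytes, 1, &tic, XFS_TRANSACTION,
 *			false);
 *	if (error)
 *		return error;
 *	... write against the ticket, then hand it back via xfs_log_done() ...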
435 */ 436 int 437 xfs_log_reserve( 438 struct xfs_mount *mp, 439 int unit_bytes, 440 int cnt, 441 struct xlog_ticket **ticp, 442 uint8_t client, 443 bool permanent) 444 { 445 struct xlog *log = mp->m_log; 446 struct xlog_ticket *tic; 447 int need_bytes; 448 int error = 0; 449 450 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); 451 452 if (XLOG_FORCED_SHUTDOWN(log)) 453 return -EIO; 454 455 XFS_STATS_INC(mp, xs_try_logspace); 456 457 ASSERT(*ticp == NULL); 458 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0); 459 *ticp = tic; 460 461 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt 462 : tic->t_unit_res); 463 464 trace_xfs_log_reserve(log, tic); 465 466 error = xlog_grant_head_check(log, &log->l_reserve_head, tic, 467 &need_bytes); 468 if (error) 469 goto out_error; 470 471 xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); 472 xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); 473 trace_xfs_log_reserve_exit(log, tic); 474 xlog_verify_grant_tail(log); 475 return 0; 476 477 out_error: 478 /* 479 * If we are failing, make sure the ticket doesn't have any current 480 * reservations. We don't want to add this back when the ticket/ 481 * transaction gets cancelled. 482 */ 483 tic->t_curr_res = 0; 484 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ 485 return error; 486 } 487 488 489 /* 490 * NOTES: 491 * 492 * 1. currblock field gets updated at startup and after in-core logs 493 * marked as with WANT_SYNC. 494 */ 495 496 /* 497 * This routine is called when a user of a log manager ticket is done with 498 * the reservation. If the ticket was ever used, then a commit record for 499 * the associated transaction is written out as a log operation header with 500 * no data. The flag XLOG_TIC_INITED is set when the first write occurs with 501 * a given ticket. If the ticket was one with a permanent reservation, then 502 * a few operations are done differently. Permanent reservation tickets by 503 * default don't release the reservation. They just commit the current 504 * transaction with the belief that the reservation is still needed. A flag 505 * must be passed in before permanent reservations are actually released. 506 * When these type of tickets are not released, they need to be set into 507 * the inited state again. By doing this, a start record will be written 508 * out when the next write occurs. 509 */ 510 xfs_lsn_t 511 xfs_log_done( 512 struct xfs_mount *mp, 513 struct xlog_ticket *ticket, 514 struct xlog_in_core **iclog, 515 bool regrant) 516 { 517 struct xlog *log = mp->m_log; 518 xfs_lsn_t lsn = 0; 519 520 if (XLOG_FORCED_SHUTDOWN(log) || 521 /* 522 * If nothing was ever written, don't write out commit record. 523 * If we get an error, just continue and give back the log ticket. 524 */ 525 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 526 (xlog_commit_record(log, ticket, iclog, &lsn)))) { 527 lsn = (xfs_lsn_t) -1; 528 regrant = false; 529 } 530 531 532 if (!regrant) { 533 trace_xfs_log_done_nonperm(log, ticket); 534 535 /* 536 * Release ticket if not permanent reservation or a specific 537 * request has been made to release a permanent reservation. 538 */ 539 xlog_ungrant_log_space(log, ticket); 540 } else { 541 trace_xfs_log_done_perm(log, ticket); 542 543 xlog_regrant_reserve_log_space(log, ticket); 544 /* If this ticket was a permanent reservation and we aren't 545 * trying to release it, reset the inited flags; so next time 546 * we write, a start record will be written out. 
547 */ 548 ticket->t_flags |= XLOG_TIC_INITED; 549 } 550 551 xfs_log_ticket_put(ticket); 552 return lsn; 553 } 554 555 int 556 xfs_log_release_iclog( 557 struct xfs_mount *mp, 558 struct xlog_in_core *iclog) 559 { 560 if (xlog_state_release_iclog(mp->m_log, iclog)) { 561 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 562 return -EIO; 563 } 564 565 return 0; 566 } 567 568 /* 569 * Mount a log filesystem 570 * 571 * mp - ubiquitous xfs mount point structure 572 * log_target - buftarg of on-disk log device 573 * blk_offset - Start block # where block size is 512 bytes (BBSIZE) 574 * num_bblocks - Number of BBSIZE blocks in on-disk log 575 * 576 * Return error or zero. 577 */ 578 int 579 xfs_log_mount( 580 xfs_mount_t *mp, 581 xfs_buftarg_t *log_target, 582 xfs_daddr_t blk_offset, 583 int num_bblks) 584 { 585 bool fatal = xfs_sb_version_hascrc(&mp->m_sb); 586 int error = 0; 587 int min_logfsbs; 588 589 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { 590 xfs_notice(mp, "Mounting V%d Filesystem", 591 XFS_SB_VERSION_NUM(&mp->m_sb)); 592 } else { 593 xfs_notice(mp, 594 "Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", 595 XFS_SB_VERSION_NUM(&mp->m_sb)); 596 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 597 } 598 599 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); 600 if (IS_ERR(mp->m_log)) { 601 error = PTR_ERR(mp->m_log); 602 goto out; 603 } 604 605 /* 606 * Validate the given log space and drop a critical message via syslog 607 * if the log size is too small that would lead to some unexpected 608 * situations in transaction log space reservation stage. 609 * 610 * Note: we can't just reject the mount if the validation fails. This 611 * would mean that people would have to downgrade their kernel just to 612 * remedy the situation as there is no way to grow the log (short of 613 * black magic surgery with xfs_db). 614 * 615 * We can, however, reject mounts for CRC format filesystems, as the 616 * mkfs binary being used to make the filesystem should never create a 617 * filesystem with a log that is too small. 618 */ 619 min_logfsbs = xfs_log_calc_minimum_size(mp); 620 621 if (mp->m_sb.sb_logblocks < min_logfsbs) { 622 xfs_warn(mp, 623 "Log size %d blocks too small, minimum size is %d blocks", 624 mp->m_sb.sb_logblocks, min_logfsbs); 625 error = -EINVAL; 626 } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { 627 xfs_warn(mp, 628 "Log size %d blocks too large, maximum size is %lld blocks", 629 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); 630 error = -EINVAL; 631 } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { 632 xfs_warn(mp, 633 "log size %lld bytes too large, maximum size is %lld bytes", 634 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), 635 XFS_MAX_LOG_BYTES); 636 error = -EINVAL; 637 } else if (mp->m_sb.sb_logsunit > 1 && 638 mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { 639 xfs_warn(mp, 640 "log stripe unit %u bytes must be a multiple of block size", 641 mp->m_sb.sb_logsunit); 642 error = -EINVAL; 643 fatal = true; 644 } 645 if (error) { 646 /* 647 * Log check errors are always fatal on v5; or whenever bad 648 * metadata leads to a crash. 649 */ 650 if (fatal) { 651 xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); 652 ASSERT(0); 653 goto out_free_log; 654 } 655 xfs_crit(mp, "Log size out of supported range."); 656 xfs_crit(mp, 657 "Continuing onwards, but if log hangs are experienced then please report this message in the bug report."); 658 } 659 660 /* 661 * Initialize the AIL now we have a log. 
662 */ 663 error = xfs_trans_ail_init(mp); 664 if (error) { 665 xfs_warn(mp, "AIL initialisation failed: error %d", error); 666 goto out_free_log; 667 } 668 mp->m_log->l_ailp = mp->m_ail; 669 670 /* 671 * skip log recovery on a norecovery mount. pretend it all 672 * just worked. 673 */ 674 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { 675 int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); 676 677 if (readonly) 678 mp->m_flags &= ~XFS_MOUNT_RDONLY; 679 680 error = xlog_recover(mp->m_log); 681 682 if (readonly) 683 mp->m_flags |= XFS_MOUNT_RDONLY; 684 if (error) { 685 xfs_warn(mp, "log mount/recovery failed: error %d", 686 error); 687 xlog_recover_cancel(mp->m_log); 688 goto out_destroy_ail; 689 } 690 } 691 692 error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj, 693 "log"); 694 if (error) 695 goto out_destroy_ail; 696 697 /* Normal transactions can now occur */ 698 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 699 700 /* 701 * Now the log has been fully initialised and we know were our 702 * space grant counters are, we can initialise the permanent ticket 703 * needed for delayed logging to work. 704 */ 705 xlog_cil_init_post_recovery(mp->m_log); 706 707 return 0; 708 709 out_destroy_ail: 710 xfs_trans_ail_destroy(mp); 711 out_free_log: 712 xlog_dealloc_log(mp->m_log); 713 out: 714 return error; 715 } 716 717 /* 718 * Finish the recovery of the file system. This is separate from the 719 * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read 720 * in the root and real-time bitmap inodes between calling xfs_log_mount() and 721 * here. 722 * 723 * If we finish recovery successfully, start the background log work. If we are 724 * not doing recovery, then we have a RO filesystem and we don't need to start 725 * it. 726 */ 727 int 728 xfs_log_mount_finish( 729 struct xfs_mount *mp) 730 { 731 int error = 0; 732 bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); 733 bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED; 734 735 if (mp->m_flags & XFS_MOUNT_NORECOVERY) { 736 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 737 return 0; 738 } else if (readonly) { 739 /* Allow unlinked processing to proceed */ 740 mp->m_flags &= ~XFS_MOUNT_RDONLY; 741 } 742 743 /* 744 * During the second phase of log recovery, we need iget and 745 * iput to behave like they do for an active filesystem. 746 * xfs_fs_drop_inode needs to be able to prevent the deletion 747 * of inodes before we're done replaying log items on those 748 * inodes. Turn it off immediately after recovery finishes 749 * so that we don't leak the quota inodes if subsequent mount 750 * activities fail. 751 * 752 * We let all inodes involved in redo item processing end up on 753 * the LRU instead of being evicted immediately so that if we do 754 * something to an unlinked inode, the irele won't cause 755 * premature truncation and freeing of the inode, which results 756 * in log recovery failure. We have to evict the unreferenced 757 * lru inodes after clearing SB_ACTIVE because we don't 758 * otherwise clean up the lru if there's a subsequent failure in 759 * xfs_mountfs, which leads to us leaking the inodes if nothing 760 * else (e.g. quotacheck) references the inodes before the 761 * mount failure occurs. 762 */ 763 mp->m_super->s_flags |= SB_ACTIVE; 764 error = xlog_recover_finish(mp->m_log); 765 if (!error) 766 xfs_log_work_queue(mp); 767 mp->m_super->s_flags &= ~SB_ACTIVE; 768 evict_inodes(mp->m_super); 769 770 /* 771 * Drain the buffer LRU after log recovery. 
This is required for v4 772 * filesystems to avoid leaving around buffers with NULL verifier ops, 773 * but we do it unconditionally to make sure we're always in a clean 774 * cache state after mount. 775 * 776 * Don't push in the error case because the AIL may have pending intents 777 * that aren't removed until recovery is cancelled. 778 */ 779 if (!error && recovered) { 780 xfs_log_force(mp, XFS_LOG_SYNC); 781 xfs_ail_push_all_sync(mp->m_ail); 782 } 783 xfs_wait_buftarg(mp->m_ddev_targp); 784 785 if (readonly) 786 mp->m_flags |= XFS_MOUNT_RDONLY; 787 788 return error; 789 } 790 791 /* 792 * The mount has failed. Cancel the recovery if it hasn't completed and destroy 793 * the log. 794 */ 795 void 796 xfs_log_mount_cancel( 797 struct xfs_mount *mp) 798 { 799 xlog_recover_cancel(mp->m_log); 800 xfs_log_unmount(mp); 801 } 802 803 /* 804 * Final log writes as part of unmount. 805 * 806 * Mark the filesystem clean as unmount happens. Note that during relocation 807 * this routine needs to be executed as part of source-bag while the 808 * deallocation must not be done until source-end. 809 */ 810 811 /* Actually write the unmount record to disk. */ 812 static void 813 xfs_log_write_unmount_record( 814 struct xfs_mount *mp) 815 { 816 /* the data section must be 32 bit size aligned */ 817 struct xfs_unmount_log_format magic = { 818 .magic = XLOG_UNMOUNT_TYPE, 819 }; 820 struct xfs_log_iovec reg = { 821 .i_addr = &magic, 822 .i_len = sizeof(magic), 823 .i_type = XLOG_REG_TYPE_UNMOUNT, 824 }; 825 struct xfs_log_vec vec = { 826 .lv_niovecs = 1, 827 .lv_iovecp = ®, 828 }; 829 struct xlog *log = mp->m_log; 830 struct xlog_in_core *iclog; 831 struct xlog_ticket *tic = NULL; 832 xfs_lsn_t lsn; 833 uint flags = XLOG_UNMOUNT_TRANS; 834 int error; 835 836 error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); 837 if (error) 838 goto out_err; 839 840 /* 841 * If we think the summary counters are bad, clear the unmount header 842 * flag in the unmount record so that the summary counters will be 843 * recalculated during log recovery at next mount. Refer to 844 * xlog_check_unmount_rec for more details. 845 */ 846 if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, 847 XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { 848 xfs_alert(mp, "%s: will fix summary counters at next mount", 849 __func__); 850 flags &= ~XLOG_UNMOUNT_TRANS; 851 } 852 853 /* remove inited flag, and account for space used */ 854 tic->t_flags = 0; 855 tic->t_curr_res -= sizeof(magic); 856 error = xlog_write(log, &vec, tic, &lsn, NULL, flags); 857 /* 858 * At this point, we're umounting anyway, so there's no point in 859 * transitioning log state to IOERROR. Just continue... 
860 */ 861 out_err: 862 if (error) 863 xfs_alert(mp, "%s: unmount record failed", __func__); 864 865 spin_lock(&log->l_icloglock); 866 iclog = log->l_iclog; 867 atomic_inc(&iclog->ic_refcnt); 868 xlog_state_want_sync(log, iclog); 869 spin_unlock(&log->l_icloglock); 870 error = xlog_state_release_iclog(log, iclog); 871 872 spin_lock(&log->l_icloglock); 873 switch (iclog->ic_state) { 874 default: 875 if (!XLOG_FORCED_SHUTDOWN(log)) { 876 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 877 break; 878 } 879 /* fall through */ 880 case XLOG_STATE_ACTIVE: 881 case XLOG_STATE_DIRTY: 882 spin_unlock(&log->l_icloglock); 883 break; 884 } 885 886 if (tic) { 887 trace_xfs_log_umount_write(log, tic); 888 xlog_ungrant_log_space(log, tic); 889 xfs_log_ticket_put(tic); 890 } 891 } 892 893 /* 894 * Unmount record used to have a string "Unmount filesystem--" in the 895 * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). 896 * We just write the magic number now since that particular field isn't 897 * currently architecture converted and "Unmount" is a bit foo. 898 * As far as I know, there weren't any dependencies on the old behaviour. 899 */ 900 901 static int 902 xfs_log_unmount_write(xfs_mount_t *mp) 903 { 904 struct xlog *log = mp->m_log; 905 xlog_in_core_t *iclog; 906 #ifdef DEBUG 907 xlog_in_core_t *first_iclog; 908 #endif 909 int error; 910 911 /* 912 * Don't write out unmount record on norecovery mounts or ro devices. 913 * Or, if we are doing a forced umount (typically because of IO errors). 914 */ 915 if (mp->m_flags & XFS_MOUNT_NORECOVERY || 916 xfs_readonly_buftarg(log->l_targ)) { 917 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 918 return 0; 919 } 920 921 error = xfs_log_force(mp, XFS_LOG_SYNC); 922 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); 923 924 #ifdef DEBUG 925 first_iclog = iclog = log->l_iclog; 926 do { 927 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { 928 ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); 929 ASSERT(iclog->ic_offset == 0); 930 } 931 iclog = iclog->ic_next; 932 } while (iclog != first_iclog); 933 #endif 934 if (! (XLOG_FORCED_SHUTDOWN(log))) { 935 xfs_log_write_unmount_record(mp); 936 } else { 937 /* 938 * We're already in forced_shutdown mode, couldn't 939 * even attempt to write out the unmount transaction. 940 * 941 * Go through the motions of sync'ing and releasing 942 * the iclog, even though no I/O will actually happen, 943 * we need to wait for other log I/Os that may already 944 * be in progress. Do this as a separate section of 945 * code so we'll know if we ever get stuck here that 946 * we're in this odd situation of trying to unmount 947 * a file system that went into forced_shutdown as 948 * the result of an unmount.. 949 */ 950 spin_lock(&log->l_icloglock); 951 iclog = log->l_iclog; 952 atomic_inc(&iclog->ic_refcnt); 953 954 xlog_state_want_sync(log, iclog); 955 spin_unlock(&log->l_icloglock); 956 error = xlog_state_release_iclog(log, iclog); 957 958 spin_lock(&log->l_icloglock); 959 960 if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE 961 || iclog->ic_state == XLOG_STATE_DIRTY 962 || iclog->ic_state == XLOG_STATE_IOERROR) ) { 963 964 xlog_wait(&iclog->ic_force_wait, 965 &log->l_icloglock); 966 } else { 967 spin_unlock(&log->l_icloglock); 968 } 969 } 970 971 return error; 972 } /* xfs_log_unmount_write */ 973 974 /* 975 * Empty the log for unmount/freeze. 976 * 977 * To do this, we first need to shut down the background log work so it is not 978 * trying to cover the log as we clean up. 
We then need to unpin all objects in 979 * the log so we can then flush them out. Once they have completed their IO and 980 * run the callbacks removing themselves from the AIL, we can write the unmount 981 * record. 982 */ 983 void 984 xfs_log_quiesce( 985 struct xfs_mount *mp) 986 { 987 cancel_delayed_work_sync(&mp->m_log->l_work); 988 xfs_log_force(mp, XFS_LOG_SYNC); 989 990 /* 991 * The superblock buffer is uncached and while xfs_ail_push_all_sync() 992 * will push it, xfs_wait_buftarg() will not wait for it. Further, 993 * xfs_buf_iowait() cannot be used because it was pushed with the 994 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for 995 * the IO to complete. 996 */ 997 xfs_ail_push_all_sync(mp->m_ail); 998 xfs_wait_buftarg(mp->m_ddev_targp); 999 xfs_buf_lock(mp->m_sb_bp); 1000 xfs_buf_unlock(mp->m_sb_bp); 1001 1002 xfs_log_unmount_write(mp); 1003 } 1004 1005 /* 1006 * Shut down and release the AIL and Log. 1007 * 1008 * During unmount, we need to ensure we flush all the dirty metadata objects 1009 * from the AIL so that the log is empty before we write the unmount record to 1010 * the log. Once this is done, we can tear down the AIL and the log. 1011 */ 1012 void 1013 xfs_log_unmount( 1014 struct xfs_mount *mp) 1015 { 1016 xfs_log_quiesce(mp); 1017 1018 xfs_trans_ail_destroy(mp); 1019 1020 xfs_sysfs_del(&mp->m_log->l_kobj); 1021 1022 xlog_dealloc_log(mp->m_log); 1023 } 1024 1025 void 1026 xfs_log_item_init( 1027 struct xfs_mount *mp, 1028 struct xfs_log_item *item, 1029 int type, 1030 const struct xfs_item_ops *ops) 1031 { 1032 item->li_mountp = mp; 1033 item->li_ailp = mp->m_ail; 1034 item->li_type = type; 1035 item->li_ops = ops; 1036 item->li_lv = NULL; 1037 1038 INIT_LIST_HEAD(&item->li_ail); 1039 INIT_LIST_HEAD(&item->li_cil); 1040 INIT_LIST_HEAD(&item->li_bio_list); 1041 INIT_LIST_HEAD(&item->li_trans); 1042 } 1043 1044 /* 1045 * Wake up processes waiting for log space after we have moved the log tail. 1046 */ 1047 void 1048 xfs_log_space_wake( 1049 struct xfs_mount *mp) 1050 { 1051 struct xlog *log = mp->m_log; 1052 int free_bytes; 1053 1054 if (XLOG_FORCED_SHUTDOWN(log)) 1055 return; 1056 1057 if (!list_empty_careful(&log->l_write_head.waiters)) { 1058 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); 1059 1060 spin_lock(&log->l_write_head.lock); 1061 free_bytes = xlog_space_left(log, &log->l_write_head.grant); 1062 xlog_grant_head_wake(log, &log->l_write_head, &free_bytes); 1063 spin_unlock(&log->l_write_head.lock); 1064 } 1065 1066 if (!list_empty_careful(&log->l_reserve_head.waiters)) { 1067 ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); 1068 1069 spin_lock(&log->l_reserve_head.lock); 1070 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); 1071 xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes); 1072 spin_unlock(&log->l_reserve_head.lock); 1073 } 1074 } 1075 1076 /* 1077 * Determine if we have a transaction that has gone to disk that needs to be 1078 * covered. To begin the transition to the idle state firstly the log needs to 1079 * be idle. That means the CIL, the AIL and the iclogs needs to be empty before 1080 * we start attempting to cover the log. 1081 * 1082 * Only if we are then in a state where covering is needed, the caller is 1083 * informed that dummy transactions are required to move the log into the idle 1084 * state. 
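 *
 * (Summary note; the intermediate transitions happen elsewhere in the log
 * state machine: the covered_state values below step through
 * XLOG_STATE_COVER_NEED -> COVER_DONE -> COVER_NEED2 -> COVER_DONE2, so the
 * log worker normally commits two dummy superblock changes before the log
 * is considered fully covered.)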
1085 * 1086 * If there are any items in the AIl or CIL, then we do not want to attempt to 1087 * cover the log as we may be in a situation where there isn't log space 1088 * available to run a dummy transaction and this can lead to deadlocks when the 1089 * tail of the log is pinned by an item that is modified in the CIL. Hence 1090 * there's no point in running a dummy transaction at this point because we 1091 * can't start trying to idle the log until both the CIL and AIL are empty. 1092 */ 1093 static int 1094 xfs_log_need_covered(xfs_mount_t *mp) 1095 { 1096 struct xlog *log = mp->m_log; 1097 int needed = 0; 1098 1099 if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) 1100 return 0; 1101 1102 if (!xlog_cil_empty(log)) 1103 return 0; 1104 1105 spin_lock(&log->l_icloglock); 1106 switch (log->l_covered_state) { 1107 case XLOG_STATE_COVER_DONE: 1108 case XLOG_STATE_COVER_DONE2: 1109 case XLOG_STATE_COVER_IDLE: 1110 break; 1111 case XLOG_STATE_COVER_NEED: 1112 case XLOG_STATE_COVER_NEED2: 1113 if (xfs_ail_min_lsn(log->l_ailp)) 1114 break; 1115 if (!xlog_iclogs_empty(log)) 1116 break; 1117 1118 needed = 1; 1119 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 1120 log->l_covered_state = XLOG_STATE_COVER_DONE; 1121 else 1122 log->l_covered_state = XLOG_STATE_COVER_DONE2; 1123 break; 1124 default: 1125 needed = 1; 1126 break; 1127 } 1128 spin_unlock(&log->l_icloglock); 1129 return needed; 1130 } 1131 1132 /* 1133 * We may be holding the log iclog lock upon entering this routine. 1134 */ 1135 xfs_lsn_t 1136 xlog_assign_tail_lsn_locked( 1137 struct xfs_mount *mp) 1138 { 1139 struct xlog *log = mp->m_log; 1140 struct xfs_log_item *lip; 1141 xfs_lsn_t tail_lsn; 1142 1143 assert_spin_locked(&mp->m_ail->ail_lock); 1144 1145 /* 1146 * To make sure we always have a valid LSN for the log tail we keep 1147 * track of the last LSN which was committed in log->l_last_sync_lsn, 1148 * and use that when the AIL was empty. 1149 */ 1150 lip = xfs_ail_min(mp->m_ail); 1151 if (lip) 1152 tail_lsn = lip->li_lsn; 1153 else 1154 tail_lsn = atomic64_read(&log->l_last_sync_lsn); 1155 trace_xfs_log_assign_tail_lsn(log, tail_lsn); 1156 atomic64_set(&log->l_tail_lsn, tail_lsn); 1157 return tail_lsn; 1158 } 1159 1160 xfs_lsn_t 1161 xlog_assign_tail_lsn( 1162 struct xfs_mount *mp) 1163 { 1164 xfs_lsn_t tail_lsn; 1165 1166 spin_lock(&mp->m_ail->ail_lock); 1167 tail_lsn = xlog_assign_tail_lsn_locked(mp); 1168 spin_unlock(&mp->m_ail->ail_lock); 1169 1170 return tail_lsn; 1171 } 1172 1173 /* 1174 * Return the space in the log between the tail and the head. The head 1175 * is passed in the cycle/bytes formal parms. In the special case where 1176 * the reserve head has wrapped passed the tail, this calculation is no 1177 * longer valid. In this case, just return 0 which means there is no space 1178 * in the log. This works for all places where this function is called 1179 * with the reserve head. Of course, if the write head were to ever 1180 * wrap the tail, we should blow up. Rather than catch this case here, 1181 * we depend on other ASSERTions in other parts of the code. XXXmiken 1182 * 1183 * This code also handles the case where the reservation head is behind 1184 * the tail. The details of this case are described below, but the end 1185 * result is that we return the size of the log as the amount of space left. 
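 *
 * Worked example (numbers assumed): with l_logsize = 64MB, the tail at
 * cycle 5 / byte 10240 and the grant head at cycle 5 / byte 30720, the head
 * is ahead of the tail within the same cycle, so free space is
 * 64MB - (30720 - 10240) bytes. If instead the head had wrapped to cycle 6
 * at byte 10240 while the tail was still at cycle 5 / byte 30720, free
 * space would be 30720 - 10240 = 20480 bytes.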
1186 */ 1187 STATIC int 1188 xlog_space_left( 1189 struct xlog *log, 1190 atomic64_t *head) 1191 { 1192 int free_bytes; 1193 int tail_bytes; 1194 int tail_cycle; 1195 int head_cycle; 1196 int head_bytes; 1197 1198 xlog_crack_grant_head(head, &head_cycle, &head_bytes); 1199 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); 1200 tail_bytes = BBTOB(tail_bytes); 1201 if (tail_cycle == head_cycle && head_bytes >= tail_bytes) 1202 free_bytes = log->l_logsize - (head_bytes - tail_bytes); 1203 else if (tail_cycle + 1 < head_cycle) 1204 return 0; 1205 else if (tail_cycle < head_cycle) { 1206 ASSERT(tail_cycle == (head_cycle - 1)); 1207 free_bytes = tail_bytes - head_bytes; 1208 } else { 1209 /* 1210 * The reservation head is behind the tail. 1211 * In this case we just want to return the size of the 1212 * log as the amount of space left. 1213 */ 1214 xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); 1215 xfs_alert(log->l_mp, 1216 " tail_cycle = %d, tail_bytes = %d", 1217 tail_cycle, tail_bytes); 1218 xfs_alert(log->l_mp, 1219 " GH cycle = %d, GH bytes = %d", 1220 head_cycle, head_bytes); 1221 ASSERT(0); 1222 free_bytes = log->l_logsize; 1223 } 1224 return free_bytes; 1225 } 1226 1227 1228 static void 1229 xlog_ioend_work( 1230 struct work_struct *work) 1231 { 1232 struct xlog_in_core *iclog = 1233 container_of(work, struct xlog_in_core, ic_end_io_work); 1234 struct xlog *log = iclog->ic_log; 1235 bool aborted = false; 1236 int error; 1237 1238 error = blk_status_to_errno(iclog->ic_bio.bi_status); 1239 #ifdef DEBUG 1240 /* treat writes with injected CRC errors as failed */ 1241 if (iclog->ic_fail_crc) 1242 error = -EIO; 1243 #endif 1244 1245 /* 1246 * Race to shutdown the filesystem if we see an error. 1247 */ 1248 if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { 1249 xfs_alert(log->l_mp, "log I/O error %d", error); 1250 xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); 1251 /* 1252 * This flag will be propagated to the trans-committed 1253 * callback routines to let them know that the log-commit 1254 * didn't succeed. 1255 */ 1256 aborted = true; 1257 } else if (iclog->ic_state & XLOG_STATE_IOERROR) { 1258 aborted = true; 1259 } 1260 1261 xlog_state_done_syncing(iclog, aborted); 1262 bio_uninit(&iclog->ic_bio); 1263 1264 /* 1265 * Drop the lock to signal that we are done. Nothing references the 1266 * iclog after this, so an unmount waiting on this lock can now tear it 1267 * down safely. As such, it is unsafe to reference the iclog after the 1268 * unlock as we could race with it being freed. 1269 */ 1270 up(&iclog->ic_sema); 1271 } 1272 1273 /* 1274 * Return size of each in-core log record buffer. 1275 * 1276 * All machines get 8 x 32kB buffers by default, unless tuned otherwise. 1277 * 1278 * If the filesystem blocksize is too large, we may need to choose a 1279 * larger size since the directory code currently logs entire blocks. 1280 */ 1281 STATIC void 1282 xlog_get_iclog_buffer_size( 1283 struct xfs_mount *mp, 1284 struct xlog *log) 1285 { 1286 if (mp->m_logbufs <= 0) 1287 mp->m_logbufs = XLOG_MAX_ICLOGS; 1288 if (mp->m_logbsize <= 0) 1289 mp->m_logbsize = XLOG_BIG_RECORD_BSIZE; 1290 1291 log->l_iclog_bufs = mp->m_logbufs; 1292 log->l_iclog_size = mp->m_logbsize; 1293 1294 /* 1295 * # headers = size / 32k - one header holds cycles from 32k of data. 
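         *
         * For example (sizes assumed from the common logbsize settings): a
         * 32kB iclog needs a single header, while a 256kB iclog needs
         * DIV_ROUND_UP(256k, 32k) = 8 headers, giving l_iclog_hsize =
         * 8 << BBSHIFT = 4096 bytes of header space per iclog.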
         */
        log->l_iclog_heads =
                DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
        log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
}

void
xfs_log_work_queue(
        struct xfs_mount        *mp)
{
        queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work,
                                msecs_to_jiffies(xfs_syncd_centisecs * 10));
}

/*
 * Every sync period we need to unpin all items in the AIL and push them to
 * disk. If there is nothing dirty, then we might need to cover the log to
 * indicate that the filesystem is idle.
 */
static void
xfs_log_worker(
        struct work_struct      *work)
{
        struct xlog             *log = container_of(to_delayed_work(work),
                                                struct xlog, l_work);
        struct xfs_mount        *mp = log->l_mp;

        /* dgc: errors ignored - not fatal and nowhere to report them */
        if (xfs_log_need_covered(mp)) {
                /*
                 * Dump a transaction into the log that contains no real
                 * change. This is needed to stamp the current tail LSN into
                 * the log during the covering operation.
                 *
                 * We cannot use an inode here for this - that will push dirty
                 * state back up into the VFS and then periodic inode flushing
                 * will prevent log covering from making progress. Hence we
                 * synchronously log the superblock instead to ensure the
                 * superblock is immediately unpinned and can be written back.
                 */
                xfs_sync_sb(mp, true);
        } else
                xfs_log_force(mp, 0);

        /* start pushing all the metadata that is currently dirty */
        xfs_ail_push_all(mp->m_ail);

        /* queue us up again */
        xfs_log_work_queue(mp);
}

/*
 * This routine initializes some of the log structure for a given mount point.
 * Its primary purpose is to fill in enough, so recovery can occur. However,
 * some other stuff may be filled in too.
1351 */ 1352 STATIC struct xlog * 1353 xlog_alloc_log( 1354 struct xfs_mount *mp, 1355 struct xfs_buftarg *log_target, 1356 xfs_daddr_t blk_offset, 1357 int num_bblks) 1358 { 1359 struct xlog *log; 1360 xlog_rec_header_t *head; 1361 xlog_in_core_t **iclogp; 1362 xlog_in_core_t *iclog, *prev_iclog=NULL; 1363 int i; 1364 int error = -ENOMEM; 1365 uint log2_size = 0; 1366 1367 log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); 1368 if (!log) { 1369 xfs_warn(mp, "Log allocation failed: No memory!"); 1370 goto out; 1371 } 1372 1373 log->l_mp = mp; 1374 log->l_targ = log_target; 1375 log->l_logsize = BBTOB(num_bblks); 1376 log->l_logBBstart = blk_offset; 1377 log->l_logBBsize = num_bblks; 1378 log->l_covered_state = XLOG_STATE_COVER_IDLE; 1379 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1380 INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); 1381 1382 log->l_prev_block = -1; 1383 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1384 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); 1385 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); 1386 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ 1387 1388 xlog_grant_head_init(&log->l_reserve_head); 1389 xlog_grant_head_init(&log->l_write_head); 1390 1391 error = -EFSCORRUPTED; 1392 if (xfs_sb_version_hassector(&mp->m_sb)) { 1393 log2_size = mp->m_sb.sb_logsectlog; 1394 if (log2_size < BBSHIFT) { 1395 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", 1396 log2_size, BBSHIFT); 1397 goto out_free_log; 1398 } 1399 1400 log2_size -= BBSHIFT; 1401 if (log2_size > mp->m_sectbb_log) { 1402 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)", 1403 log2_size, mp->m_sectbb_log); 1404 goto out_free_log; 1405 } 1406 1407 /* for larger sector sizes, must have v2 or external log */ 1408 if (log2_size && log->l_logBBstart > 0 && 1409 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1410 xfs_warn(mp, 1411 "log sector size (0x%x) invalid for configuration.", 1412 log2_size); 1413 goto out_free_log; 1414 } 1415 } 1416 log->l_sectBBsize = 1 << log2_size; 1417 1418 xlog_get_iclog_buffer_size(mp, log); 1419 1420 spin_lock_init(&log->l_icloglock); 1421 init_waitqueue_head(&log->l_flush_wait); 1422 1423 iclogp = &log->l_iclog; 1424 /* 1425 * The amount of memory to allocate for the iclog structure is 1426 * rather funky due to the way the structure is defined. It is 1427 * done this way so that we can use different sizes for machines 1428 * with different amounts of memory. See the definition of 1429 * xlog_in_core_t in xfs_log_priv.h for details. 1430 */ 1431 ASSERT(log->l_iclog_size >= 4096); 1432 for (i = 0; i < log->l_iclog_bufs; i++) { 1433 int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); 1434 size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * 1435 sizeof(struct bio_vec); 1436 1437 iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL); 1438 if (!iclog) 1439 goto out_free_iclog; 1440 1441 *iclogp = iclog; 1442 iclog->ic_prev = prev_iclog; 1443 prev_iclog = iclog; 1444 1445 iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, 1446 KM_MAYFAIL | KM_ZERO); 1447 if (!iclog->ic_data) 1448 goto out_free_iclog; 1449 #ifdef DEBUG 1450 log->l_iclog_bak[i] = &iclog->ic_header; 1451 #endif 1452 head = &iclog->ic_header; 1453 memset(head, 0, sizeof(xlog_rec_header_t)); 1454 head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1455 head->h_version = cpu_to_be32( 1456 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 
2 : 1); 1457 head->h_size = cpu_to_be32(log->l_iclog_size); 1458 /* new fields */ 1459 head->h_fmt = cpu_to_be32(XLOG_FMT); 1460 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); 1461 1462 iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; 1463 iclog->ic_state = XLOG_STATE_ACTIVE; 1464 iclog->ic_log = log; 1465 atomic_set(&iclog->ic_refcnt, 0); 1466 spin_lock_init(&iclog->ic_callback_lock); 1467 INIT_LIST_HEAD(&iclog->ic_callbacks); 1468 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; 1469 1470 init_waitqueue_head(&iclog->ic_force_wait); 1471 init_waitqueue_head(&iclog->ic_write_wait); 1472 INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work); 1473 sema_init(&iclog->ic_sema, 1); 1474 1475 iclogp = &iclog->ic_next; 1476 } 1477 *iclogp = log->l_iclog; /* complete ring */ 1478 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1479 1480 log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", 1481 WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, 1482 mp->m_fsname); 1483 if (!log->l_ioend_workqueue) 1484 goto out_free_iclog; 1485 1486 error = xlog_cil_init(log); 1487 if (error) 1488 goto out_destroy_workqueue; 1489 return log; 1490 1491 out_destroy_workqueue: 1492 destroy_workqueue(log->l_ioend_workqueue); 1493 out_free_iclog: 1494 for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { 1495 prev_iclog = iclog->ic_next; 1496 kmem_free(iclog->ic_data); 1497 kmem_free(iclog); 1498 } 1499 out_free_log: 1500 kmem_free(log); 1501 out: 1502 return ERR_PTR(error); 1503 } /* xlog_alloc_log */ 1504 1505 1506 /* 1507 * Write out the commit record of a transaction associated with the given 1508 * ticket. Return the lsn of the commit record. 1509 */ 1510 STATIC int 1511 xlog_commit_record( 1512 struct xlog *log, 1513 struct xlog_ticket *ticket, 1514 struct xlog_in_core **iclog, 1515 xfs_lsn_t *commitlsnp) 1516 { 1517 struct xfs_mount *mp = log->l_mp; 1518 int error; 1519 struct xfs_log_iovec reg = { 1520 .i_addr = NULL, 1521 .i_len = 0, 1522 .i_type = XLOG_REG_TYPE_COMMIT, 1523 }; 1524 struct xfs_log_vec vec = { 1525 .lv_niovecs = 1, 1526 .lv_iovecp = ®, 1527 }; 1528 1529 ASSERT_ALWAYS(iclog); 1530 error = xlog_write(log, &vec, ticket, commitlsnp, iclog, 1531 XLOG_COMMIT_TRANS); 1532 if (error) 1533 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1534 return error; 1535 } 1536 1537 /* 1538 * Push on the buffer cache code if we ever use more than 75% of the on-disk 1539 * log space. This code pushes on the lsn which would supposedly free up 1540 * the 25% which we want to leave free. We may need to adopt a policy which 1541 * pushes on an lsn which is further along in the log once we reach the high 1542 * water mark. In this manner, we would be creating a low water mark. 1543 */ 1544 STATIC void 1545 xlog_grant_push_ail( 1546 struct xlog *log, 1547 int need_bytes) 1548 { 1549 xfs_lsn_t threshold_lsn = 0; 1550 xfs_lsn_t last_sync_lsn; 1551 int free_blocks; 1552 int free_bytes; 1553 int threshold_block; 1554 int threshold_cycle; 1555 int free_threshold; 1556 1557 ASSERT(BTOBB(need_bytes) < log->l_logBBsize); 1558 1559 free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); 1560 free_blocks = BTOBBT(free_bytes); 1561 1562 /* 1563 * Set the threshold for the minimum number of free blocks in the 1564 * log to the maximum of what the caller needs, one quarter of the 1565 * log, and 256 blocks. 
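         *
         * For instance (numbers assumed): on a 128MB log (262144 basic
         * blocks), a 4kB reservation gives free_threshold =
         * max(BTOBB(4096), 262144 >> 2, 256) = 65536 basic blocks, so the
         * AIL is only pushed once less than a quarter of the log remains
         * free.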
1566 */ 1567 free_threshold = BTOBB(need_bytes); 1568 free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); 1569 free_threshold = max(free_threshold, 256); 1570 if (free_blocks >= free_threshold) 1571 return; 1572 1573 xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, 1574 &threshold_block); 1575 threshold_block += free_threshold; 1576 if (threshold_block >= log->l_logBBsize) { 1577 threshold_block -= log->l_logBBsize; 1578 threshold_cycle += 1; 1579 } 1580 threshold_lsn = xlog_assign_lsn(threshold_cycle, 1581 threshold_block); 1582 /* 1583 * Don't pass in an lsn greater than the lsn of the last 1584 * log record known to be on disk. Use a snapshot of the last sync lsn 1585 * so that it doesn't change between the compare and the set. 1586 */ 1587 last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); 1588 if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) 1589 threshold_lsn = last_sync_lsn; 1590 1591 /* 1592 * Get the transaction layer to kick the dirty buffers out to 1593 * disk asynchronously. No point in trying to do this if 1594 * the filesystem is shutting down. 1595 */ 1596 if (!XLOG_FORCED_SHUTDOWN(log)) 1597 xfs_ail_push(log->l_ailp, threshold_lsn); 1598 } 1599 1600 /* 1601 * Stamp cycle number in every block 1602 */ 1603 STATIC void 1604 xlog_pack_data( 1605 struct xlog *log, 1606 struct xlog_in_core *iclog, 1607 int roundoff) 1608 { 1609 int i, j, k; 1610 int size = iclog->ic_offset + roundoff; 1611 __be32 cycle_lsn; 1612 char *dp; 1613 1614 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 1615 1616 dp = iclog->ic_datap; 1617 for (i = 0; i < BTOBB(size); i++) { 1618 if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) 1619 break; 1620 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; 1621 *(__be32 *)dp = cycle_lsn; 1622 dp += BBSIZE; 1623 } 1624 1625 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 1626 xlog_in_core_2_t *xhdr = iclog->ic_data; 1627 1628 for ( ; i < BTOBB(size); i++) { 1629 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 1630 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 1631 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; 1632 *(__be32 *)dp = cycle_lsn; 1633 dp += BBSIZE; 1634 } 1635 1636 for (i = 1; i < log->l_iclog_heads; i++) 1637 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 1638 } 1639 } 1640 1641 /* 1642 * Calculate the checksum for a log buffer. 1643 * 1644 * This is a little more complicated than it should be because the various 1645 * headers and the actual data are non-contiguous. 1646 */ 1647 __le32 1648 xlog_cksum( 1649 struct xlog *log, 1650 struct xlog_rec_header *rhead, 1651 char *dp, 1652 int size) 1653 { 1654 uint32_t crc; 1655 1656 /* first generate the crc for the record header ... */ 1657 crc = xfs_start_cksum_update((char *)rhead, 1658 sizeof(struct xlog_rec_header), 1659 offsetof(struct xlog_rec_header, h_crc)); 1660 1661 /* ... then for additional cycle data for v2 logs ... */ 1662 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 1663 union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; 1664 int i; 1665 int xheads; 1666 1667 xheads = size / XLOG_HEADER_CYCLE_SIZE; 1668 if (size % XLOG_HEADER_CYCLE_SIZE) 1669 xheads++; 1670 1671 for (i = 1; i < xheads; i++) { 1672 crc = crc32c(crc, &xhdr[i].hic_xheader, 1673 sizeof(struct xlog_rec_ext_header)); 1674 } 1675 } 1676 1677 /* ... 
and finally for the payload */
        crc = crc32c(crc, dp, size);

        return xfs_end_cksum(crc);
}

static void
xlog_bio_end_io(
        struct bio              *bio)
{
        struct xlog_in_core     *iclog = bio->bi_private;

        queue_work(iclog->ic_log->l_ioend_workqueue,
                   &iclog->ic_end_io_work);
}

static void
xlog_map_iclog_data(
        struct bio              *bio,
        void                    *data,
        size_t                  count)
{
        do {
                struct page     *page = kmem_to_page(data);
                unsigned int    off = offset_in_page(data);
                size_t          len = min_t(size_t, count, PAGE_SIZE - off);

                WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len);

                data += len;
                count -= len;
        } while (count);
}

STATIC void
xlog_write_iclog(
        struct xlog             *log,
        struct xlog_in_core     *iclog,
        uint64_t                bno,
        unsigned int            count,
        bool                    need_flush)
{
        ASSERT(bno < log->l_logBBsize);

        /*
         * We lock the iclogbufs here so that we can serialise against I/O
         * completion during unmount. We might be processing a shutdown
         * triggered during unmount, and that can occur asynchronously to the
         * unmount thread, and hence we need to ensure that completes before
         * tearing down the iclogbufs. Hence we need to hold the buffer lock
         * across the log IO to achieve that.
         */
        down(&iclog->ic_sema);
        if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) {
                /*
                 * It would seem logical to return EIO here, but we rely on
                 * the log state machine to propagate I/O errors instead of
                 * doing it here. We kick off the state machine and unlock
                 * the buffer manually, the code needs to be kept in sync
                 * with the I/O completion path.
                 */
                xlog_state_done_syncing(iclog, XFS_LI_ABORTED);
                up(&iclog->ic_sema);
                return;
        }

        iclog->ic_io_size = count;

        bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
        bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
        iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
        iclog->ic_bio.bi_end_io = xlog_bio_end_io;
        iclog->ic_bio.bi_private = iclog;
        iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
        if (need_flush)
                iclog->ic_bio.bi_opf |= REQ_PREFLUSH;

        xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size);
        if (is_vmalloc_addr(iclog->ic_data))
                flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size);

        /*
         * If this log buffer would straddle the end of the log we will have
         * to split it up into two bios, so that we can continue at the start.
         */
        if (bno + BTOBB(count) > log->l_logBBsize) {
                struct bio *split;

                split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
                                  GFP_NOIO, &fs_bio_set);
                bio_chain(split, &iclog->ic_bio);
                submit_bio(split);

                /* restart at logical offset zero for the remainder */
                iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
        }

        submit_bio(&iclog->ic_bio);
}

/*
 * We need to bump cycle number for the part of the iclog that is
 * written to the start of the log. Watch out for the header magic
 * number case, though.
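 *
 * For example (illustrative): blocks that xlog_pack_data() stamped with
 * cycle 4 and that wrap around to the physical start of the log are
 * restamped with cycle 5 here. If the bumped value would collide with
 * XLOG_HEADER_MAGIC_NUM, which could make the block look like a record
 * header to recovery, that value is skipped and the next one used.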
 */
static void
xlog_split_iclog(
        struct xlog             *log,
        void                    *data,
        uint64_t                bno,
        unsigned int            count)
{
        unsigned int            split_offset = BBTOB(log->l_logBBsize - bno);
        unsigned int            i;

        for (i = split_offset; i < count; i += BBSIZE) {
                uint32_t        cycle = get_unaligned_be32(data + i);

                if (++cycle == XLOG_HEADER_MAGIC_NUM)
                        cycle++;
                put_unaligned_be32(cycle, data + i);
        }
}

static int
xlog_calc_iclog_size(
        struct xlog             *log,
        struct xlog_in_core     *iclog,
        uint32_t                *roundoff)
{
        uint32_t                count_init, count;
        bool                    use_lsunit;

        use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
                        log->l_mp->m_sb.sb_logsunit > 1;

        /* Add for LR header */
        count_init = log->l_iclog_hsize + iclog->ic_offset;

        /* Round out the log write size */
        if (use_lsunit) {
                /* we have a v2 stripe unit to use */
                count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
        } else {
                count = BBTOB(BTOBB(count_init));
        }

        ASSERT(count >= count_init);
        *roundoff = count - count_init;

        if (use_lsunit)
                ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
        else
                ASSERT(*roundoff < BBTOB(1));
        return count;
}

/*
 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
 * fashion. Previously, we should have moved the current iclog
 * ptr in the log to point to the next available iclog. This allows further
 * writes to continue while this code syncs out an iclog ready to go.
 * Before an in-core log can be written out, the data section must be scanned
 * to save away the 1st word of each BBSIZE block into the header. We replace
 * it with the current cycle count. Each BBSIZE block is tagged with the
 * cycle count because there is an implicit assumption that drives will
 * guarantee that entire 512 byte blocks get written at once. In other words,
 * we can't have part of a 512 byte block written and part not written. By
 * tagging each block, we will know which blocks are valid when recovering
 * after an unclean shutdown.
 *
 * This routine is single threaded on the iclog. No other thread can be in
 * this routine with the same iclog. Changing contents of iclog can therefore
 * be done without grabbing the state machine lock. Updating the global
 * log will require grabbing the lock though.
 *
 * The entire log manager uses a logical block numbering scheme. Only
 * xlog_write_iclog knows about the fact that the log may not start with
 * block zero on a given device.
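 *
 * A small worked case of the roundoff handling below (sizes assumed): an
 * iclog carrying 9000 bytes of data plus a 512 byte header, on a
 * filesystem without a log stripe unit, is rounded up to
 * BBTOB(BTOBB(9512)) = 9728 bytes. Hence roundoff = 216 and both grant
 * heads are moved forward by those 216 bytes so the space accounting
 * stays in step with what is physically written.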
1856 */ 1857 STATIC void 1858 xlog_sync( 1859 struct xlog *log, 1860 struct xlog_in_core *iclog) 1861 { 1862 unsigned int count; /* byte count of bwrite */ 1863 unsigned int roundoff; /* roundoff to BB or stripe */ 1864 uint64_t bno; 1865 unsigned int size; 1866 bool need_flush = true, split = false; 1867 1868 ASSERT(atomic_read(&iclog->ic_refcnt) == 0); 1869 1870 count = xlog_calc_iclog_size(log, iclog, &roundoff); 1871 1872 /* move grant heads by roundoff in sync */ 1873 xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); 1874 xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); 1875 1876 /* put cycle number in every block */ 1877 xlog_pack_data(log, iclog, roundoff); 1878 1879 /* real byte length */ 1880 size = iclog->ic_offset; 1881 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) 1882 size += roundoff; 1883 iclog->ic_header.h_len = cpu_to_be32(size); 1884 1885 XFS_STATS_INC(log->l_mp, xs_log_writes); 1886 XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); 1887 1888 bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); 1889 1890 /* Do we need to split this write into 2 parts? */ 1891 if (bno + BTOBB(count) > log->l_logBBsize) { 1892 xlog_split_iclog(log, &iclog->ic_header, bno, count); 1893 split = true; 1894 } 1895 1896 /* calculcate the checksum */ 1897 iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, 1898 iclog->ic_datap, size); 1899 /* 1900 * Intentionally corrupt the log record CRC based on the error injection 1901 * frequency, if defined. This facilitates testing log recovery in the 1902 * event of torn writes. Hence, set the IOABORT state to abort the log 1903 * write on I/O completion and shutdown the fs. The subsequent mount 1904 * detects the bad CRC and attempts to recover. 1905 */ 1906 #ifdef DEBUG 1907 if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { 1908 iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); 1909 iclog->ic_fail_crc = true; 1910 xfs_warn(log->l_mp, 1911 "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", 1912 be64_to_cpu(iclog->ic_header.h_lsn)); 1913 } 1914 #endif 1915 1916 /* 1917 * Flush the data device before flushing the log to make sure all meta 1918 * data written back from the AIL actually made it to disk before 1919 * stamping the new log tail LSN into the log buffer. For an external 1920 * log we need to issue the flush explicitly, and unfortunately 1921 * synchronously here; for an internal log we can simply use the block 1922 * layer state machine for preflushes. 1923 */ 1924 if (log->l_targ != log->l_mp->m_ddev_targp || split) { 1925 xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); 1926 need_flush = false; 1927 } 1928 1929 xlog_verify_iclog(log, iclog, count); 1930 xlog_write_iclog(log, iclog, bno, count, need_flush); 1931 } 1932 1933 /* 1934 * Deallocate a log structure 1935 */ 1936 STATIC void 1937 xlog_dealloc_log( 1938 struct xlog *log) 1939 { 1940 xlog_in_core_t *iclog, *next_iclog; 1941 int i; 1942 1943 xlog_cil_destroy(log); 1944 1945 /* 1946 * Cycle all the iclogbuf locks to make sure all log IO completion 1947 * is done before we tear down these buffers. 
1948 */ 1949 iclog = log->l_iclog; 1950 for (i = 0; i < log->l_iclog_bufs; i++) { 1951 down(&iclog->ic_sema); 1952 up(&iclog->ic_sema); 1953 iclog = iclog->ic_next; 1954 } 1955 1956 iclog = log->l_iclog; 1957 for (i = 0; i < log->l_iclog_bufs; i++) { 1958 next_iclog = iclog->ic_next; 1959 kmem_free(iclog->ic_data); 1960 kmem_free(iclog); 1961 iclog = next_iclog; 1962 } 1963 1964 log->l_mp->m_log = NULL; 1965 destroy_workqueue(log->l_ioend_workqueue); 1966 kmem_free(log); 1967 } /* xlog_dealloc_log */ 1968 1969 /* 1970 * Update counters atomically now that memcpy is done. 1971 */ 1972 /* ARGSUSED */ 1973 static inline void 1974 xlog_state_finish_copy( 1975 struct xlog *log, 1976 struct xlog_in_core *iclog, 1977 int record_cnt, 1978 int copy_bytes) 1979 { 1980 spin_lock(&log->l_icloglock); 1981 1982 be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); 1983 iclog->ic_offset += copy_bytes; 1984 1985 spin_unlock(&log->l_icloglock); 1986 } /* xlog_state_finish_copy */ 1987 1988 1989 1990 1991 /* 1992 * print out info relating to regions written which consume 1993 * the reservation 1994 */ 1995 void 1996 xlog_print_tic_res( 1997 struct xfs_mount *mp, 1998 struct xlog_ticket *ticket) 1999 { 2000 uint i; 2001 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 2002 2003 /* match with XLOG_REG_TYPE_* in xfs_log.h */ 2004 #define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str 2005 static char *res_type_str[] = { 2006 REG_TYPE_STR(BFORMAT, "bformat"), 2007 REG_TYPE_STR(BCHUNK, "bchunk"), 2008 REG_TYPE_STR(EFI_FORMAT, "efi_format"), 2009 REG_TYPE_STR(EFD_FORMAT, "efd_format"), 2010 REG_TYPE_STR(IFORMAT, "iformat"), 2011 REG_TYPE_STR(ICORE, "icore"), 2012 REG_TYPE_STR(IEXT, "iext"), 2013 REG_TYPE_STR(IBROOT, "ibroot"), 2014 REG_TYPE_STR(ILOCAL, "ilocal"), 2015 REG_TYPE_STR(IATTR_EXT, "iattr_ext"), 2016 REG_TYPE_STR(IATTR_BROOT, "iattr_broot"), 2017 REG_TYPE_STR(IATTR_LOCAL, "iattr_local"), 2018 REG_TYPE_STR(QFORMAT, "qformat"), 2019 REG_TYPE_STR(DQUOT, "dquot"), 2020 REG_TYPE_STR(QUOTAOFF, "quotaoff"), 2021 REG_TYPE_STR(LRHEADER, "LR header"), 2022 REG_TYPE_STR(UNMOUNT, "unmount"), 2023 REG_TYPE_STR(COMMIT, "commit"), 2024 REG_TYPE_STR(TRANSHDR, "trans header"), 2025 REG_TYPE_STR(ICREATE, "inode create"), 2026 REG_TYPE_STR(RUI_FORMAT, "rui_format"), 2027 REG_TYPE_STR(RUD_FORMAT, "rud_format"), 2028 REG_TYPE_STR(CUI_FORMAT, "cui_format"), 2029 REG_TYPE_STR(CUD_FORMAT, "cud_format"), 2030 REG_TYPE_STR(BUI_FORMAT, "bui_format"), 2031 REG_TYPE_STR(BUD_FORMAT, "bud_format"), 2032 }; 2033 BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); 2034 #undef REG_TYPE_STR 2035 2036 xfs_warn(mp, "ticket reservation summary:"); 2037 xfs_warn(mp, " unit res = %d bytes", 2038 ticket->t_unit_res); 2039 xfs_warn(mp, " current res = %d bytes", 2040 ticket->t_curr_res); 2041 xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)", 2042 ticket->t_res_arr_sum, ticket->t_res_o_flow); 2043 xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)", 2044 ticket->t_res_num_ophdrs, ophdr_spc); 2045 xfs_warn(mp, " ophdr + reg = %u bytes", 2046 ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc); 2047 xfs_warn(mp, " num regions = %u", 2048 ticket->t_res_num); 2049 2050 for (i = 0; i < ticket->t_res_num; i++) { 2051 uint r_type = ticket->t_res_arr[i].r_type; 2052 xfs_warn(mp, "region[%u]: %s - %u bytes", i, 2053 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 
2054 "bad-rtype" : res_type_str[r_type]), 2055 ticket->t_res_arr[i].r_len); 2056 } 2057 } 2058 2059 /* 2060 * Print a summary of the transaction. 2061 */ 2062 void 2063 xlog_print_trans( 2064 struct xfs_trans *tp) 2065 { 2066 struct xfs_mount *mp = tp->t_mountp; 2067 struct xfs_log_item *lip; 2068 2069 /* dump core transaction and ticket info */ 2070 xfs_warn(mp, "transaction summary:"); 2071 xfs_warn(mp, " log res = %d", tp->t_log_res); 2072 xfs_warn(mp, " log count = %d", tp->t_log_count); 2073 xfs_warn(mp, " flags = 0x%x", tp->t_flags); 2074 2075 xlog_print_tic_res(mp, tp->t_ticket); 2076 2077 /* dump each log item */ 2078 list_for_each_entry(lip, &tp->t_items, li_trans) { 2079 struct xfs_log_vec *lv = lip->li_lv; 2080 struct xfs_log_iovec *vec; 2081 int i; 2082 2083 xfs_warn(mp, "log item: "); 2084 xfs_warn(mp, " type = 0x%x", lip->li_type); 2085 xfs_warn(mp, " flags = 0x%lx", lip->li_flags); 2086 if (!lv) 2087 continue; 2088 xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); 2089 xfs_warn(mp, " size = %d", lv->lv_size); 2090 xfs_warn(mp, " bytes = %d", lv->lv_bytes); 2091 xfs_warn(mp, " buf len = %d", lv->lv_buf_len); 2092 2093 /* dump each iovec for the log item */ 2094 vec = lv->lv_iovecp; 2095 for (i = 0; i < lv->lv_niovecs; i++) { 2096 int dumplen = min(vec->i_len, 32); 2097 2098 xfs_warn(mp, " iovec[%d]", i); 2099 xfs_warn(mp, " type = 0x%x", vec->i_type); 2100 xfs_warn(mp, " len = %d", vec->i_len); 2101 xfs_warn(mp, " first %d bytes of iovec[%d]:", dumplen, i); 2102 xfs_hex_dump(vec->i_addr, dumplen); 2103 2104 vec++; 2105 } 2106 } 2107 } 2108 2109 /* 2110 * Calculate the potential space needed by the log vector. Each region gets 2111 * its own xlog_op_header_t and may need to be double word aligned. 2112 */ 2113 static int 2114 xlog_write_calc_vec_length( 2115 struct xlog_ticket *ticket, 2116 struct xfs_log_vec *log_vector) 2117 { 2118 struct xfs_log_vec *lv; 2119 int headers = 0; 2120 int len = 0; 2121 int i; 2122 2123 /* acct for start rec of xact */ 2124 if (ticket->t_flags & XLOG_TIC_INITED) 2125 headers++; 2126 2127 for (lv = log_vector; lv; lv = lv->lv_next) { 2128 /* we don't write ordered log vectors */ 2129 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) 2130 continue; 2131 2132 headers += lv->lv_niovecs; 2133 2134 for (i = 0; i < lv->lv_niovecs; i++) { 2135 struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; 2136 2137 len += vecp->i_len; 2138 xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); 2139 } 2140 } 2141 2142 ticket->t_res_num_ophdrs += headers; 2143 len += headers * sizeof(struct xlog_op_header); 2144 2145 return len; 2146 } 2147 2148 /* 2149 * If first write for transaction, insert start record We can't be trying to 2150 * commit if we are inited. We can't have any "partial_copy" if we are inited. 
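 *
 * Put another way (a restatement of the accounting, not new behaviour): the
 * first region written for a still-INITED ticket ends up preceded by two op
 * headers - the start record emitted here plus the usual per-region header
 * from xlog_write_setup_ophdr() - and xlog_write_calc_vec_length() has
 * already counted both of them in the vector length.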
2151 */ 2152 static int 2153 xlog_write_start_rec( 2154 struct xlog_op_header *ophdr, 2155 struct xlog_ticket *ticket) 2156 { 2157 if (!(ticket->t_flags & XLOG_TIC_INITED)) 2158 return 0; 2159 2160 ophdr->oh_tid = cpu_to_be32(ticket->t_tid); 2161 ophdr->oh_clientid = ticket->t_clientid; 2162 ophdr->oh_len = 0; 2163 ophdr->oh_flags = XLOG_START_TRANS; 2164 ophdr->oh_res2 = 0; 2165 2166 ticket->t_flags &= ~XLOG_TIC_INITED; 2167 2168 return sizeof(struct xlog_op_header); 2169 } 2170 2171 static xlog_op_header_t * 2172 xlog_write_setup_ophdr( 2173 struct xlog *log, 2174 struct xlog_op_header *ophdr, 2175 struct xlog_ticket *ticket, 2176 uint flags) 2177 { 2178 ophdr->oh_tid = cpu_to_be32(ticket->t_tid); 2179 ophdr->oh_clientid = ticket->t_clientid; 2180 ophdr->oh_res2 = 0; 2181 2182 /* are we copying a commit or unmount record? */ 2183 ophdr->oh_flags = flags; 2184 2185 /* 2186 * We've seen logs corrupted with bad transaction client ids. This 2187 * makes sure that XFS doesn't generate them on. Turn this into an EIO 2188 * and shut down the filesystem. 2189 */ 2190 switch (ophdr->oh_clientid) { 2191 case XFS_TRANSACTION: 2192 case XFS_VOLUME: 2193 case XFS_LOG: 2194 break; 2195 default: 2196 xfs_warn(log->l_mp, 2197 "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT, 2198 ophdr->oh_clientid, ticket); 2199 return NULL; 2200 } 2201 2202 return ophdr; 2203 } 2204 2205 /* 2206 * Set up the parameters of the region copy into the log. This has 2207 * to handle region write split across multiple log buffers - this 2208 * state is kept external to this function so that this code can 2209 * be written in an obvious, self documenting manner. 2210 */ 2211 static int 2212 xlog_write_setup_copy( 2213 struct xlog_ticket *ticket, 2214 struct xlog_op_header *ophdr, 2215 int space_available, 2216 int space_required, 2217 int *copy_off, 2218 int *copy_len, 2219 int *last_was_partial_copy, 2220 int *bytes_consumed) 2221 { 2222 int still_to_copy; 2223 2224 still_to_copy = space_required - *bytes_consumed; 2225 *copy_off = *bytes_consumed; 2226 2227 if (still_to_copy <= space_available) { 2228 /* write of region completes here */ 2229 *copy_len = still_to_copy; 2230 ophdr->oh_len = cpu_to_be32(*copy_len); 2231 if (*last_was_partial_copy) 2232 ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); 2233 *last_was_partial_copy = 0; 2234 *bytes_consumed = 0; 2235 return 0; 2236 } 2237 2238 /* partial write of region, needs extra log op header reservation */ 2239 *copy_len = space_available; 2240 ophdr->oh_len = cpu_to_be32(*copy_len); 2241 ophdr->oh_flags |= XLOG_CONTINUE_TRANS; 2242 if (*last_was_partial_copy) 2243 ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; 2244 *bytes_consumed += *copy_len; 2245 (*last_was_partial_copy)++; 2246 2247 /* account for new log op header */ 2248 ticket->t_curr_res -= sizeof(struct xlog_op_header); 2249 ticket->t_res_num_ophdrs++; 2250 2251 return sizeof(struct xlog_op_header); 2252 } 2253 2254 static int 2255 xlog_write_copy_finish( 2256 struct xlog *log, 2257 struct xlog_in_core *iclog, 2258 uint flags, 2259 int *record_cnt, 2260 int *data_cnt, 2261 int *partial_copy, 2262 int *partial_copy_len, 2263 int log_offset, 2264 struct xlog_in_core **commit_iclog) 2265 { 2266 if (*partial_copy) { 2267 /* 2268 * This iclog has already been marked WANT_SYNC by 2269 * xlog_state_get_iclog_space. 
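 *
 * (Restating the flow below: xlog_state_finish_copy() folds the local record
 * and data counts into the iclog - h_num_logops and ic_offset - so the
 * caller's counters are zeroed before the now-full iclog is released for
 * I/O.)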
2270 */ 2271 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); 2272 *record_cnt = 0; 2273 *data_cnt = 0; 2274 return xlog_state_release_iclog(log, iclog); 2275 } 2276 2277 *partial_copy = 0; 2278 *partial_copy_len = 0; 2279 2280 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { 2281 /* no more space in this iclog - push it. */ 2282 xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); 2283 *record_cnt = 0; 2284 *data_cnt = 0; 2285 2286 spin_lock(&log->l_icloglock); 2287 xlog_state_want_sync(log, iclog); 2288 spin_unlock(&log->l_icloglock); 2289 2290 if (!commit_iclog) 2291 return xlog_state_release_iclog(log, iclog); 2292 ASSERT(flags & XLOG_COMMIT_TRANS); 2293 *commit_iclog = iclog; 2294 } 2295 2296 return 0; 2297 } 2298 2299 /* 2300 * Write some region out to in-core log 2301 * 2302 * This will be called when writing externally provided regions or when 2303 * writing out a commit record for a given transaction. 2304 * 2305 * General algorithm: 2306 * 1. Find total length of this write. This may include adding to the 2307 * lengths passed in. 2308 * 2. Check whether we violate the tickets reservation. 2309 * 3. While writing to this iclog 2310 * A. Reserve as much space in this iclog as can get 2311 * B. If this is first write, save away start lsn 2312 * C. While writing this region: 2313 * 1. If first write of transaction, write start record 2314 * 2. Write log operation header (header per region) 2315 * 3. Find out if we can fit entire region into this iclog 2316 * 4. Potentially, verify destination memcpy ptr 2317 * 5. Memcpy (partial) region 2318 * 6. If partial copy, release iclog; otherwise, continue 2319 * copying more regions into current iclog 2320 * 4. Mark want sync bit (in simulation mode) 2321 * 5. Release iclog for potential flush to on-disk log. 2322 * 2323 * ERRORS: 2324 * 1. Panic if reservation is overrun. This should never happen since 2325 * reservation amounts are generated internal to the filesystem. 2326 * NOTES: 2327 * 1. Tickets are single threaded data structures. 2328 * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the 2329 * syncing routine. When a single log_write region needs to span 2330 * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set 2331 * on all log operation writes which don't contain the end of the 2332 * region. The XLOG_END_TRANS bit is used for the in-core log 2333 * operation which contains the end of the continued log_write region. 2334 * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog, 2335 * we don't really know exactly how much space will be used. As a result, 2336 * we don't update ic_offset until the end when we know exactly how many 2337 * bytes have been written out. 2338 */ 2339 int 2340 xlog_write( 2341 struct xlog *log, 2342 struct xfs_log_vec *log_vector, 2343 struct xlog_ticket *ticket, 2344 xfs_lsn_t *start_lsn, 2345 struct xlog_in_core **commit_iclog, 2346 uint flags) 2347 { 2348 struct xlog_in_core *iclog = NULL; 2349 struct xfs_log_iovec *vecp; 2350 struct xfs_log_vec *lv; 2351 int len; 2352 int index; 2353 int partial_copy = 0; 2354 int partial_copy_len = 0; 2355 int contwr = 0; 2356 int record_cnt = 0; 2357 int data_cnt = 0; 2358 int error; 2359 2360 *start_lsn = 0; 2361 2362 len = xlog_write_calc_vec_length(ticket, log_vector); 2363 2364 /* 2365 * Region headers and bytes are already accounted for. 2366 * We only need to take into account start records and 2367 * split regions in this function. 
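 *
 * Concretely (a summary of the deductions below, nothing new): the charges
 * taken directly from t_curr_res in this function are one op header for the
 * start record if the ticket is still INITED, one for a commit or unmount
 * record, and one per continuation header added by xlog_write_setup_copy()
 * when a region is split across iclogs. xlog_state_get_iclog_space()
 * additionally charges for the log record header when this transaction is
 * the first into an empty iclog.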
2368 */ 2369 if (ticket->t_flags & XLOG_TIC_INITED) 2370 ticket->t_curr_res -= sizeof(xlog_op_header_t); 2371 2372 /* 2373 * Commit record headers need to be accounted for. These 2374 * come in as separate writes so are easy to detect. 2375 */ 2376 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) 2377 ticket->t_curr_res -= sizeof(xlog_op_header_t); 2378 2379 if (ticket->t_curr_res < 0) { 2380 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, 2381 "ctx ticket reservation ran out. Need to up reservation"); 2382 xlog_print_tic_res(log->l_mp, ticket); 2383 xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); 2384 } 2385 2386 index = 0; 2387 lv = log_vector; 2388 vecp = lv->lv_iovecp; 2389 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { 2390 void *ptr; 2391 int log_offset; 2392 2393 error = xlog_state_get_iclog_space(log, len, &iclog, ticket, 2394 &contwr, &log_offset); 2395 if (error) 2396 return error; 2397 2398 ASSERT(log_offset <= iclog->ic_size - 1); 2399 ptr = iclog->ic_datap + log_offset; 2400 2401 /* start_lsn is the first lsn written to. That's all we need. */ 2402 if (!*start_lsn) 2403 *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2404 2405 /* 2406 * This loop writes out as many regions as can fit in the amount 2407 * of space which was allocated by xlog_state_get_iclog_space(). 2408 */ 2409 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { 2410 struct xfs_log_iovec *reg; 2411 struct xlog_op_header *ophdr; 2412 int start_rec_copy; 2413 int copy_len; 2414 int copy_off; 2415 bool ordered = false; 2416 2417 /* ordered log vectors have no regions to write */ 2418 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { 2419 ASSERT(lv->lv_niovecs == 0); 2420 ordered = true; 2421 goto next_lv; 2422 } 2423 2424 reg = &vecp[index]; 2425 ASSERT(reg->i_len % sizeof(int32_t) == 0); 2426 ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); 2427 2428 start_rec_copy = xlog_write_start_rec(ptr, ticket); 2429 if (start_rec_copy) { 2430 record_cnt++; 2431 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2432 start_rec_copy); 2433 } 2434 2435 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); 2436 if (!ophdr) 2437 return -EIO; 2438 2439 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2440 sizeof(struct xlog_op_header)); 2441 2442 len += xlog_write_setup_copy(ticket, ophdr, 2443 iclog->ic_size-log_offset, 2444 reg->i_len, 2445 ©_off, ©_len, 2446 &partial_copy, 2447 &partial_copy_len); 2448 xlog_verify_dest_ptr(log, ptr); 2449 2450 /* 2451 * Copy region. 2452 * 2453 * Unmount records just log an opheader, so can have 2454 * empty payloads with no data region to copy. Hence we 2455 * only copy the payload if the vector says it has data 2456 * to copy. 2457 */ 2458 ASSERT(copy_len >= 0); 2459 if (copy_len > 0) { 2460 memcpy(ptr, reg->i_addr + copy_off, copy_len); 2461 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2462 copy_len); 2463 } 2464 copy_len += start_rec_copy + sizeof(xlog_op_header_t); 2465 record_cnt++; 2466 data_cnt += contwr ? copy_len : 0; 2467 2468 error = xlog_write_copy_finish(log, iclog, flags, 2469 &record_cnt, &data_cnt, 2470 &partial_copy, 2471 &partial_copy_len, 2472 log_offset, 2473 commit_iclog); 2474 if (error) 2475 return error; 2476 2477 /* 2478 * if we had a partial copy, we need to get more iclog 2479 * space but we don't want to increment the region 2480 * index because there is still more is this region to 2481 * write. 
2482 * 2483 * If we completed writing this region, and we flushed 2484 * the iclog (indicated by resetting of the record 2485 * count), then we also need to get more log space. If 2486 * this was the last record, though, we are done and 2487 * can just return. 2488 */ 2489 if (partial_copy) 2490 break; 2491 2492 if (++index == lv->lv_niovecs) { 2493 next_lv: 2494 lv = lv->lv_next; 2495 index = 0; 2496 if (lv) 2497 vecp = lv->lv_iovecp; 2498 } 2499 if (record_cnt == 0 && !ordered) { 2500 if (!lv) 2501 return 0; 2502 break; 2503 } 2504 } 2505 } 2506 2507 ASSERT(len == 0); 2508 2509 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 2510 if (!commit_iclog) 2511 return xlog_state_release_iclog(log, iclog); 2512 2513 ASSERT(flags & XLOG_COMMIT_TRANS); 2514 *commit_iclog = iclog; 2515 return 0; 2516 } 2517 2518 2519 /***************************************************************************** 2520 * 2521 * State Machine functions 2522 * 2523 ***************************************************************************** 2524 */ 2525 2526 /* 2527 * An iclog has just finished IO completion processing, so we need to update 2528 * the iclog state and propagate that up into the overall log state. Hence we 2529 * prepare the iclog for cleaning, and then clean all the pending dirty iclogs 2530 * starting from the head, and then wake up any threads that are waiting for the 2531 * iclog to be marked clean. 2532 * 2533 * The ordering of marking iclogs ACTIVE must be maintained, so an iclog 2534 * doesn't become ACTIVE beyond one that is SYNCING. This is also required to 2535 * maintain the notion that we use a ordered wait queue to hold off would be 2536 * writers to the log when every iclog is trying to sync to disk. 2537 * 2538 * Caller must hold the icloglock before calling us. 2539 * 2540 * State Change: !IOERROR -> DIRTY -> ACTIVE 2541 */ 2542 STATIC void 2543 xlog_state_clean_iclog( 2544 struct xlog *log, 2545 struct xlog_in_core *dirty_iclog) 2546 { 2547 struct xlog_in_core *iclog; 2548 int changed = 0; 2549 2550 /* Prepare the completed iclog. */ 2551 if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR)) 2552 dirty_iclog->ic_state = XLOG_STATE_DIRTY; 2553 2554 /* Walk all the iclogs to update the ordered active state. */ 2555 iclog = log->l_iclog; 2556 do { 2557 if (iclog->ic_state == XLOG_STATE_DIRTY) { 2558 iclog->ic_state = XLOG_STATE_ACTIVE; 2559 iclog->ic_offset = 0; 2560 ASSERT(list_empty_careful(&iclog->ic_callbacks)); 2561 /* 2562 * If the number of ops in this iclog indicate it just 2563 * contains the dummy transaction, we can 2564 * change state into IDLE (the second time around). 2565 * Otherwise we should change the state into 2566 * NEED a dummy. 2567 * We don't need to cover the dummy. 2568 */ 2569 if (!changed && 2570 (be32_to_cpu(iclog->ic_header.h_num_logops) == 2571 XLOG_COVER_OPS)) { 2572 changed = 1; 2573 } else { 2574 /* 2575 * We have two dirty iclogs so start over 2576 * This could also be num of ops indicates 2577 * this is not the dummy going out. 2578 */ 2579 changed = 2; 2580 } 2581 iclog->ic_header.h_num_logops = 0; 2582 memset(iclog->ic_header.h_cycle_data, 0, 2583 sizeof(iclog->ic_header.h_cycle_data)); 2584 iclog->ic_header.h_lsn = 0; 2585 } else if (iclog->ic_state == XLOG_STATE_ACTIVE) 2586 /* do nothing */; 2587 else 2588 break; /* stop cleaning */ 2589 iclog = iclog->ic_next; 2590 } while (iclog != log->l_iclog); 2591 2592 2593 /* 2594 * Wake up threads waiting in xfs_log_force() for the dirty iclog 2595 * to be cleaned. 
2596 */ 2597 wake_up_all(&dirty_iclog->ic_force_wait); 2598 2599 /* 2600 * Change state for the dummy log recording. 2601 * We usually go to NEED. But we go to NEED2 if the changed indicates 2602 * we are done writing the dummy record. 2603 * If we are done with the second dummy recored (DONE2), then 2604 * we go to IDLE. 2605 */ 2606 if (changed) { 2607 switch (log->l_covered_state) { 2608 case XLOG_STATE_COVER_IDLE: 2609 case XLOG_STATE_COVER_NEED: 2610 case XLOG_STATE_COVER_NEED2: 2611 log->l_covered_state = XLOG_STATE_COVER_NEED; 2612 break; 2613 2614 case XLOG_STATE_COVER_DONE: 2615 if (changed == 1) 2616 log->l_covered_state = XLOG_STATE_COVER_NEED2; 2617 else 2618 log->l_covered_state = XLOG_STATE_COVER_NEED; 2619 break; 2620 2621 case XLOG_STATE_COVER_DONE2: 2622 if (changed == 1) 2623 log->l_covered_state = XLOG_STATE_COVER_IDLE; 2624 else 2625 log->l_covered_state = XLOG_STATE_COVER_NEED; 2626 break; 2627 2628 default: 2629 ASSERT(0); 2630 } 2631 } 2632 } 2633 2634 STATIC xfs_lsn_t 2635 xlog_get_lowest_lsn( 2636 struct xlog *log) 2637 { 2638 struct xlog_in_core *iclog = log->l_iclog; 2639 xfs_lsn_t lowest_lsn = 0, lsn; 2640 2641 do { 2642 if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) 2643 continue; 2644 2645 lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2646 if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) 2647 lowest_lsn = lsn; 2648 } while ((iclog = iclog->ic_next) != log->l_iclog); 2649 2650 return lowest_lsn; 2651 } 2652 2653 /* 2654 * Completion of a iclog IO does not imply that a transaction has completed, as 2655 * transactions can be large enough to span many iclogs. We cannot change the 2656 * tail of the log half way through a transaction as this may be the only 2657 * transaction in the log and moving the tail to point to the middle of it 2658 * will prevent recovery from finding the start of the transaction. Hence we 2659 * should only update the last_sync_lsn if this iclog contains transaction 2660 * completion callbacks on it. 2661 * 2662 * We have to do this before we drop the icloglock to ensure we are the only one 2663 * that can update it. 2664 * 2665 * If we are moving the last_sync_lsn forwards, we also need to ensure we kick 2666 * the reservation grant head pushing. This is due to the fact that the push 2667 * target is bound by the current last_sync_lsn value. Hence if we have a large 2668 * amount of log space bound up in this committing transaction then the 2669 * last_sync_lsn value may be the limiting factor preventing tail pushing from 2670 * freeing space in the log. Hence once we've updated the last_sync_lsn we 2671 * should push the AIL to ensure the push target (and hence the grant head) is 2672 * no longer bound by the old log head location and can move forwards and make 2673 * progress again. 2674 */ 2675 static void 2676 xlog_state_set_callback( 2677 struct xlog *log, 2678 struct xlog_in_core *iclog, 2679 xfs_lsn_t header_lsn) 2680 { 2681 iclog->ic_state = XLOG_STATE_CALLBACK; 2682 2683 ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), 2684 header_lsn) <= 0); 2685 2686 if (list_empty_careful(&iclog->ic_callbacks)) 2687 return; 2688 2689 atomic64_set(&log->l_last_sync_lsn, header_lsn); 2690 xlog_grant_push_ail(log, 0); 2691 } 2692 2693 /* 2694 * Return true if we need to stop processing, false to continue to the next 2695 * iclog. The caller will need to run callbacks if the iclog is returned in the 2696 * XLOG_STATE_CALLBACK state. 
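 *
 * In short (restating the logic below): completion callbacks must run in LSN
 * order, so if an earlier iclog is still being synced the just-completed
 * iclog is parked in DO_CALLBACK and we return true; the completion of that
 * earlier iclog will come back through here and process it.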
2697 */ 2698 static bool 2699 xlog_state_iodone_process_iclog( 2700 struct xlog *log, 2701 struct xlog_in_core *iclog, 2702 struct xlog_in_core *completed_iclog, 2703 bool *ioerror) 2704 { 2705 xfs_lsn_t lowest_lsn; 2706 xfs_lsn_t header_lsn; 2707 2708 /* Skip all iclogs in the ACTIVE & DIRTY states */ 2709 if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) 2710 return false; 2711 2712 /* 2713 * Between marking a filesystem SHUTDOWN and stopping the log, we do 2714 * flush all iclogs to disk (if there wasn't a log I/O error). So, we do 2715 * want things to go smoothly in case of just a SHUTDOWN w/o a 2716 * LOG_IO_ERROR. 2717 */ 2718 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2719 *ioerror = true; 2720 return false; 2721 } 2722 2723 /* 2724 * Can only perform callbacks in order. Since this iclog is not in the 2725 * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean 2726 * up. If we set our iclog to DO_CALLBACK, we will not process it when 2727 * we retry since a previous iclog is in the CALLBACK and the state 2728 * cannot change since we are holding the l_icloglock. 2729 */ 2730 if (!(iclog->ic_state & 2731 (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) { 2732 if (completed_iclog && 2733 (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) { 2734 completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK; 2735 } 2736 return true; 2737 } 2738 2739 /* 2740 * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC 2741 * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught 2742 * by the above if and are going to clean (i.e. we aren't doing their 2743 * callbacks) see the above if. 2744 * 2745 * We will do one more check here to see if we have chased our tail 2746 * around. If this is not the lowest lsn iclog, then we will leave it 2747 * for another completion to process. 2748 */ 2749 header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2750 lowest_lsn = xlog_get_lowest_lsn(log); 2751 if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) 2752 return false; 2753 2754 xlog_state_set_callback(log, iclog, header_lsn); 2755 return false; 2756 2757 } 2758 2759 /* 2760 * Keep processing entries in the iclog callback list until we come around and 2761 * it is empty. We need to atomically see that the list is empty and change the 2762 * state to DIRTY so that we don't miss any more callbacks being added. 2763 * 2764 * This function is called with the icloglock held and returns with it held. We 2765 * drop it while running callbacks, however, as holding it over thousands of 2766 * callbacks is unnecessary and causes excessive contention if we do. 2767 */ 2768 static void 2769 xlog_state_do_iclog_callbacks( 2770 struct xlog *log, 2771 struct xlog_in_core *iclog, 2772 bool aborted) 2773 { 2774 spin_unlock(&log->l_icloglock); 2775 spin_lock(&iclog->ic_callback_lock); 2776 while (!list_empty(&iclog->ic_callbacks)) { 2777 LIST_HEAD(tmp); 2778 2779 list_splice_init(&iclog->ic_callbacks, &tmp); 2780 2781 spin_unlock(&iclog->ic_callback_lock); 2782 xlog_cil_process_committed(&tmp, aborted); 2783 spin_lock(&iclog->ic_callback_lock); 2784 } 2785 2786 /* 2787 * Pick up the icloglock while still holding the callback lock so we 2788 * serialise against anyone trying to add more callbacks to this iclog 2789 * now we've finished processing. 2790 */ 2791 spin_lock(&log->l_icloglock); 2792 spin_unlock(&iclog->ic_callback_lock); 2793 } 2794 2795 #ifdef DEBUG 2796 /* 2797 * Make one last gasp attempt to see if iclogs are being left in limbo. 
If the 2798 * above loop finds an iclog earlier than the current iclog and in one of the 2799 * syncing states, the current iclog is put into DO_CALLBACK and the callbacks 2800 * are deferred to the completion of the earlier iclog. Walk the iclogs in order 2801 * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in 2802 * one of the syncing states. 2803 * 2804 * Note that SYNCING|IOERROR is a valid state so we cannot just check for 2805 * ic_state == SYNCING. 2806 */ 2807 static void 2808 xlog_state_callback_check_state( 2809 struct xlog *log) 2810 { 2811 struct xlog_in_core *first_iclog = log->l_iclog; 2812 struct xlog_in_core *iclog = first_iclog; 2813 2814 do { 2815 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); 2816 /* 2817 * Terminate the loop if iclogs are found in states 2818 * which will cause other threads to clean up iclogs. 2819 * 2820 * SYNCING - i/o completion will go through logs 2821 * DONE_SYNC - interrupt thread should be waiting for 2822 * l_icloglock 2823 * IOERROR - give up hope all ye who enter here 2824 */ 2825 if (iclog->ic_state == XLOG_STATE_WANT_SYNC || 2826 iclog->ic_state & XLOG_STATE_SYNCING || 2827 iclog->ic_state == XLOG_STATE_DONE_SYNC || 2828 iclog->ic_state == XLOG_STATE_IOERROR ) 2829 break; 2830 iclog = iclog->ic_next; 2831 } while (first_iclog != iclog); 2832 } 2833 #else 2834 #define xlog_state_callback_check_state(l) ((void)0) 2835 #endif 2836 2837 STATIC void 2838 xlog_state_do_callback( 2839 struct xlog *log, 2840 bool aborted, 2841 struct xlog_in_core *ciclog) 2842 { 2843 struct xlog_in_core *iclog; 2844 struct xlog_in_core *first_iclog; 2845 bool did_callbacks = false; 2846 bool cycled_icloglock; 2847 bool ioerror; 2848 int flushcnt = 0; 2849 int repeats = 0; 2850 2851 spin_lock(&log->l_icloglock); 2852 do { 2853 /* 2854 * Scan all iclogs starting with the one pointed to by the 2855 * log. Reset this starting point each time the log is 2856 * unlocked (during callbacks). 2857 * 2858 * Keep looping through iclogs until one full pass is made 2859 * without running any callbacks. 2860 */ 2861 first_iclog = log->l_iclog; 2862 iclog = log->l_iclog; 2863 cycled_icloglock = false; 2864 ioerror = false; 2865 repeats++; 2866 2867 do { 2868 if (xlog_state_iodone_process_iclog(log, iclog, 2869 ciclog, &ioerror)) 2870 break; 2871 2872 if (!(iclog->ic_state & 2873 (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) { 2874 iclog = iclog->ic_next; 2875 continue; 2876 } 2877 2878 /* 2879 * Running callbacks will drop the icloglock which means 2880 * we'll have to run at least one more complete loop. 2881 */ 2882 cycled_icloglock = true; 2883 xlog_state_do_iclog_callbacks(log, iclog, aborted); 2884 2885 xlog_state_clean_iclog(log, iclog); 2886 iclog = iclog->ic_next; 2887 } while (first_iclog != iclog); 2888 2889 did_callbacks |= cycled_icloglock; 2890 2891 if (repeats > 5000) { 2892 flushcnt += repeats; 2893 repeats = 0; 2894 xfs_warn(log->l_mp, 2895 "%s: possible infinite loop (%d iterations)", 2896 __func__, flushcnt); 2897 } 2898 } while (!ioerror && cycled_icloglock); 2899 2900 if (did_callbacks) 2901 xlog_state_callback_check_state(log); 2902 2903 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) 2904 wake_up_all(&log->l_flush_wait); 2905 2906 spin_unlock(&log->l_icloglock); 2907 } 2908 2909 2910 /* 2911 * Finish transitioning this iclog to the dirty state. 2912 * 2913 * Make sure that we completely execute this routine only when this is 2914 * the last call to the iclog. 
There is a good chance that iclog flushes, 2915 * when we reach the end of the physical log, get turned into 2 separate 2916 * calls to bwrite. Hence, one iclog flush could generate two calls to this 2917 * routine. By using the reference count bwritecnt, we guarantee that only 2918 * the second completion goes through. 2919 * 2920 * Callbacks could take time, so they are done outside the scope of the 2921 * global state machine log lock. 2922 */ 2923 STATIC void 2924 xlog_state_done_syncing( 2925 struct xlog_in_core *iclog, 2926 bool aborted) 2927 { 2928 struct xlog *log = iclog->ic_log; 2929 2930 spin_lock(&log->l_icloglock); 2931 2932 ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || 2933 iclog->ic_state == XLOG_STATE_IOERROR); 2934 ASSERT(atomic_read(&iclog->ic_refcnt) == 0); 2935 2936 /* 2937 * If we got an error, either on the first buffer, or in the case of 2938 * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, 2939 * and none should ever be attempted to be written to disk 2940 * again. 2941 */ 2942 if (iclog->ic_state != XLOG_STATE_IOERROR) 2943 iclog->ic_state = XLOG_STATE_DONE_SYNC; 2944 2945 /* 2946 * Someone could be sleeping prior to writing out the next 2947 * iclog buffer, we wake them all, one will get to do the 2948 * I/O, the others get to wait for the result. 2949 */ 2950 wake_up_all(&iclog->ic_write_wait); 2951 spin_unlock(&log->l_icloglock); 2952 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ 2953 } /* xlog_state_done_syncing */ 2954 2955 2956 /* 2957 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must 2958 * sleep. We wait on the flush queue on the head iclog as that should be 2959 * the first iclog to complete flushing. Hence if all iclogs are syncing, 2960 * we will wait here and all new writes will sleep until a sync completes. 2961 * 2962 * The in-core logs are used in a circular fashion. They are not used 2963 * out-of-order even when an iclog past the head is free. 2964 * 2965 * return: 2966 * * log_offset where xlog_write() can start writing into the in-core 2967 * log's data space. 2968 * * in-core log pointer to which xlog_write() should write. 2969 * * boolean indicating this is a continued write to an in-core log. 2970 * If this is the last write, then the in-core log's offset field 2971 * needs to be incremented, depending on the amount of data which 2972 * is copied. 2973 */ 2974 STATIC int 2975 xlog_state_get_iclog_space( 2976 struct xlog *log, 2977 int len, 2978 struct xlog_in_core **iclogp, 2979 struct xlog_ticket *ticket, 2980 int *continued_write, 2981 int *logoffsetp) 2982 { 2983 int log_offset; 2984 xlog_rec_header_t *head; 2985 xlog_in_core_t *iclog; 2986 int error; 2987 2988 restart: 2989 spin_lock(&log->l_icloglock); 2990 if (XLOG_FORCED_SHUTDOWN(log)) { 2991 spin_unlock(&log->l_icloglock); 2992 return -EIO; 2993 } 2994 2995 iclog = log->l_iclog; 2996 if (iclog->ic_state != XLOG_STATE_ACTIVE) { 2997 XFS_STATS_INC(log->l_mp, xs_log_noiclogs); 2998 2999 /* Wait for log writes to have flushed */ 3000 xlog_wait(&log->l_flush_wait, &log->l_icloglock); 3001 goto restart; 3002 } 3003 3004 head = &iclog->ic_header; 3005 3006 atomic_inc(&iclog->ic_refcnt); /* prevents sync */ 3007 log_offset = iclog->ic_offset; 3008 3009 /* On the 1st write to an iclog, figure out lsn. This works 3010 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are 3011 * committing to. If the offset is set, that's how many blocks 3012 * must be written. 
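 *
 * Note (restating the accounting below): the first writer into an empty
 * iclog also pays for the log record header itself - l_iclog_hsize bytes are
 * charged to its ticket - which is why xfs_log_calc_unit_res() includes
 * iclog header space in every unit reservation.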
3013 */ 3014 if (log_offset == 0) { 3015 ticket->t_curr_res -= log->l_iclog_hsize; 3016 xlog_tic_add_region(ticket, 3017 log->l_iclog_hsize, 3018 XLOG_REG_TYPE_LRHEADER); 3019 head->h_cycle = cpu_to_be32(log->l_curr_cycle); 3020 head->h_lsn = cpu_to_be64( 3021 xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); 3022 ASSERT(log->l_curr_block >= 0); 3023 } 3024 3025 /* If there is enough room to write everything, then do it. Otherwise, 3026 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC 3027 * bit is on, so this will get flushed out. Don't update ic_offset 3028 * until you know exactly how many bytes get copied. Therefore, wait 3029 * until later to update ic_offset. 3030 * 3031 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's 3032 * can fit into remaining data section. 3033 */ 3034 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { 3035 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 3036 3037 /* 3038 * If I'm the only one writing to this iclog, sync it to disk. 3039 * We need to do an atomic compare and decrement here to avoid 3040 * racing with concurrent atomic_dec_and_lock() calls in 3041 * xlog_state_release_iclog() when there is more than one 3042 * reference to the iclog. 3043 */ 3044 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { 3045 /* we are the only one */ 3046 spin_unlock(&log->l_icloglock); 3047 error = xlog_state_release_iclog(log, iclog); 3048 if (error) 3049 return error; 3050 } else { 3051 spin_unlock(&log->l_icloglock); 3052 } 3053 goto restart; 3054 } 3055 3056 /* Do we have enough room to write the full amount in the remainder 3057 * of this iclog? Or must we continue a write on the next iclog and 3058 * mark this iclog as completely taken? In the case where we switch 3059 * iclogs (to mark it taken), this particular iclog will release/sync 3060 * to disk in xlog_write(). 3061 */ 3062 if (len <= iclog->ic_size - iclog->ic_offset) { 3063 *continued_write = 0; 3064 iclog->ic_offset += len; 3065 } else { 3066 *continued_write = 1; 3067 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 3068 } 3069 *iclogp = iclog; 3070 3071 ASSERT(iclog->ic_offset <= iclog->ic_size); 3072 spin_unlock(&log->l_icloglock); 3073 3074 *logoffsetp = log_offset; 3075 return 0; 3076 } /* xlog_state_get_iclog_space */ 3077 3078 /* The first cnt-1 times through here we don't need to 3079 * move the grant write head because the permanent 3080 * reservation has reserved cnt times the unit amount. 3081 * Release part of current permanent unit reservation and 3082 * reset current reservation to be one units worth. Also 3083 * move grant reservation head forward. 
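 *
 * For example (illustrative only): a permanent ticket created with cnt = 3
 * can pass through here twice consuming nothing but its pre-reserved space;
 * only on the third pass, once t_cnt has dropped to zero, does the code
 * below move the reserve grant head forward by a full unit again.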
3084 */ 3085 STATIC void 3086 xlog_regrant_reserve_log_space( 3087 struct xlog *log, 3088 struct xlog_ticket *ticket) 3089 { 3090 trace_xfs_log_regrant_reserve_enter(log, ticket); 3091 3092 if (ticket->t_cnt > 0) 3093 ticket->t_cnt--; 3094 3095 xlog_grant_sub_space(log, &log->l_reserve_head.grant, 3096 ticket->t_curr_res); 3097 xlog_grant_sub_space(log, &log->l_write_head.grant, 3098 ticket->t_curr_res); 3099 ticket->t_curr_res = ticket->t_unit_res; 3100 xlog_tic_reset_res(ticket); 3101 3102 trace_xfs_log_regrant_reserve_sub(log, ticket); 3103 3104 /* just return if we still have some of the pre-reserved space */ 3105 if (ticket->t_cnt > 0) 3106 return; 3107 3108 xlog_grant_add_space(log, &log->l_reserve_head.grant, 3109 ticket->t_unit_res); 3110 3111 trace_xfs_log_regrant_reserve_exit(log, ticket); 3112 3113 ticket->t_curr_res = ticket->t_unit_res; 3114 xlog_tic_reset_res(ticket); 3115 } /* xlog_regrant_reserve_log_space */ 3116 3117 3118 /* 3119 * Give back the space left from a reservation. 3120 * 3121 * All the information we need to make a correct determination of space left 3122 * is present. For non-permanent reservations, things are quite easy. The 3123 * count should have been decremented to zero. We only need to deal with the 3124 * space remaining in the current reservation part of the ticket. If the 3125 * ticket contains a permanent reservation, there may be left over space which 3126 * needs to be released. A count of N means that N-1 refills of the current 3127 * reservation can be done before we need to ask for more space. The first 3128 * one goes to fill up the first current reservation. Once we run out of 3129 * space, the count will stay at zero and the only space remaining will be 3130 * in the current reservation field. 3131 */ 3132 STATIC void 3133 xlog_ungrant_log_space( 3134 struct xlog *log, 3135 struct xlog_ticket *ticket) 3136 { 3137 int bytes; 3138 3139 if (ticket->t_cnt > 0) 3140 ticket->t_cnt--; 3141 3142 trace_xfs_log_ungrant_enter(log, ticket); 3143 trace_xfs_log_ungrant_sub(log, ticket); 3144 3145 /* 3146 * If this is a permanent reservation ticket, we may be able to free 3147 * up more space based on the remaining count. 3148 */ 3149 bytes = ticket->t_curr_res; 3150 if (ticket->t_cnt > 0) { 3151 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); 3152 bytes += ticket->t_unit_res*ticket->t_cnt; 3153 } 3154 3155 xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); 3156 xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); 3157 3158 trace_xfs_log_ungrant_exit(log, ticket); 3159 3160 xfs_log_space_wake(log->l_mp); 3161 } 3162 3163 /* 3164 * Flush iclog to disk if this is the last reference to the given iclog and 3165 * the WANT_SYNC bit is set. 3166 * 3167 * When this function is entered, the iclog is not necessarily in the 3168 * WANT_SYNC state. It may be sitting around waiting to get filled. 3169 * 3170 * 3171 */ 3172 STATIC int 3173 xlog_state_release_iclog( 3174 struct xlog *log, 3175 struct xlog_in_core *iclog) 3176 { 3177 int sync = 0; /* do we sync? 
 */

	if (iclog->ic_state & XLOG_STATE_IOERROR)
		return -EIO;

	ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
	if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
		return 0;

	if (iclog->ic_state & XLOG_STATE_IOERROR) {
		spin_unlock(&log->l_icloglock);
		return -EIO;
	}
	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
	       iclog->ic_state == XLOG_STATE_WANT_SYNC);

	if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
		/* update tail before writing to iclog */
		xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
		sync++;
		iclog->ic_state = XLOG_STATE_SYNCING;
		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
		xlog_verify_tail_lsn(log, iclog, tail_lsn);
		/* cycle incremented when incrementing curr_block */
	}
	spin_unlock(&log->l_icloglock);

	/*
	 * We let the log lock go, so it's possible that we hit a log I/O
	 * error or some other SHUTDOWN condition that marks the iclog
	 * as XLOG_STATE_IOERROR before the bwrite. However, we know that
	 * this iclog has consistent data, so we ignore IOERROR
	 * flags after this point.
	 */
	if (sync)
		xlog_sync(log, iclog);
	return 0;
}	/* xlog_state_release_iclog */


/*
 * This routine will mark the current iclog in the ring as WANT_SYNC
 * and move the current iclog pointer to the next iclog in the ring.
 * When this routine is called from xlog_state_get_iclog_space(), the
 * exact size of the iclog has not yet been determined; all we know is
 * that we have run out of space in this log record.
 */
STATIC void
xlog_state_switch_iclogs(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	int			eventual_size)
{
	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
	if (!eventual_size)
		eventual_size = iclog->ic_offset;
	iclog->ic_state = XLOG_STATE_WANT_SYNC;
	iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
	log->l_prev_block = log->l_curr_block;
	log->l_prev_cycle = log->l_curr_cycle;

	/* roll log?: ic_offset changed later */
	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);

	/* Round up to next log-sunit */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
	    log->l_mp->m_sb.sb_logsunit > 1) {
		uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
		log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
	}

	if (log->l_curr_block >= log->l_logBBsize) {
		/*
		 * Rewind the current block before the cycle is bumped to make
		 * sure that the combined LSN never transiently moves forward
		 * when the log wraps to the next cycle. This is to support the
		 * unlocked sample of these fields from xlog_valid_lsn(). Most
		 * other cases should acquire l_icloglock.
		 */
		log->l_curr_block -= log->l_logBBsize;
		ASSERT(log->l_curr_block >= 0);
		smp_wmb();
		log->l_curr_cycle++;
		if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
			log->l_curr_cycle++;
	}
	ASSERT(iclog == log->l_iclog);
	log->l_iclog = iclog->ic_next;
}	/* xlog_state_switch_iclogs */

/*
 * Write out all data in the in-core log as of this exact moment in time.
 *
 * Data may be written to the in-core log during this call. However,
 * we don't guarantee this data will be written out. A change from past
 * implementation means this routine will *not* write out zero length LRs.
3273 * 3274 * Basically, we try and perform an intelligent scan of the in-core logs. 3275 * If we determine there is no flushable data, we just return. There is no 3276 * flushable data if: 3277 * 3278 * 1. the current iclog is active and has no data; the previous iclog 3279 * is in the active or dirty state. 3280 * 2. the current iclog is drity, and the previous iclog is in the 3281 * active or dirty state. 3282 * 3283 * We may sleep if: 3284 * 3285 * 1. the current iclog is not in the active nor dirty state. 3286 * 2. the current iclog dirty, and the previous iclog is not in the 3287 * active nor dirty state. 3288 * 3. the current iclog is active, and there is another thread writing 3289 * to this particular iclog. 3290 * 4. a) the current iclog is active and has no other writers 3291 * b) when we return from flushing out this iclog, it is still 3292 * not in the active nor dirty state. 3293 */ 3294 int 3295 xfs_log_force( 3296 struct xfs_mount *mp, 3297 uint flags) 3298 { 3299 struct xlog *log = mp->m_log; 3300 struct xlog_in_core *iclog; 3301 xfs_lsn_t lsn; 3302 3303 XFS_STATS_INC(mp, xs_log_force); 3304 trace_xfs_log_force(mp, 0, _RET_IP_); 3305 3306 xlog_cil_force(log); 3307 3308 spin_lock(&log->l_icloglock); 3309 iclog = log->l_iclog; 3310 if (iclog->ic_state & XLOG_STATE_IOERROR) 3311 goto out_error; 3312 3313 if (iclog->ic_state == XLOG_STATE_DIRTY || 3314 (iclog->ic_state == XLOG_STATE_ACTIVE && 3315 atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { 3316 /* 3317 * If the head is dirty or (active and empty), then we need to 3318 * look at the previous iclog. 3319 * 3320 * If the previous iclog is active or dirty we are done. There 3321 * is nothing to sync out. Otherwise, we attach ourselves to the 3322 * previous iclog and go to sleep. 3323 */ 3324 iclog = iclog->ic_prev; 3325 if (iclog->ic_state == XLOG_STATE_ACTIVE || 3326 iclog->ic_state == XLOG_STATE_DIRTY) 3327 goto out_unlock; 3328 } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3329 if (atomic_read(&iclog->ic_refcnt) == 0) { 3330 /* 3331 * We are the only one with access to this iclog. 3332 * 3333 * Flush it out now. There should be a roundoff of zero 3334 * to show that someone has already taken care of the 3335 * roundoff from the previous sync. 3336 */ 3337 atomic_inc(&iclog->ic_refcnt); 3338 lsn = be64_to_cpu(iclog->ic_header.h_lsn); 3339 xlog_state_switch_iclogs(log, iclog, 0); 3340 spin_unlock(&log->l_icloglock); 3341 3342 if (xlog_state_release_iclog(log, iclog)) 3343 return -EIO; 3344 3345 spin_lock(&log->l_icloglock); 3346 if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || 3347 iclog->ic_state == XLOG_STATE_DIRTY) 3348 goto out_unlock; 3349 } else { 3350 /* 3351 * Someone else is writing to this iclog. 3352 * 3353 * Use its call to flush out the data. However, the 3354 * other thread may not force out this LR, so we mark 3355 * it WANT_SYNC. 3356 */ 3357 xlog_state_switch_iclogs(log, iclog, 0); 3358 } 3359 } else { 3360 /* 3361 * If the head iclog is not active nor dirty, we just attach 3362 * ourselves to the head and go to sleep if necessary. 
3363 */ 3364 ; 3365 } 3366 3367 if (!(flags & XFS_LOG_SYNC)) 3368 goto out_unlock; 3369 3370 if (iclog->ic_state & XLOG_STATE_IOERROR) 3371 goto out_error; 3372 XFS_STATS_INC(mp, xs_log_force_sleep); 3373 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 3374 if (iclog->ic_state & XLOG_STATE_IOERROR) 3375 return -EIO; 3376 return 0; 3377 3378 out_unlock: 3379 spin_unlock(&log->l_icloglock); 3380 return 0; 3381 out_error: 3382 spin_unlock(&log->l_icloglock); 3383 return -EIO; 3384 } 3385 3386 static int 3387 __xfs_log_force_lsn( 3388 struct xfs_mount *mp, 3389 xfs_lsn_t lsn, 3390 uint flags, 3391 int *log_flushed, 3392 bool already_slept) 3393 { 3394 struct xlog *log = mp->m_log; 3395 struct xlog_in_core *iclog; 3396 3397 spin_lock(&log->l_icloglock); 3398 iclog = log->l_iclog; 3399 if (iclog->ic_state & XLOG_STATE_IOERROR) 3400 goto out_error; 3401 3402 while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { 3403 iclog = iclog->ic_next; 3404 if (iclog == log->l_iclog) 3405 goto out_unlock; 3406 } 3407 3408 if (iclog->ic_state == XLOG_STATE_DIRTY) 3409 goto out_unlock; 3410 3411 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3412 /* 3413 * We sleep here if we haven't already slept (e.g. this is the 3414 * first time we've looked at the correct iclog buf) and the 3415 * buffer before us is going to be sync'ed. The reason for this 3416 * is that if we are doing sync transactions here, by waiting 3417 * for the previous I/O to complete, we can allow a few more 3418 * transactions into this iclog before we close it down. 3419 * 3420 * Otherwise, we mark the buffer WANT_SYNC, and bump up the 3421 * refcnt so we can release the log (which drops the ref count). 3422 * The state switch keeps new transaction commits from using 3423 * this buffer. When the current commits finish writing into 3424 * the buffer, the refcount will drop to zero and the buffer 3425 * will go out then. 3426 */ 3427 if (!already_slept && 3428 (iclog->ic_prev->ic_state & 3429 (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { 3430 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); 3431 3432 XFS_STATS_INC(mp, xs_log_force_sleep); 3433 3434 xlog_wait(&iclog->ic_prev->ic_write_wait, 3435 &log->l_icloglock); 3436 return -EAGAIN; 3437 } 3438 atomic_inc(&iclog->ic_refcnt); 3439 xlog_state_switch_iclogs(log, iclog, 0); 3440 spin_unlock(&log->l_icloglock); 3441 if (xlog_state_release_iclog(log, iclog)) 3442 return -EIO; 3443 if (log_flushed) 3444 *log_flushed = 1; 3445 spin_lock(&log->l_icloglock); 3446 } 3447 3448 if (!(flags & XFS_LOG_SYNC) || 3449 (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) 3450 goto out_unlock; 3451 3452 if (iclog->ic_state & XLOG_STATE_IOERROR) 3453 goto out_error; 3454 3455 XFS_STATS_INC(mp, xs_log_force_sleep); 3456 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 3457 if (iclog->ic_state & XLOG_STATE_IOERROR) 3458 return -EIO; 3459 return 0; 3460 3461 out_unlock: 3462 spin_unlock(&log->l_icloglock); 3463 return 0; 3464 out_error: 3465 spin_unlock(&log->l_icloglock); 3466 return -EIO; 3467 } 3468 3469 /* 3470 * Force the in-core log to disk for a specific LSN. 3471 * 3472 * Find in-core log with lsn. 3473 * If it is in the DIRTY state, just return. 3474 * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC 3475 * state and go to sleep or return. 3476 * If it is in any other state, go to sleep or return. 3477 * 3478 * Synchronous forces are implemented with a wait queue. 
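 *
 * (Implementation note, summarising the helper above: __xfs_log_force_lsn()
 * returns -EAGAIN after it has slept waiting for the previous iclog's write
 * to complete, and xfs_log_force_lsn() below retries once with already_slept
 * set so that we do not wait on the previous iclog twice.)
 *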
All callers trying 3479 * to force a given lsn to disk must wait on the queue attached to the 3480 * specific in-core log. When given in-core log finally completes its write 3481 * to disk, that thread will wake up all threads waiting on the queue. 3482 */ 3483 int 3484 xfs_log_force_lsn( 3485 struct xfs_mount *mp, 3486 xfs_lsn_t lsn, 3487 uint flags, 3488 int *log_flushed) 3489 { 3490 int ret; 3491 ASSERT(lsn != 0); 3492 3493 XFS_STATS_INC(mp, xs_log_force); 3494 trace_xfs_log_force(mp, lsn, _RET_IP_); 3495 3496 lsn = xlog_cil_force_lsn(mp->m_log, lsn); 3497 if (lsn == NULLCOMMITLSN) 3498 return 0; 3499 3500 ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); 3501 if (ret == -EAGAIN) 3502 ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); 3503 return ret; 3504 } 3505 3506 /* 3507 * Called when we want to mark the current iclog as being ready to sync to 3508 * disk. 3509 */ 3510 STATIC void 3511 xlog_state_want_sync( 3512 struct xlog *log, 3513 struct xlog_in_core *iclog) 3514 { 3515 assert_spin_locked(&log->l_icloglock); 3516 3517 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3518 xlog_state_switch_iclogs(log, iclog, 0); 3519 } else { 3520 ASSERT(iclog->ic_state & 3521 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); 3522 } 3523 } 3524 3525 3526 /***************************************************************************** 3527 * 3528 * TICKET functions 3529 * 3530 ***************************************************************************** 3531 */ 3532 3533 /* 3534 * Free a used ticket when its refcount falls to zero. 3535 */ 3536 void 3537 xfs_log_ticket_put( 3538 xlog_ticket_t *ticket) 3539 { 3540 ASSERT(atomic_read(&ticket->t_ref) > 0); 3541 if (atomic_dec_and_test(&ticket->t_ref)) 3542 kmem_zone_free(xfs_log_ticket_zone, ticket); 3543 } 3544 3545 xlog_ticket_t * 3546 xfs_log_ticket_get( 3547 xlog_ticket_t *ticket) 3548 { 3549 ASSERT(atomic_read(&ticket->t_ref) > 0); 3550 atomic_inc(&ticket->t_ref); 3551 return ticket; 3552 } 3553 3554 /* 3555 * Figure out the total log space unit (in bytes) that would be 3556 * required for a log ticket. 3557 */ 3558 int 3559 xfs_log_calc_unit_res( 3560 struct xfs_mount *mp, 3561 int unit_bytes) 3562 { 3563 struct xlog *log = mp->m_log; 3564 int iclog_space; 3565 uint num_headers; 3566 3567 /* 3568 * Permanent reservations have up to 'cnt'-1 active log operations 3569 * in the log. A unit in this case is the amount of space for one 3570 * of these log operations. Normal reservations have a cnt of 1 3571 * and their unit amount is the total amount of space required. 3572 * 3573 * The following lines of code account for non-transaction data 3574 * which occupy space in the on-disk log. 3575 * 3576 * Normal form of a transaction is: 3577 * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph> 3578 * and then there are LR hdrs, split-recs and roundoff at end of syncs. 3579 * 3580 * We need to account for all the leadup data and trailer data 3581 * around the transaction data. 3582 * And then we need to account for the worst case in terms of using 3583 * more space. 3584 * The worst case will happen if: 3585 * - the placement of the transaction happens to be such that the 3586 * roundoff is at its maximum 3587 * - the transaction data is synced before the commit record is synced 3588 * i.e. <transaction-data><roundoff> | <commit-rec><roundoff> 3589 * Therefore the commit record is in its own Log Record. 3590 * This can happen as the commit record is called with its 3591 * own region to xlog_write(). 
3592 * This then means that in the worst case, roundoff can happen for 3593 * the commit-rec as well. 3594 * The commit-rec is smaller than padding in this scenario and so it is 3595 * not added separately. 3596 */ 3597 3598 /* for trans header */ 3599 unit_bytes += sizeof(xlog_op_header_t); 3600 unit_bytes += sizeof(xfs_trans_header_t); 3601 3602 /* for start-rec */ 3603 unit_bytes += sizeof(xlog_op_header_t); 3604 3605 /* 3606 * for LR headers - the space for data in an iclog is the size minus 3607 * the space used for the headers. If we use the iclog size, then we 3608 * undercalculate the number of headers required. 3609 * 3610 * Furthermore - the addition of op headers for split-recs might 3611 * increase the space required enough to require more log and op 3612 * headers, so take that into account too. 3613 * 3614 * IMPORTANT: This reservation makes the assumption that if this 3615 * transaction is the first in an iclog and hence has the LR headers 3616 * accounted to it, then the remaining space in the iclog is 3617 * exclusively for this transaction. i.e. if the transaction is larger 3618 * than the iclog, it will be the only thing in that iclog. 3619 * Fundamentally, this means we must pass the entire log vector to 3620 * xlog_write to guarantee this. 3621 */ 3622 iclog_space = log->l_iclog_size - log->l_iclog_hsize; 3623 num_headers = howmany(unit_bytes, iclog_space); 3624 3625 /* for split-recs - ophdrs added when data split over LRs */ 3626 unit_bytes += sizeof(xlog_op_header_t) * num_headers; 3627 3628 /* add extra header reservations if we overrun */ 3629 while (!num_headers || 3630 howmany(unit_bytes, iclog_space) > num_headers) { 3631 unit_bytes += sizeof(xlog_op_header_t); 3632 num_headers++; 3633 } 3634 unit_bytes += log->l_iclog_hsize * num_headers; 3635 3636 /* for commit-rec LR header - note: padding will subsume the ophdr */ 3637 unit_bytes += log->l_iclog_hsize; 3638 3639 /* for roundoff padding for transaction data and one for commit record */ 3640 if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { 3641 /* log su roundoff */ 3642 unit_bytes += 2 * mp->m_sb.sb_logsunit; 3643 } else { 3644 /* BB roundoff */ 3645 unit_bytes += 2 * BBSIZE; 3646 } 3647 3648 return unit_bytes; 3649 } 3650 3651 /* 3652 * Allocate and initialise a new log ticket. 3653 */ 3654 struct xlog_ticket * 3655 xlog_ticket_alloc( 3656 struct xlog *log, 3657 int unit_bytes, 3658 int cnt, 3659 char client, 3660 bool permanent, 3661 xfs_km_flags_t alloc_flags) 3662 { 3663 struct xlog_ticket *tic; 3664 int unit_res; 3665 3666 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); 3667 if (!tic) 3668 return NULL; 3669 3670 unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); 3671 3672 atomic_set(&tic->t_ref, 1); 3673 tic->t_task = current; 3674 INIT_LIST_HEAD(&tic->t_queue); 3675 tic->t_unit_res = unit_res; 3676 tic->t_curr_res = unit_res; 3677 tic->t_cnt = cnt; 3678 tic->t_ocnt = cnt; 3679 tic->t_tid = prandom_u32(); 3680 tic->t_clientid = client; 3681 tic->t_flags = XLOG_TIC_INITED; 3682 if (permanent) 3683 tic->t_flags |= XLOG_TIC_PERM_RESERV; 3684 3685 xlog_tic_reset_res(tic); 3686 3687 return tic; 3688 } 3689 3690 3691 /****************************************************************************** 3692 * 3693 * Log debug routines 3694 * 3695 ****************************************************************************** 3696 */ 3697 #if defined(DEBUG) 3698 /* 3699 * Make sure that the destination ptr is within the valid data region of 3700 * one of the iclogs. 
This uses backup pointers stored in a different 3701 * part of the log in case we trash the log structure. 3702 */ 3703 STATIC void 3704 xlog_verify_dest_ptr( 3705 struct xlog *log, 3706 void *ptr) 3707 { 3708 int i; 3709 int good_ptr = 0; 3710 3711 for (i = 0; i < log->l_iclog_bufs; i++) { 3712 if (ptr >= log->l_iclog_bak[i] && 3713 ptr <= log->l_iclog_bak[i] + log->l_iclog_size) 3714 good_ptr++; 3715 } 3716 3717 if (!good_ptr) 3718 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); 3719 } 3720 3721 /* 3722 * Check to make sure the grant write head didn't just over lap the tail. If 3723 * the cycles are the same, we can't be overlapping. Otherwise, make sure that 3724 * the cycles differ by exactly one and check the byte count. 3725 * 3726 * This check is run unlocked, so can give false positives. Rather than assert 3727 * on failures, use a warn-once flag and a panic tag to allow the admin to 3728 * determine if they want to panic the machine when such an error occurs. For 3729 * debug kernels this will have the same effect as using an assert but, unlinke 3730 * an assert, it can be turned off at runtime. 3731 */ 3732 STATIC void 3733 xlog_verify_grant_tail( 3734 struct xlog *log) 3735 { 3736 int tail_cycle, tail_blocks; 3737 int cycle, space; 3738 3739 xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); 3740 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); 3741 if (tail_cycle != cycle) { 3742 if (cycle - 1 != tail_cycle && 3743 !(log->l_flags & XLOG_TAIL_WARN)) { 3744 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, 3745 "%s: cycle - 1 != tail_cycle", __func__); 3746 log->l_flags |= XLOG_TAIL_WARN; 3747 } 3748 3749 if (space > BBTOB(tail_blocks) && 3750 !(log->l_flags & XLOG_TAIL_WARN)) { 3751 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, 3752 "%s: space > BBTOB(tail_blocks)", __func__); 3753 log->l_flags |= XLOG_TAIL_WARN; 3754 } 3755 } 3756 } 3757 3758 /* check if it will fit */ 3759 STATIC void 3760 xlog_verify_tail_lsn( 3761 struct xlog *log, 3762 struct xlog_in_core *iclog, 3763 xfs_lsn_t tail_lsn) 3764 { 3765 int blocks; 3766 3767 if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { 3768 blocks = 3769 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3770 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3771 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); 3772 } else { 3773 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3774 3775 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3776 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); 3777 3778 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3779 if (blocks < BTOBB(iclog->ic_offset) + 1) 3780 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); 3781 } 3782 } /* xlog_verify_tail_lsn */ 3783 3784 /* 3785 * Perform a number of checks on the iclog before writing to disk. 3786 * 3787 * 1. Make sure the iclogs are still circular 3788 * 2. Make sure we have a good magic number 3789 * 3. Make sure we don't have magic numbers in the data 3790 * 4. Check fields of each log operation header for: 3791 * A. Valid client identifier 3792 * B. tid ptr value falls in valid ptr space (user space code) 3793 * C. Length in log record header is correct according to the 3794 * individual operation headers within record. 3795 * 5. When a bwrite will occur within 5 blocks of the front of the physical 3796 * log, check the preceding blocks of the physical log to make sure all 3797 * the cycle numbers agree with the current cycle number. 
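 *
 * (A note on the field checks below: by the time this runs, xlog_pack_data()
 * has already saved the first word of every 512 byte block into the record
 * header's cycle data and replaced it with the cycle number, so when a field
 * we want to inspect happens to sit at the start of a block we read it back
 * from h_cycle_data (or the extended headers) rather than from the data area
 * itself.)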

/*
 * Perform a number of checks on the iclog before writing to disk.
 *
 * 1. Make sure the iclogs are still circular
 * 2. Make sure we have a good magic number
 * 3. Make sure we don't have magic numbers in the data
 * 4. Check fields of each log operation header for:
 *	A. Valid client identifier
 *	B. tid ptr value falls in valid ptr space (user space code)
 *	C. Length in log record header is correct according to the
 *	   individual operation headers within record.
 * 5. When a bwrite will occur within 5 blocks of the front of the physical
 *    log, check the preceding blocks of the physical log to make sure all
 *    the cycle numbers agree with the current cycle number.
 */
STATIC void
xlog_verify_iclog(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	int			count)
{
	xlog_op_header_t	*ophead;
	xlog_in_core_t		*icptr;
	xlog_in_core_2_t	*xhdr;
	void			*base_ptr, *ptr, *p;
	ptrdiff_t		field_offset;
	uint8_t			clientid;
	int			len, i, j, k, op_len;
	int			idx;

	/* check validity of iclog pointers */
	spin_lock(&log->l_icloglock);
	icptr = log->l_iclog;
	for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
		ASSERT(icptr);

	if (icptr != log->l_iclog)
		xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
	spin_unlock(&log->l_icloglock);

	/* check log magic numbers */
	if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
		xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);

	base_ptr = ptr = &iclog->ic_header;
	p = &iclog->ic_header;
	for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
		if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
			xfs_emerg(log->l_mp, "%s: unexpected magic num",
				__func__);
	}

	/* check fields */
	len = be32_to_cpu(iclog->ic_header.h_num_logops);
	base_ptr = ptr = iclog->ic_datap;
	ophead = ptr;
	xhdr = iclog->ic_data;
	for (i = 0; i < len; i++) {
		ophead = ptr;

		/* clientid is only 1 byte */
		p = &ophead->oh_clientid;
		field_offset = p - base_ptr;
		if (field_offset & 0x1ff) {
			clientid = ophead->oh_clientid;
		} else {
			idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				clientid = xlog_get_client_id(
					xhdr[j].hic_xheader.xh_cycle_data[k]);
			} else {
				clientid = xlog_get_client_id(
					iclog->ic_header.h_cycle_data[idx]);
			}
		}
		if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
			xfs_warn(log->l_mp,
				"%s: invalid clientid %d op "PTR_FMT" offset 0x%lx",
				__func__, clientid, ophead,
				(unsigned long)field_offset);

		/* check length */
		p = &ophead->oh_len;
		field_offset = p - base_ptr;
		if (field_offset & 0x1ff) {
			op_len = be32_to_cpu(ophead->oh_len);
		} else {
			idx = BTOBBT((uintptr_t)&ophead->oh_len -
				     (uintptr_t)iclog->ic_datap);
			if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
				op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
			} else {
				op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
			}
		}
		ptr += sizeof(xlog_op_header_t) + op_len;
	}
}	/* xlog_verify_iclog */
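
/*
 * Worked example of the cycle-data lookups above.  They reflect the log's
 * torn-write detection scheme: before an iclog is written, the first four
 * bytes of every 512 byte block of the data area are stashed in
 * h_cycle_data[] (or in the extended headers for iclogs larger than 32k)
 * and replaced on disk with the cycle number, so a field that starts
 * exactly on a 512 byte boundary has to be read back from there.  For
 * instance (offsets are hypothetical): a field at offset 0x200 into
 * ic_datap gives idx = 1, which is below XLOG_HEADER_CYCLE_SIZE / BBSIZE
 * (64), so the value comes from h_cycle_data[1]; a field at offset 0x8200
 * gives idx = 65, hence j = 1, k = 1 and the value comes from
 * xhdr[1].hic_xheader.xh_cycle_data[1].
 */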
#endif

/*
 * Mark all iclogs IOERROR. l_icloglock is held by the caller.
 */
STATIC int
xlog_state_ioerror(
	struct xlog	*log)
{
	xlog_in_core_t	*iclog, *ic;

	iclog = log->l_iclog;
	if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
		/*
		 * Mark all the incore logs IOERROR.
		 * From now on, no log flushes will result.
		 */
		ic = iclog;
		do {
			ic->ic_state = XLOG_STATE_IOERROR;
			ic = ic->ic_next;
		} while (ic != iclog);
		return 0;
	}
	/*
	 * Return non-zero if the state transition has already happened.
	 */
	return 1;
}

/*
 * This is called from xfs_force_shutdown, when we're forcibly
 * shutting down the filesystem, typically because of an IO error.
 * Our main objectives here are to make sure that:
 *	a. if !logerror, flush the logs to disk. Anything modified
 *	   after this is ignored.
 *	b. the filesystem gets marked 'SHUTDOWN' for all interested
 *	   parties to find out, 'atomically'.
 *	c. those who're sleeping on log reservations, pinned objects and
 *	   other resources get woken up, and are told the bad news.
 *	d. nothing new gets queued up after (b) and (c) are done.
 *
 * Note: for the !logerror case we need to flush the regions held in memory out
 * to disk first. This needs to be done before the log is marked as shutdown,
 * otherwise the iclog writes will fail.
 */
int
xfs_log_force_umount(
	struct xfs_mount	*mp,
	int			logerror)
{
	struct xlog	*log;
	int		retval;

	log = mp->m_log;

	/*
	 * If this happens during log recovery, don't worry about
	 * locking; the log isn't open for business yet.
	 */
	if (!log ||
	    log->l_flags & XLOG_ACTIVE_RECOVERY) {
		mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
		if (mp->m_sb_bp)
			mp->m_sb_bp->b_flags |= XBF_DONE;
		return 0;
	}

	/*
	 * Somebody could've already done the hard work for us.
	 * No need to get locks for this.
	 */
	if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
		ASSERT(XLOG_FORCED_SHUTDOWN(log));
		return 1;
	}

	/*
	 * Flush all the completed transactions to disk before marking the log
	 * as shut down. We need to do it in this order to ensure that
	 * completed operations are safely on disk before we shut down, and
	 * that we don't have to issue any buffer IO after the shutdown flags
	 * are set to guarantee this.
	 */
	if (!logerror)
		xfs_log_force(mp, XFS_LOG_SYNC);

	/*
	 * Mark the filesystem and the log as in a shutdown state and wake
	 * everybody up to tell them the bad news.
	 */
	spin_lock(&log->l_icloglock);
	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
	if (mp->m_sb_bp)
		mp->m_sb_bp->b_flags |= XBF_DONE;

	/*
	 * Mark the log and the iclogs with IO error flags to prevent any
	 * further log IO from being issued or completed.
	 */
	log->l_flags |= XLOG_IO_ERROR;
	retval = xlog_state_ioerror(log);
	spin_unlock(&log->l_icloglock);

	/*
	 * We don't want anybody waiting for log reservations after this. That
	 * means we have to wake up everybody queued up on reserveq as well as
	 * writeq. In addition, we make sure in xlog_{re}grant_log_space that
	 * we don't enqueue anything once the SHUTDOWN flag is set, and this
	 * action is protected by the grant locks.
	 */
	xlog_grant_head_wake_all(&log->l_reserve_head);
	xlog_grant_head_wake_all(&log->l_write_head);

	/*
	 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
	 * as if the log writes were completed. The abort handling in the log
	 * item committed callback functions will do this again under lock to
	 * avoid races.
	 */
	spin_lock(&log->l_cilp->xc_push_lock);
	wake_up_all(&log->l_cilp->xc_commit_wait);
	spin_unlock(&log->l_cilp->xc_push_lock);
	xlog_state_do_callback(log, true, NULL);

#ifdef XFSERRORDEBUG
	{
		xlog_in_core_t	*iclog;

		spin_lock(&log->l_icloglock);
		iclog = log->l_iclog;
		do {
			ASSERT(iclog->ic_callback == 0);
			iclog = iclog->ic_next;
		} while (iclog != log->l_iclog);
		spin_unlock(&log->l_icloglock);
	}
#endif
	/* return non-zero if log IOERROR transition had already happened */
	return retval;
}

STATIC int
xlog_iclogs_empty(
	struct xlog	*log)
{
	xlog_in_core_t	*iclog;

	iclog = log->l_iclog;
	do {
		/* endianness does not matter here, zero is zero in
		 * any language.
		 */
		if (iclog->ic_header.h_num_logops)
			return 0;
		iclog = iclog->ic_next;
	} while (iclog != log->l_iclog);
	return 1;
}

/*
 * Verify that an LSN stamped into a piece of metadata is valid. This is
 * intended for use in read verifiers on v5 superblocks.
 */
bool
xfs_log_check_lsn(
	struct xfs_mount	*mp,
	xfs_lsn_t		lsn)
{
	struct xlog		*log = mp->m_log;
	bool			valid;

	/*
	 * norecovery mode skips mount-time log processing and unconditionally
	 * resets the in-core LSN. We can't validate in this mode, but
	 * modifications are not allowed anyway, so just return true.
	 */
	if (mp->m_flags & XFS_MOUNT_NORECOVERY)
		return true;

	/*
	 * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
	 * handled by recovery and thus safe to ignore here.
	 */
	if (lsn == NULLCOMMITLSN)
		return true;

	valid = xlog_valid_lsn(mp->m_log, lsn);

	/* warn the user about what's gone wrong before verifier failure */
	if (!valid) {
		spin_lock(&log->l_icloglock);
		xfs_warn(mp,
"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
"Please unmount and run xfs_repair (>= v4.3) to resolve.",
			 CYCLE_LSN(lsn), BLOCK_LSN(lsn),
			 log->l_curr_cycle, log->l_curr_block);
		spin_unlock(&log->l_icloglock);
	}

	return valid;
}

bool
xfs_log_in_recovery(
	struct xfs_mount	*mp)
{
	struct xlog		*log = mp->m_log;

	return log->l_flags & XLOG_ACTIVE_RECOVERY;
}
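
/*
 * Illustrative sketch (compiled out) of how a v5 read verifier might use
 * xfs_log_check_lsn().  The "example_ondisk_hdr" structure, its eh_lsn
 * field and the "example_verify" helper below are hypothetical and are not
 * part of XFS; real verifiers follow the same pattern with their own
 * on-disk structures.
 */
#if 0
struct example_ondisk_hdr {
	__be64			eh_lsn;	/* hypothetical stamped LSN field */
};

static xfs_failaddr_t
example_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct example_ondisk_hdr *hdr = bp->b_addr;

	/* only v5 (CRC enabled) filesystems stamp LSNs into metadata */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return NULL;

	/* reject metadata whose LSN is ahead of the current log head */
	if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr->eh_lsn)))
		return __this_address;

	return NULL;
}
#endif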