1 /* 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_fs.h" 20 #include "xfs_shared.h" 21 #include "xfs_format.h" 22 #include "xfs_log_format.h" 23 #include "xfs_trans_resv.h" 24 #include "xfs_bit.h" 25 #include "xfs_sb.h" 26 #include "xfs_mount.h" 27 #include "xfs_da_format.h" 28 #include "xfs_da_btree.h" 29 #include "xfs_inode.h" 30 #include "xfs_trans.h" 31 #include "xfs_log.h" 32 #include "xfs_log_priv.h" 33 #include "xfs_log_recover.h" 34 #include "xfs_inode_item.h" 35 #include "xfs_extfree_item.h" 36 #include "xfs_trans_priv.h" 37 #include "xfs_alloc.h" 38 #include "xfs_ialloc.h" 39 #include "xfs_quota.h" 40 #include "xfs_cksum.h" 41 #include "xfs_trace.h" 42 #include "xfs_icache.h" 43 #include "xfs_bmap_btree.h" 44 #include "xfs_error.h" 45 #include "xfs_dir2.h" 46 47 #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) 48 49 STATIC int 50 xlog_find_zeroed( 51 struct xlog *, 52 xfs_daddr_t *); 53 STATIC int 54 xlog_clear_stale_blocks( 55 struct xlog *, 56 xfs_lsn_t); 57 #if defined(DEBUG) 58 STATIC void 59 xlog_recover_check_summary( 60 struct xlog *); 61 #else 62 #define xlog_recover_check_summary(log) 63 #endif 64 STATIC int 65 xlog_do_recovery_pass( 66 struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); 67 68 /* 69 * This structure is used during recovery to record the buf log items which 70 * have been canceled and should not be replayed. 71 */ 72 struct xfs_buf_cancel { 73 xfs_daddr_t bc_blkno; 74 uint bc_len; 75 int bc_refcount; 76 struct list_head bc_list; 77 }; 78 79 /* 80 * Sector aligned buffer routines for buffer create/read/write/access 81 */ 82 83 /* 84 * Verify the given count of basic blocks is valid number of blocks 85 * to specify for an operation involving the given XFS log buffer. 86 * Returns nonzero if the count is valid, 0 otherwise. 87 */ 88 89 static inline int 90 xlog_buf_bbcount_valid( 91 struct xlog *log, 92 int bbcount) 93 { 94 return bbcount > 0 && bbcount <= log->l_logBBsize; 95 } 96 97 /* 98 * Allocate a buffer to hold log data. The buffer needs to be able 99 * to map to a range of nbblks basic blocks at any valid (basic 100 * block) offset within the log. 101 */ 102 STATIC xfs_buf_t * 103 xlog_get_bp( 104 struct xlog *log, 105 int nbblks) 106 { 107 struct xfs_buf *bp; 108 109 if (!xlog_buf_bbcount_valid(log, nbblks)) { 110 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 111 nbblks); 112 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 113 return NULL; 114 } 115 116 /* 117 * We do log I/O in units of log sectors (a power-of-2 118 * multiple of the basic block size), so we round up the 119 * requested size to accommodate the basic blocks required 120 * for complete log sectors. 
121 * 122 * In addition, the buffer may be used for a non-sector- 123 * aligned block offset, in which case an I/O of the 124 * requested size could extend beyond the end of the 125 * buffer. If the requested size is only 1 basic block it 126 * will never straddle a sector boundary, so this won't be 127 * an issue. Nor will this be a problem if the log I/O is 128 * done in basic blocks (sector size 1). But otherwise we 129 * extend the buffer by one extra log sector to ensure 130 * there's space to accommodate this possibility. 131 */ 132 if (nbblks > 1 && log->l_sectBBsize > 1) 133 nbblks += log->l_sectBBsize; 134 nbblks = round_up(nbblks, log->l_sectBBsize); 135 136 bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); 137 if (bp) 138 xfs_buf_unlock(bp); 139 return bp; 140 } 141 142 STATIC void 143 xlog_put_bp( 144 xfs_buf_t *bp) 145 { 146 xfs_buf_free(bp); 147 } 148 149 /* 150 * Return the address of the start of the given block number's data 151 * in a log buffer. The buffer covers a log sector-aligned region. 152 */ 153 STATIC char * 154 xlog_align( 155 struct xlog *log, 156 xfs_daddr_t blk_no, 157 int nbblks, 158 struct xfs_buf *bp) 159 { 160 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); 161 162 ASSERT(offset + nbblks <= bp->b_length); 163 return bp->b_addr + BBTOB(offset); 164 } 165 166 167 /* 168 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 169 */ 170 STATIC int 171 xlog_bread_noalign( 172 struct xlog *log, 173 xfs_daddr_t blk_no, 174 int nbblks, 175 struct xfs_buf *bp) 176 { 177 int error; 178 179 if (!xlog_buf_bbcount_valid(log, nbblks)) { 180 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 181 nbblks); 182 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 183 return -EFSCORRUPTED; 184 } 185 186 blk_no = round_down(blk_no, log->l_sectBBsize); 187 nbblks = round_up(nbblks, log->l_sectBBsize); 188 189 ASSERT(nbblks > 0); 190 ASSERT(nbblks <= bp->b_length); 191 192 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 193 bp->b_flags |= XBF_READ; 194 bp->b_io_length = nbblks; 195 bp->b_error = 0; 196 197 error = xfs_buf_submit_wait(bp); 198 if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) 199 xfs_buf_ioerror_alert(bp, __func__); 200 return error; 201 } 202 203 STATIC int 204 xlog_bread( 205 struct xlog *log, 206 xfs_daddr_t blk_no, 207 int nbblks, 208 struct xfs_buf *bp, 209 char **offset) 210 { 211 int error; 212 213 error = xlog_bread_noalign(log, blk_no, nbblks, bp); 214 if (error) 215 return error; 216 217 *offset = xlog_align(log, blk_no, nbblks, bp); 218 return 0; 219 } 220 221 /* 222 * Read at an offset into the buffer. Returns with the buffer in it's original 223 * state regardless of the result of the read. 224 */ 225 STATIC int 226 xlog_bread_offset( 227 struct xlog *log, 228 xfs_daddr_t blk_no, /* block to read from */ 229 int nbblks, /* blocks to read */ 230 struct xfs_buf *bp, 231 char *offset) 232 { 233 char *orig_offset = bp->b_addr; 234 int orig_len = BBTOB(bp->b_length); 235 int error, error2; 236 237 error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); 238 if (error) 239 return error; 240 241 error = xlog_bread_noalign(log, blk_no, nbblks, bp); 242 243 /* must reset buffer pointer even on error */ 244 error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); 245 if (error) 246 return error; 247 return error2; 248 } 249 250 /* 251 * Write out the buffer at the given block for the given number of blocks. 252 * The buffer is kept locked across the write and is returned locked. 
253 * This can only be used for synchronous log writes. 254 */ 255 STATIC int 256 xlog_bwrite( 257 struct xlog *log, 258 xfs_daddr_t blk_no, 259 int nbblks, 260 struct xfs_buf *bp) 261 { 262 int error; 263 264 if (!xlog_buf_bbcount_valid(log, nbblks)) { 265 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 266 nbblks); 267 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 268 return -EFSCORRUPTED; 269 } 270 271 blk_no = round_down(blk_no, log->l_sectBBsize); 272 nbblks = round_up(nbblks, log->l_sectBBsize); 273 274 ASSERT(nbblks > 0); 275 ASSERT(nbblks <= bp->b_length); 276 277 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 278 xfs_buf_hold(bp); 279 xfs_buf_lock(bp); 280 bp->b_io_length = nbblks; 281 bp->b_error = 0; 282 283 error = xfs_bwrite(bp); 284 if (error) 285 xfs_buf_ioerror_alert(bp, __func__); 286 xfs_buf_relse(bp); 287 return error; 288 } 289 290 #ifdef DEBUG 291 /* 292 * dump debug superblock and log record information 293 */ 294 STATIC void 295 xlog_header_check_dump( 296 xfs_mount_t *mp, 297 xlog_rec_header_t *head) 298 { 299 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", 300 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 301 xfs_debug(mp, " log : uuid = %pU, fmt = %d", 302 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 303 } 304 #else 305 #define xlog_header_check_dump(mp, head) 306 #endif 307 308 /* 309 * check log record header for recovery 310 */ 311 STATIC int 312 xlog_header_check_recover( 313 xfs_mount_t *mp, 314 xlog_rec_header_t *head) 315 { 316 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); 317 318 /* 319 * IRIX doesn't write the h_fmt field and leaves it zeroed 320 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover 321 * a dirty log created in IRIX. 322 */ 323 if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { 324 xfs_warn(mp, 325 "dirty log written in incompatible format - can't recover"); 326 xlog_header_check_dump(mp, head); 327 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 328 XFS_ERRLEVEL_HIGH, mp); 329 return -EFSCORRUPTED; 330 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 331 xfs_warn(mp, 332 "dirty log entry has mismatched uuid - can't recover"); 333 xlog_header_check_dump(mp, head); 334 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 335 XFS_ERRLEVEL_HIGH, mp); 336 return -EFSCORRUPTED; 337 } 338 return 0; 339 } 340 341 /* 342 * read the head block of the log and check the header 343 */ 344 STATIC int 345 xlog_header_check_mount( 346 xfs_mount_t *mp, 347 xlog_rec_header_t *head) 348 { 349 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); 350 351 if (uuid_is_nil(&head->h_fs_uuid)) { 352 /* 353 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If 354 * h_fs_uuid is nil, we assume this log was last mounted 355 * by IRIX and continue. 356 */ 357 xfs_warn(mp, "nil uuid in log - IRIX style log"); 358 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 359 xfs_warn(mp, "log has mismatched uuid - can't recover"); 360 xlog_header_check_dump(mp, head); 361 XFS_ERROR_REPORT("xlog_header_check_mount", 362 XFS_ERRLEVEL_HIGH, mp); 363 return -EFSCORRUPTED; 364 } 365 return 0; 366 } 367 368 STATIC void 369 xlog_recover_iodone( 370 struct xfs_buf *bp) 371 { 372 if (bp->b_error) { 373 /* 374 * We're not going to bother about retrying 375 * this during recovery. One strike! 
376 */ 377 if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { 378 xfs_buf_ioerror_alert(bp, __func__); 379 xfs_force_shutdown(bp->b_target->bt_mount, 380 SHUTDOWN_META_IO_ERROR); 381 } 382 } 383 bp->b_iodone = NULL; 384 xfs_buf_ioend(bp); 385 } 386 387 /* 388 * This routine finds (to an approximation) the first block in the physical 389 * log which contains the given cycle. It uses a binary search algorithm. 390 * Note that the algorithm can not be perfect because the disk will not 391 * necessarily be perfect. 392 */ 393 STATIC int 394 xlog_find_cycle_start( 395 struct xlog *log, 396 struct xfs_buf *bp, 397 xfs_daddr_t first_blk, 398 xfs_daddr_t *last_blk, 399 uint cycle) 400 { 401 char *offset; 402 xfs_daddr_t mid_blk; 403 xfs_daddr_t end_blk; 404 uint mid_cycle; 405 int error; 406 407 end_blk = *last_blk; 408 mid_blk = BLK_AVG(first_blk, end_blk); 409 while (mid_blk != first_blk && mid_blk != end_blk) { 410 error = xlog_bread(log, mid_blk, 1, bp, &offset); 411 if (error) 412 return error; 413 mid_cycle = xlog_get_cycle(offset); 414 if (mid_cycle == cycle) 415 end_blk = mid_blk; /* last_half_cycle == mid_cycle */ 416 else 417 first_blk = mid_blk; /* first_half_cycle == mid_cycle */ 418 mid_blk = BLK_AVG(first_blk, end_blk); 419 } 420 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || 421 (mid_blk == end_blk && mid_blk-1 == first_blk)); 422 423 *last_blk = end_blk; 424 425 return 0; 426 } 427 428 /* 429 * Check that a range of blocks does not contain stop_on_cycle_no. 430 * Fill in *new_blk with the block offset where such a block is 431 * found, or with -1 (an invalid block number) if there is no such 432 * block in the range. The scan needs to occur from front to back 433 * and the pointer into the region must be updated since a later 434 * routine will need to perform another test. 435 */ 436 STATIC int 437 xlog_find_verify_cycle( 438 struct xlog *log, 439 xfs_daddr_t start_blk, 440 int nbblks, 441 uint stop_on_cycle_no, 442 xfs_daddr_t *new_blk) 443 { 444 xfs_daddr_t i, j; 445 uint cycle; 446 xfs_buf_t *bp; 447 xfs_daddr_t bufblks; 448 char *buf = NULL; 449 int error = 0; 450 451 /* 452 * Greedily allocate a buffer big enough to handle the full 453 * range of basic blocks we'll be examining. If that fails, 454 * try a smaller size. We need to be able to read at least 455 * a log sector, or we're out of luck. 456 */ 457 bufblks = 1 << ffs(nbblks); 458 while (bufblks > log->l_logBBsize) 459 bufblks >>= 1; 460 while (!(bp = xlog_get_bp(log, bufblks))) { 461 bufblks >>= 1; 462 if (bufblks < log->l_sectBBsize) 463 return -ENOMEM; 464 } 465 466 for (i = start_blk; i < start_blk + nbblks; i += bufblks) { 467 int bcount; 468 469 bcount = min(bufblks, (start_blk + nbblks - i)); 470 471 error = xlog_bread(log, i, bcount, bp, &buf); 472 if (error) 473 goto out; 474 475 for (j = 0; j < bcount; j++) { 476 cycle = xlog_get_cycle(buf); 477 if (cycle == stop_on_cycle_no) { 478 *new_blk = i+j; 479 goto out; 480 } 481 482 buf += BBSIZE; 483 } 484 } 485 486 *new_blk = -1; 487 488 out: 489 xlog_put_bp(bp); 490 return error; 491 } 492 493 /* 494 * Potentially backup over partial log record write. 495 * 496 * In the typical case, last_blk is the number of the block directly after 497 * a good log record. Therefore, we subtract one to get the block number 498 * of the last block in the given buffer. extra_bblks contains the number 499 * of blocks we would have read on a previous read. This happens when the 500 * last log record is split over the end of the physical log. 
501 * 502 * extra_bblks is the number of blocks potentially verified on a previous 503 * call to this routine. 504 */ 505 STATIC int 506 xlog_find_verify_log_record( 507 struct xlog *log, 508 xfs_daddr_t start_blk, 509 xfs_daddr_t *last_blk, 510 int extra_bblks) 511 { 512 xfs_daddr_t i; 513 xfs_buf_t *bp; 514 char *offset = NULL; 515 xlog_rec_header_t *head = NULL; 516 int error = 0; 517 int smallmem = 0; 518 int num_blks = *last_blk - start_blk; 519 int xhdrs; 520 521 ASSERT(start_blk != 0 || *last_blk != start_blk); 522 523 if (!(bp = xlog_get_bp(log, num_blks))) { 524 if (!(bp = xlog_get_bp(log, 1))) 525 return -ENOMEM; 526 smallmem = 1; 527 } else { 528 error = xlog_bread(log, start_blk, num_blks, bp, &offset); 529 if (error) 530 goto out; 531 offset += ((num_blks - 1) << BBSHIFT); 532 } 533 534 for (i = (*last_blk) - 1; i >= 0; i--) { 535 if (i < start_blk) { 536 /* valid log record not found */ 537 xfs_warn(log->l_mp, 538 "Log inconsistent (didn't find previous header)"); 539 ASSERT(0); 540 error = -EIO; 541 goto out; 542 } 543 544 if (smallmem) { 545 error = xlog_bread(log, i, 1, bp, &offset); 546 if (error) 547 goto out; 548 } 549 550 head = (xlog_rec_header_t *)offset; 551 552 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) 553 break; 554 555 if (!smallmem) 556 offset -= BBSIZE; 557 } 558 559 /* 560 * We hit the beginning of the physical log & still no header. Return 561 * to caller. If caller can handle a return of -1, then this routine 562 * will be called again for the end of the physical log. 563 */ 564 if (i == -1) { 565 error = 1; 566 goto out; 567 } 568 569 /* 570 * We have the final block of the good log (the first block 571 * of the log record _before_ the head. So we check the uuid. 572 */ 573 if ((error = xlog_header_check_mount(log->l_mp, head))) 574 goto out; 575 576 /* 577 * We may have found a log record header before we expected one. 578 * last_blk will be the 1st block # with a given cycle #. We may end 579 * up reading an entire log record. In this case, we don't want to 580 * reset last_blk. Only when last_blk points in the middle of a log 581 * record do we update last_blk. 582 */ 583 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 584 uint h_size = be32_to_cpu(head->h_size); 585 586 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; 587 if (h_size % XLOG_HEADER_CYCLE_SIZE) 588 xhdrs++; 589 } else { 590 xhdrs = 1; 591 } 592 593 if (*last_blk - i + extra_bblks != 594 BTOBB(be32_to_cpu(head->h_len)) + xhdrs) 595 *last_blk = i; 596 597 out: 598 xlog_put_bp(bp); 599 return error; 600 } 601 602 /* 603 * Head is defined to be the point of the log where the next log write 604 * could go. This means that incomplete LR writes at the end are 605 * eliminated when calculating the head. We aren't guaranteed that previous 606 * LR have complete transactions. We only know that a cycle number of 607 * current cycle number -1 won't be present in the log if we start writing 608 * from our current block number. 609 * 610 * last_blk contains the block number of the first block with a given 611 * cycle number. 612 * 613 * Return: zero if normal, non-zero if error. 614 */ 615 STATIC int 616 xlog_find_head( 617 struct xlog *log, 618 xfs_daddr_t *return_head_blk) 619 { 620 xfs_buf_t *bp; 621 char *offset; 622 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; 623 int num_scan_bblks; 624 uint first_half_cycle, last_half_cycle; 625 uint stop_on_cycle; 626 int error, log_bbnum = log->l_logBBsize; 627 628 /* Is the end of the log device zeroed? 
*/ 629 error = xlog_find_zeroed(log, &first_blk); 630 if (error < 0) { 631 xfs_warn(log->l_mp, "empty log check failed"); 632 return error; 633 } 634 if (error == 1) { 635 *return_head_blk = first_blk; 636 637 /* Is the whole lot zeroed? */ 638 if (!first_blk) { 639 /* Linux XFS shouldn't generate totally zeroed logs - 640 * mkfs etc write a dummy unmount record to a fresh 641 * log so we can store the uuid in there 642 */ 643 xfs_warn(log->l_mp, "totally zeroed log"); 644 } 645 646 return 0; 647 } 648 649 first_blk = 0; /* get cycle # of 1st block */ 650 bp = xlog_get_bp(log, 1); 651 if (!bp) 652 return -ENOMEM; 653 654 error = xlog_bread(log, 0, 1, bp, &offset); 655 if (error) 656 goto bp_err; 657 658 first_half_cycle = xlog_get_cycle(offset); 659 660 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ 661 error = xlog_bread(log, last_blk, 1, bp, &offset); 662 if (error) 663 goto bp_err; 664 665 last_half_cycle = xlog_get_cycle(offset); 666 ASSERT(last_half_cycle != 0); 667 668 /* 669 * If the 1st half cycle number is equal to the last half cycle number, 670 * then the entire log is stamped with the same cycle number. In this 671 * case, head_blk can't be set to zero (which makes sense). The below 672 * math doesn't work out properly with head_blk equal to zero. Instead, 673 * we set it to log_bbnum which is an invalid block number, but this 674 * value makes the math correct. If head_blk doesn't changed through 675 * all the tests below, *head_blk is set to zero at the very end rather 676 * than log_bbnum. In a sense, log_bbnum and zero are the same block 677 * in a circular file. 678 */ 679 if (first_half_cycle == last_half_cycle) { 680 /* 681 * In this case we believe that the entire log should have 682 * cycle number last_half_cycle. We need to scan backwards 683 * from the end verifying that there are no holes still 684 * containing last_half_cycle - 1. If we find such a hole, 685 * then the start of that hole will be the new head. The 686 * simple case looks like 687 * x | x ... | x - 1 | x 688 * Another case that fits this picture would be 689 * x | x + 1 | x ... | x 690 * In this case the head really is somewhere at the end of the 691 * log, as one of the latest writes at the beginning was 692 * incomplete. 693 * One more case is 694 * x | x + 1 | x ... | x - 1 | x 695 * This is really the combination of the above two cases, and 696 * the head has to end up at the start of the x-1 hole at the 697 * end of the log. 698 * 699 * In the 256k log case, we will read from the beginning to the 700 * end of the log and search for cycle numbers equal to x-1. 701 * We don't worry about the x+1 blocks that we encounter, 702 * because we know that they cannot be the head since the log 703 * started with x. 704 */ 705 head_blk = log_bbnum; 706 stop_on_cycle = last_half_cycle - 1; 707 } else { 708 /* 709 * In this case we want to find the first block with cycle 710 * number matching last_half_cycle. We expect the log to be 711 * some variation on 712 * x + 1 ... | x ... | x 713 * The first block with cycle number x (last_half_cycle) will 714 * be where the new head belongs. First we do a binary search 715 * for the first occurrence of last_half_cycle. The binary 716 * search may not be totally accurate, so then we scan back 717 * from there looking for occurrences of last_half_cycle before 718 * us. If that backwards scan wraps around the beginning of 719 * the log, then we look for occurrences of last_half_cycle - 1 720 * at the end of the log. 
The cases we're looking for look 721 * like 722 * v binary search stopped here 723 * x + 1 ... | x | x + 1 | x ... | x 724 * ^ but we want to locate this spot 725 * or 726 * <---------> less than scan distance 727 * x + 1 ... | x ... | x - 1 | x 728 * ^ we want to locate this spot 729 */ 730 stop_on_cycle = last_half_cycle; 731 if ((error = xlog_find_cycle_start(log, bp, first_blk, 732 &head_blk, last_half_cycle))) 733 goto bp_err; 734 } 735 736 /* 737 * Now validate the answer. Scan back some number of maximum possible 738 * blocks and make sure each one has the expected cycle number. The 739 * maximum is determined by the total possible amount of buffering 740 * in the in-core log. The following number can be made tighter if 741 * we actually look at the block size of the filesystem. 742 */ 743 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 744 if (head_blk >= num_scan_bblks) { 745 /* 746 * We are guaranteed that the entire check can be performed 747 * in one buffer. 748 */ 749 start_blk = head_blk - num_scan_bblks; 750 if ((error = xlog_find_verify_cycle(log, 751 start_blk, num_scan_bblks, 752 stop_on_cycle, &new_blk))) 753 goto bp_err; 754 if (new_blk != -1) 755 head_blk = new_blk; 756 } else { /* need to read 2 parts of log */ 757 /* 758 * We are going to scan backwards in the log in two parts. 759 * First we scan the physical end of the log. In this part 760 * of the log, we are looking for blocks with cycle number 761 * last_half_cycle - 1. 762 * If we find one, then we know that the log starts there, as 763 * we've found a hole that didn't get written in going around 764 * the end of the physical log. The simple case for this is 765 * x + 1 ... | x ... | x - 1 | x 766 * <---------> less than scan distance 767 * If all of the blocks at the end of the log have cycle number 768 * last_half_cycle, then we check the blocks at the start of 769 * the log looking for occurrences of last_half_cycle. If we 770 * find one, then our current estimate for the location of the 771 * first occurrence of last_half_cycle is wrong and we move 772 * back to the hole we've found. This case looks like 773 * x + 1 ... | x | x + 1 | x ... 774 * ^ binary search stopped here 775 * Another case we need to handle that only occurs in 256k 776 * logs is 777 * x + 1 ... | x ... | x+1 | x ... 778 * ^ binary search stops here 779 * In a 256k log, the scan at the end of the log will see the 780 * x + 1 blocks. We need to skip past those since that is 781 * certainly not the head of the log. By searching for 782 * last_half_cycle-1 we accomplish that. 783 */ 784 ASSERT(head_blk <= INT_MAX && 785 (xfs_daddr_t) num_scan_bblks >= head_blk); 786 start_blk = log_bbnum - (num_scan_bblks - head_blk); 787 if ((error = xlog_find_verify_cycle(log, start_blk, 788 num_scan_bblks - (int)head_blk, 789 (stop_on_cycle - 1), &new_blk))) 790 goto bp_err; 791 if (new_blk != -1) { 792 head_blk = new_blk; 793 goto validate_head; 794 } 795 796 /* 797 * Scan beginning of log now. The last part of the physical 798 * log is good. This scan needs to verify that it doesn't find 799 * the last_half_cycle. 800 */ 801 start_blk = 0; 802 ASSERT(head_blk <= INT_MAX); 803 if ((error = xlog_find_verify_cycle(log, 804 start_blk, (int)head_blk, 805 stop_on_cycle, &new_blk))) 806 goto bp_err; 807 if (new_blk != -1) 808 head_blk = new_blk; 809 } 810 811 validate_head: 812 /* 813 * Now we need to make sure head_blk is not pointing to a block in 814 * the middle of a log record. 
815 */ 816 num_scan_bblks = XLOG_REC_SHIFT(log); 817 if (head_blk >= num_scan_bblks) { 818 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ 819 820 /* start ptr at last block ptr before head_blk */ 821 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); 822 if (error == 1) 823 error = -EIO; 824 if (error) 825 goto bp_err; 826 } else { 827 start_blk = 0; 828 ASSERT(head_blk <= INT_MAX); 829 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); 830 if (error < 0) 831 goto bp_err; 832 if (error == 1) { 833 /* We hit the beginning of the log during our search */ 834 start_blk = log_bbnum - (num_scan_bblks - head_blk); 835 new_blk = log_bbnum; 836 ASSERT(start_blk <= INT_MAX && 837 (xfs_daddr_t) log_bbnum-start_blk >= 0); 838 ASSERT(head_blk <= INT_MAX); 839 error = xlog_find_verify_log_record(log, start_blk, 840 &new_blk, (int)head_blk); 841 if (error == 1) 842 error = -EIO; 843 if (error) 844 goto bp_err; 845 if (new_blk != log_bbnum) 846 head_blk = new_blk; 847 } else if (error) 848 goto bp_err; 849 } 850 851 xlog_put_bp(bp); 852 if (head_blk == log_bbnum) 853 *return_head_blk = 0; 854 else 855 *return_head_blk = head_blk; 856 /* 857 * When returning here, we have a good block number. Bad block 858 * means that during a previous crash, we didn't have a clean break 859 * from cycle number N to cycle number N-1. In this case, we need 860 * to find the first block with cycle number N-1. 861 */ 862 return 0; 863 864 bp_err: 865 xlog_put_bp(bp); 866 867 if (error) 868 xfs_warn(log->l_mp, "failed to find log head"); 869 return error; 870 } 871 872 /* 873 * Seek backwards in the log for log record headers. 874 * 875 * Given a starting log block, walk backwards until we find the provided number 876 * of records or hit the provided tail block. The return value is the number of 877 * records encountered or a negative error code. The log block and buffer 878 * pointer of the last record seen are returned in rblk and rhead respectively. 879 */ 880 STATIC int 881 xlog_rseek_logrec_hdr( 882 struct xlog *log, 883 xfs_daddr_t head_blk, 884 xfs_daddr_t tail_blk, 885 int count, 886 struct xfs_buf *bp, 887 xfs_daddr_t *rblk, 888 struct xlog_rec_header **rhead, 889 bool *wrapped) 890 { 891 int i; 892 int error; 893 int found = 0; 894 char *offset = NULL; 895 xfs_daddr_t end_blk; 896 897 *wrapped = false; 898 899 /* 900 * Walk backwards from the head block until we hit the tail or the first 901 * block in the log. 902 */ 903 end_blk = head_blk > tail_blk ? tail_blk : 0; 904 for (i = (int) head_blk - 1; i >= end_blk; i--) { 905 error = xlog_bread(log, i, 1, bp, &offset); 906 if (error) 907 goto out_error; 908 909 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { 910 *rblk = i; 911 *rhead = (struct xlog_rec_header *) offset; 912 if (++found == count) 913 break; 914 } 915 } 916 917 /* 918 * If we haven't hit the tail block or the log record header count, 919 * start looking again from the end of the physical log. Note that 920 * callers can pass head == tail if the tail is not yet known. 
921 */ 922 if (tail_blk >= head_blk && found != count) { 923 for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) { 924 error = xlog_bread(log, i, 1, bp, &offset); 925 if (error) 926 goto out_error; 927 928 if (*(__be32 *)offset == 929 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { 930 *wrapped = true; 931 *rblk = i; 932 *rhead = (struct xlog_rec_header *) offset; 933 if (++found == count) 934 break; 935 } 936 } 937 } 938 939 return found; 940 941 out_error: 942 return error; 943 } 944 945 /* 946 * Seek forward in the log for log record headers. 947 * 948 * Given head and tail blocks, walk forward from the tail block until we find 949 * the provided number of records or hit the head block. The return value is the 950 * number of records encountered or a negative error code. The log block and 951 * buffer pointer of the last record seen are returned in rblk and rhead 952 * respectively. 953 */ 954 STATIC int 955 xlog_seek_logrec_hdr( 956 struct xlog *log, 957 xfs_daddr_t head_blk, 958 xfs_daddr_t tail_blk, 959 int count, 960 struct xfs_buf *bp, 961 xfs_daddr_t *rblk, 962 struct xlog_rec_header **rhead, 963 bool *wrapped) 964 { 965 int i; 966 int error; 967 int found = 0; 968 char *offset = NULL; 969 xfs_daddr_t end_blk; 970 971 *wrapped = false; 972 973 /* 974 * Walk forward from the tail block until we hit the head or the last 975 * block in the log. 976 */ 977 end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1; 978 for (i = (int) tail_blk; i <= end_blk; i++) { 979 error = xlog_bread(log, i, 1, bp, &offset); 980 if (error) 981 goto out_error; 982 983 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { 984 *rblk = i; 985 *rhead = (struct xlog_rec_header *) offset; 986 if (++found == count) 987 break; 988 } 989 } 990 991 /* 992 * If we haven't hit the head block or the log record header count, 993 * start looking again from the start of the physical log. 994 */ 995 if (tail_blk > head_blk && found != count) { 996 for (i = 0; i < (int) head_blk; i++) { 997 error = xlog_bread(log, i, 1, bp, &offset); 998 if (error) 999 goto out_error; 1000 1001 if (*(__be32 *)offset == 1002 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { 1003 *wrapped = true; 1004 *rblk = i; 1005 *rhead = (struct xlog_rec_header *) offset; 1006 if (++found == count) 1007 break; 1008 } 1009 } 1010 } 1011 1012 return found; 1013 1014 out_error: 1015 return error; 1016 } 1017 1018 /* 1019 * Check the log tail for torn writes. This is required when torn writes are 1020 * detected at the head and the head had to be walked back to a previous record. 1021 * The tail of the previous record must now be verified to ensure the torn 1022 * writes didn't corrupt the previous tail. 1023 * 1024 * Return an error if CRC verification fails as recovery cannot proceed. 1025 */ 1026 STATIC int 1027 xlog_verify_tail( 1028 struct xlog *log, 1029 xfs_daddr_t head_blk, 1030 xfs_daddr_t tail_blk) 1031 { 1032 struct xlog_rec_header *thead; 1033 struct xfs_buf *bp; 1034 xfs_daddr_t first_bad; 1035 int count; 1036 int error = 0; 1037 bool wrapped; 1038 xfs_daddr_t tmp_head; 1039 1040 bp = xlog_get_bp(log, 1); 1041 if (!bp) 1042 return -ENOMEM; 1043 1044 /* 1045 * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get 1046 * a temporary head block that points after the last possible 1047 * concurrently written record of the tail. 
1048 */ 1049 count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, 1050 XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, 1051 &wrapped); 1052 if (count < 0) { 1053 error = count; 1054 goto out; 1055 } 1056 1057 /* 1058 * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran 1059 * into the actual log head. tmp_head points to the start of the record 1060 * so update it to the actual head block. 1061 */ 1062 if (count < XLOG_MAX_ICLOGS + 1) 1063 tmp_head = head_blk; 1064 1065 /* 1066 * We now have a tail and temporary head block that covers at least 1067 * XLOG_MAX_ICLOGS records from the tail. We need to verify that these 1068 * records were completely written. Run a CRC verification pass from 1069 * tail to head and return the result. 1070 */ 1071 error = xlog_do_recovery_pass(log, tmp_head, tail_blk, 1072 XLOG_RECOVER_CRCPASS, &first_bad); 1073 1074 out: 1075 xlog_put_bp(bp); 1076 return error; 1077 } 1078 1079 /* 1080 * Detect and trim torn writes from the head of the log. 1081 * 1082 * Storage without sector atomicity guarantees can result in torn writes in the 1083 * log in the event of a crash. Our only means to detect this scenario is via 1084 * CRC verification. While we can't always be certain that CRC verification 1085 * failure is due to a torn write vs. an unrelated corruption, we do know that 1086 * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at 1087 * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of 1088 * the log and treat failures in this range as torn writes as a matter of 1089 * policy. In the event of CRC failure, the head is walked back to the last good 1090 * record in the log and the tail is updated from that record and verified. 1091 */ 1092 STATIC int 1093 xlog_verify_head( 1094 struct xlog *log, 1095 xfs_daddr_t *head_blk, /* in/out: unverified head */ 1096 xfs_daddr_t *tail_blk, /* out: tail block */ 1097 struct xfs_buf *bp, 1098 xfs_daddr_t *rhead_blk, /* start blk of last record */ 1099 struct xlog_rec_header **rhead, /* ptr to last record */ 1100 bool *wrapped) /* last rec. wraps phys. log */ 1101 { 1102 struct xlog_rec_header *tmp_rhead; 1103 struct xfs_buf *tmp_bp; 1104 xfs_daddr_t first_bad; 1105 xfs_daddr_t tmp_rhead_blk; 1106 int found; 1107 int error; 1108 bool tmp_wrapped; 1109 1110 /* 1111 * Check the head of the log for torn writes. Search backwards from the 1112 * head until we hit the tail or the maximum number of log record I/Os 1113 * that could have been in flight at one time. Use a temporary buffer so 1114 * we don't trash the rhead/bp pointers from the caller. 1115 */ 1116 tmp_bp = xlog_get_bp(log, 1); 1117 if (!tmp_bp) 1118 return -ENOMEM; 1119 error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, 1120 XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk, 1121 &tmp_rhead, &tmp_wrapped); 1122 xlog_put_bp(tmp_bp); 1123 if (error < 0) 1124 return error; 1125 1126 /* 1127 * Now run a CRC verification pass over the records starting at the 1128 * block found above to the current head. If a CRC failure occurs, the 1129 * log block of the first bad record is saved in first_bad. 1130 */ 1131 error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, 1132 XLOG_RECOVER_CRCPASS, &first_bad); 1133 if (error == -EFSBADCRC) { 1134 /* 1135 * We've hit a potential torn write. Reset the error and warn 1136 * about it. 1137 */ 1138 error = 0; 1139 xfs_warn(log->l_mp, 1140 "Torn write (CRC failure) detected at log block 0x%llx. 
Truncating head block from 0x%llx.", 1141 first_bad, *head_blk); 1142 1143 /* 1144 * Get the header block and buffer pointer for the last good 1145 * record before the bad record. 1146 * 1147 * Note that xlog_find_tail() clears the blocks at the new head 1148 * (i.e., the records with invalid CRC) if the cycle number 1149 * matches the the current cycle. 1150 */ 1151 found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp, 1152 rhead_blk, rhead, wrapped); 1153 if (found < 0) 1154 return found; 1155 if (found == 0) /* XXX: right thing to do here? */ 1156 return -EIO; 1157 1158 /* 1159 * Reset the head block to the starting block of the first bad 1160 * log record and set the tail block based on the last good 1161 * record. 1162 * 1163 * Bail out if the updated head/tail match as this indicates 1164 * possible corruption outside of the acceptable 1165 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair... 1166 */ 1167 *head_blk = first_bad; 1168 *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); 1169 if (*head_blk == *tail_blk) { 1170 ASSERT(0); 1171 return 0; 1172 } 1173 1174 /* 1175 * Now verify the tail based on the updated head. This is 1176 * required because the torn writes trimmed from the head could 1177 * have been written over the tail of a previous record. Return 1178 * any errors since recovery cannot proceed if the tail is 1179 * corrupt. 1180 * 1181 * XXX: This leaves a gap in truly robust protection from torn 1182 * writes in the log. If the head is behind the tail, the tail 1183 * pushes forward to create some space and then a crash occurs 1184 * causing the writes into the previous record's tail region to 1185 * tear, log recovery isn't able to recover. 1186 * 1187 * How likely is this to occur? If possible, can we do something 1188 * more intelligent here? Is it safe to push the tail forward if 1189 * we can determine that the tail is within the range of the 1190 * torn write (e.g., the kernel can only overwrite the tail if 1191 * it has actually been pushed forward)? Alternatively, could we 1192 * somehow prevent this condition at runtime? 1193 */ 1194 error = xlog_verify_tail(log, *head_blk, *tail_blk); 1195 } 1196 1197 return error; 1198 } 1199 1200 /* 1201 * Check whether the head of the log points to an unmount record. In other 1202 * words, determine whether the log is clean. If so, update the in-core state 1203 * appropriately. 1204 */ 1205 static int 1206 xlog_check_unmount_rec( 1207 struct xlog *log, 1208 xfs_daddr_t *head_blk, 1209 xfs_daddr_t *tail_blk, 1210 struct xlog_rec_header *rhead, 1211 xfs_daddr_t rhead_blk, 1212 struct xfs_buf *bp, 1213 bool *clean) 1214 { 1215 struct xlog_op_header *op_head; 1216 xfs_daddr_t umount_data_blk; 1217 xfs_daddr_t after_umount_blk; 1218 int hblks; 1219 int error; 1220 char *offset; 1221 1222 *clean = false; 1223 1224 /* 1225 * Look for unmount record. If we find it, then we know there was a 1226 * clean unmount. Since 'i' could be the last block in the physical 1227 * log, we convert to a log block before comparing to the head_blk. 1228 * 1229 * Save the current tail lsn to use to pass to xlog_clear_stale_blocks() 1230 * below. We won't want to clear the unmount record if there is one, so 1231 * we pass the lsn of the unmount record rather than the block after it. 
1232 */ 1233 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 1234 int h_size = be32_to_cpu(rhead->h_size); 1235 int h_version = be32_to_cpu(rhead->h_version); 1236 1237 if ((h_version & XLOG_VERSION_2) && 1238 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 1239 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 1240 if (h_size % XLOG_HEADER_CYCLE_SIZE) 1241 hblks++; 1242 } else { 1243 hblks = 1; 1244 } 1245 } else { 1246 hblks = 1; 1247 } 1248 after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); 1249 after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); 1250 if (*head_blk == after_umount_blk && 1251 be32_to_cpu(rhead->h_num_logops) == 1) { 1252 umount_data_blk = rhead_blk + hblks; 1253 umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); 1254 error = xlog_bread(log, umount_data_blk, 1, bp, &offset); 1255 if (error) 1256 return error; 1257 1258 op_head = (struct xlog_op_header *)offset; 1259 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 1260 /* 1261 * Set tail and last sync so that newly written log 1262 * records will point recovery to after the current 1263 * unmount record. 1264 */ 1265 xlog_assign_atomic_lsn(&log->l_tail_lsn, 1266 log->l_curr_cycle, after_umount_blk); 1267 xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1268 log->l_curr_cycle, after_umount_blk); 1269 *tail_blk = after_umount_blk; 1270 1271 *clean = true; 1272 } 1273 } 1274 1275 return 0; 1276 } 1277 1278 static void 1279 xlog_set_state( 1280 struct xlog *log, 1281 xfs_daddr_t head_blk, 1282 struct xlog_rec_header *rhead, 1283 xfs_daddr_t rhead_blk, 1284 bool bump_cycle) 1285 { 1286 /* 1287 * Reset log values according to the state of the log when we 1288 * crashed. In the case where head_blk == 0, we bump curr_cycle 1289 * one because the next write starts a new cycle rather than 1290 * continuing the cycle of the last good log record. At this 1291 * point we have guaranteed that all partial log records have been 1292 * accounted for. Therefore, we know that the last good log record 1293 * written was complete and ended exactly on the end boundary 1294 * of the physical log. 1295 */ 1296 log->l_prev_block = rhead_blk; 1297 log->l_curr_block = (int)head_blk; 1298 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 1299 if (bump_cycle) 1300 log->l_curr_cycle++; 1301 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); 1302 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); 1303 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, 1304 BBTOB(log->l_curr_block)); 1305 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, 1306 BBTOB(log->l_curr_block)); 1307 } 1308 1309 /* 1310 * Find the sync block number or the tail of the log. 1311 * 1312 * This will be the block number of the last record to have its 1313 * associated buffers synced to disk. Every log record header has 1314 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy 1315 * to get a sync block number. The only concern is to figure out which 1316 * log record header to believe. 1317 * 1318 * The following algorithm uses the log record header with the largest 1319 * lsn. The entire log record does not need to be valid. We only care 1320 * that the header is valid. 1321 * 1322 * We could speed up search by using current head_blk buffer, but it is not 1323 * available. 
1324 */ 1325 STATIC int 1326 xlog_find_tail( 1327 struct xlog *log, 1328 xfs_daddr_t *head_blk, 1329 xfs_daddr_t *tail_blk) 1330 { 1331 xlog_rec_header_t *rhead; 1332 char *offset = NULL; 1333 xfs_buf_t *bp; 1334 int error; 1335 xfs_daddr_t rhead_blk; 1336 xfs_lsn_t tail_lsn; 1337 bool wrapped = false; 1338 bool clean = false; 1339 1340 /* 1341 * Find previous log record 1342 */ 1343 if ((error = xlog_find_head(log, head_blk))) 1344 return error; 1345 ASSERT(*head_blk < INT_MAX); 1346 1347 bp = xlog_get_bp(log, 1); 1348 if (!bp) 1349 return -ENOMEM; 1350 if (*head_blk == 0) { /* special case */ 1351 error = xlog_bread(log, 0, 1, bp, &offset); 1352 if (error) 1353 goto done; 1354 1355 if (xlog_get_cycle(offset) == 0) { 1356 *tail_blk = 0; 1357 /* leave all other log inited values alone */ 1358 goto done; 1359 } 1360 } 1361 1362 /* 1363 * Search backwards through the log looking for the log record header 1364 * block. This wraps all the way back around to the head so something is 1365 * seriously wrong if we can't find it. 1366 */ 1367 error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, 1368 &rhead_blk, &rhead, &wrapped); 1369 if (error < 0) 1370 return error; 1371 if (!error) { 1372 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); 1373 return -EIO; 1374 } 1375 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); 1376 1377 /* 1378 * Set the log state based on the current head record. 1379 */ 1380 xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped); 1381 tail_lsn = atomic64_read(&log->l_tail_lsn); 1382 1383 /* 1384 * Look for an unmount record at the head of the log. This sets the log 1385 * state to determine whether recovery is necessary. 1386 */ 1387 error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead, 1388 rhead_blk, bp, &clean); 1389 if (error) 1390 goto done; 1391 1392 /* 1393 * Verify the log head if the log is not clean (e.g., we have anything 1394 * but an unmount record at the head). This uses CRC verification to 1395 * detect and trim torn writes. If discovered, CRC failures are 1396 * considered torn writes and the log head is trimmed accordingly. 1397 * 1398 * Note that we can only run CRC verification when the log is dirty 1399 * because there's no guarantee that the log data behind an unmount 1400 * record is compatible with the current architecture. 1401 */ 1402 if (!clean) { 1403 xfs_daddr_t orig_head = *head_blk; 1404 1405 error = xlog_verify_head(log, head_blk, tail_blk, bp, 1406 &rhead_blk, &rhead, &wrapped); 1407 if (error) 1408 goto done; 1409 1410 /* update in-core state again if the head changed */ 1411 if (*head_blk != orig_head) { 1412 xlog_set_state(log, *head_blk, rhead, rhead_blk, 1413 wrapped); 1414 tail_lsn = atomic64_read(&log->l_tail_lsn); 1415 error = xlog_check_unmount_rec(log, head_blk, tail_blk, 1416 rhead, rhead_blk, bp, 1417 &clean); 1418 if (error) 1419 goto done; 1420 } 1421 } 1422 1423 /* 1424 * Note that the unmount was clean. If the unmount was not clean, we 1425 * need to know this to rebuild the superblock counters from the perag 1426 * headers if we have a filesystem using non-persistent counters. 1427 */ 1428 if (clean) 1429 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; 1430 1431 /* 1432 * Make sure that there are no blocks in front of the head 1433 * with the same cycle number as the head. This can happen 1434 * because we allow multiple outstanding log writes concurrently, 1435 * and the later writes might make it out before earlier ones. 
1436 * 1437 * We use the lsn from before modifying it so that we'll never 1438 * overwrite the unmount record after a clean unmount. 1439 * 1440 * Do this only if we are going to recover the filesystem 1441 * 1442 * NOTE: This used to say "if (!readonly)" 1443 * However on Linux, we can & do recover a read-only filesystem. 1444 * We only skip recovery if NORECOVERY is specified on mount, 1445 * in which case we would not be here. 1446 * 1447 * But... if the -device- itself is readonly, just skip this. 1448 * We can't recover this device anyway, so it won't matter. 1449 */ 1450 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) 1451 error = xlog_clear_stale_blocks(log, tail_lsn); 1452 1453 done: 1454 xlog_put_bp(bp); 1455 1456 if (error) 1457 xfs_warn(log->l_mp, "failed to locate log tail"); 1458 return error; 1459 } 1460 1461 /* 1462 * Is the log zeroed at all? 1463 * 1464 * The last binary search should be changed to perform an X block read 1465 * once X becomes small enough. You can then search linearly through 1466 * the X blocks. This will cut down on the number of reads we need to do. 1467 * 1468 * If the log is partially zeroed, this routine will pass back the blkno 1469 * of the first block with cycle number 0. It won't have a complete LR 1470 * preceding it. 1471 * 1472 * Return: 1473 * 0 => the log is completely written to 1474 * 1 => use *blk_no as the first block of the log 1475 * <0 => error has occurred 1476 */ 1477 STATIC int 1478 xlog_find_zeroed( 1479 struct xlog *log, 1480 xfs_daddr_t *blk_no) 1481 { 1482 xfs_buf_t *bp; 1483 char *offset; 1484 uint first_cycle, last_cycle; 1485 xfs_daddr_t new_blk, last_blk, start_blk; 1486 xfs_daddr_t num_scan_bblks; 1487 int error, log_bbnum = log->l_logBBsize; 1488 1489 *blk_no = 0; 1490 1491 /* check totally zeroed log */ 1492 bp = xlog_get_bp(log, 1); 1493 if (!bp) 1494 return -ENOMEM; 1495 error = xlog_bread(log, 0, 1, bp, &offset); 1496 if (error) 1497 goto bp_err; 1498 1499 first_cycle = xlog_get_cycle(offset); 1500 if (first_cycle == 0) { /* completely zeroed log */ 1501 *blk_no = 0; 1502 xlog_put_bp(bp); 1503 return 1; 1504 } 1505 1506 /* check partially zeroed log */ 1507 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); 1508 if (error) 1509 goto bp_err; 1510 1511 last_cycle = xlog_get_cycle(offset); 1512 if (last_cycle != 0) { /* log completely written to */ 1513 xlog_put_bp(bp); 1514 return 0; 1515 } else if (first_cycle != 1) { 1516 /* 1517 * If the cycle of the last block is zero, the cycle of 1518 * the first block must be 1. If it's not, maybe we're 1519 * not looking at a log... Bail out. 1520 */ 1521 xfs_warn(log->l_mp, 1522 "Log inconsistent or not a log (last==0, first!=1)"); 1523 error = -EINVAL; 1524 goto bp_err; 1525 } 1526 1527 /* we have a partially zeroed log */ 1528 last_blk = log_bbnum-1; 1529 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) 1530 goto bp_err; 1531 1532 /* 1533 * Validate the answer. Because there is no way to guarantee that 1534 * the entire log is made up of log records which are the same size, 1535 * we scan over the defined maximum blocks. At this point, the maximum 1536 * is not chosen to mean anything special. XXXmiken 1537 */ 1538 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 1539 ASSERT(num_scan_bblks <= INT_MAX); 1540 1541 if (last_blk < num_scan_bblks) 1542 num_scan_bblks = last_blk; 1543 start_blk = last_blk - num_scan_bblks; 1544 1545 /* 1546 * We search for any instances of cycle number 0 that occur before 1547 * our current estimate of the head. 
What we're trying to detect is 1548 * 1 ... | 0 | 1 | 0... 1549 * ^ binary search ends here 1550 */ 1551 if ((error = xlog_find_verify_cycle(log, start_blk, 1552 (int)num_scan_bblks, 0, &new_blk))) 1553 goto bp_err; 1554 if (new_blk != -1) 1555 last_blk = new_blk; 1556 1557 /* 1558 * Potentially backup over partial log record write. We don't need 1559 * to search the end of the log because we know it is zero. 1560 */ 1561 error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0); 1562 if (error == 1) 1563 error = -EIO; 1564 if (error) 1565 goto bp_err; 1566 1567 *blk_no = last_blk; 1568 bp_err: 1569 xlog_put_bp(bp); 1570 if (error) 1571 return error; 1572 return 1; 1573 } 1574 1575 /* 1576 * These are simple subroutines used by xlog_clear_stale_blocks() below 1577 * to initialize a buffer full of empty log record headers and write 1578 * them into the log. 1579 */ 1580 STATIC void 1581 xlog_add_record( 1582 struct xlog *log, 1583 char *buf, 1584 int cycle, 1585 int block, 1586 int tail_cycle, 1587 int tail_block) 1588 { 1589 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; 1590 1591 memset(buf, 0, BBSIZE); 1592 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1593 recp->h_cycle = cpu_to_be32(cycle); 1594 recp->h_version = cpu_to_be32( 1595 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); 1596 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); 1597 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); 1598 recp->h_fmt = cpu_to_be32(XLOG_FMT); 1599 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); 1600 } 1601 1602 STATIC int 1603 xlog_write_log_records( 1604 struct xlog *log, 1605 int cycle, 1606 int start_block, 1607 int blocks, 1608 int tail_cycle, 1609 int tail_block) 1610 { 1611 char *offset; 1612 xfs_buf_t *bp; 1613 int balign, ealign; 1614 int sectbb = log->l_sectBBsize; 1615 int end_block = start_block + blocks; 1616 int bufblks; 1617 int error = 0; 1618 int i, j = 0; 1619 1620 /* 1621 * Greedily allocate a buffer big enough to handle the full 1622 * range of basic blocks to be written. If that fails, try 1623 * a smaller size. We need to be able to write at least a 1624 * log sector, or we're out of luck. 1625 */ 1626 bufblks = 1 << ffs(blocks); 1627 while (bufblks > log->l_logBBsize) 1628 bufblks >>= 1; 1629 while (!(bp = xlog_get_bp(log, bufblks))) { 1630 bufblks >>= 1; 1631 if (bufblks < sectbb) 1632 return -ENOMEM; 1633 } 1634 1635 /* We may need to do a read at the start to fill in part of 1636 * the buffer in the starting sector not covered by the first 1637 * write below. 1638 */ 1639 balign = round_down(start_block, sectbb); 1640 if (balign != start_block) { 1641 error = xlog_bread_noalign(log, start_block, 1, bp); 1642 if (error) 1643 goto out_put_bp; 1644 1645 j = start_block - balign; 1646 } 1647 1648 for (i = start_block; i < end_block; i += bufblks) { 1649 int bcount, endcount; 1650 1651 bcount = min(bufblks, end_block - start_block); 1652 endcount = bcount - j; 1653 1654 /* We may need to do a read at the end to fill in part of 1655 * the buffer in the final sector not covered by the write. 1656 * If this is the same sector as the above read, skip it. 
1657 */ 1658 ealign = round_down(end_block, sectbb); 1659 if (j == 0 && (start_block + endcount > ealign)) { 1660 offset = bp->b_addr + BBTOB(ealign - start_block); 1661 error = xlog_bread_offset(log, ealign, sectbb, 1662 bp, offset); 1663 if (error) 1664 break; 1665 1666 } 1667 1668 offset = xlog_align(log, start_block, endcount, bp); 1669 for (; j < endcount; j++) { 1670 xlog_add_record(log, offset, cycle, i+j, 1671 tail_cycle, tail_block); 1672 offset += BBSIZE; 1673 } 1674 error = xlog_bwrite(log, start_block, endcount, bp); 1675 if (error) 1676 break; 1677 start_block += endcount; 1678 j = 0; 1679 } 1680 1681 out_put_bp: 1682 xlog_put_bp(bp); 1683 return error; 1684 } 1685 1686 /* 1687 * This routine is called to blow away any incomplete log writes out 1688 * in front of the log head. We do this so that we won't become confused 1689 * if we come up, write only a little bit more, and then crash again. 1690 * If we leave the partial log records out there, this situation could 1691 * cause us to think those partial writes are valid blocks since they 1692 * have the current cycle number. We get rid of them by overwriting them 1693 * with empty log records with the old cycle number rather than the 1694 * current one. 1695 * 1696 * The tail lsn is passed in rather than taken from 1697 * the log so that we will not write over the unmount record after a 1698 * clean unmount in a 512 block log. Doing so would leave the log without 1699 * any valid log records in it until a new one was written. If we crashed 1700 * during that time we would not be able to recover. 1701 */ 1702 STATIC int 1703 xlog_clear_stale_blocks( 1704 struct xlog *log, 1705 xfs_lsn_t tail_lsn) 1706 { 1707 int tail_cycle, head_cycle; 1708 int tail_block, head_block; 1709 int tail_distance, max_distance; 1710 int distance; 1711 int error; 1712 1713 tail_cycle = CYCLE_LSN(tail_lsn); 1714 tail_block = BLOCK_LSN(tail_lsn); 1715 head_cycle = log->l_curr_cycle; 1716 head_block = log->l_curr_block; 1717 1718 /* 1719 * Figure out the distance between the new head of the log 1720 * and the tail. We want to write over any blocks beyond the 1721 * head that we may have written just before the crash, but 1722 * we don't want to overwrite the tail of the log. 1723 */ 1724 if (head_cycle == tail_cycle) { 1725 /* 1726 * The tail is behind the head in the physical log, 1727 * so the distance from the head to the tail is the 1728 * distance from the head to the end of the log plus 1729 * the distance from the beginning of the log to the 1730 * tail. 1731 */ 1732 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { 1733 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", 1734 XFS_ERRLEVEL_LOW, log->l_mp); 1735 return -EFSCORRUPTED; 1736 } 1737 tail_distance = tail_block + (log->l_logBBsize - head_block); 1738 } else { 1739 /* 1740 * The head is behind the tail in the physical log, 1741 * so the distance from the head to the tail is just 1742 * the tail block minus the head block. 1743 */ 1744 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ 1745 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", 1746 XFS_ERRLEVEL_LOW, log->l_mp); 1747 return -EFSCORRUPTED; 1748 } 1749 tail_distance = tail_block - head_block; 1750 } 1751 1752 /* 1753 * If the head is right up against the tail, we can't clear 1754 * anything. 
1755 */ 1756 if (tail_distance <= 0) { 1757 ASSERT(tail_distance == 0); 1758 return 0; 1759 } 1760 1761 max_distance = XLOG_TOTAL_REC_SHIFT(log); 1762 /* 1763 * Take the smaller of the maximum amount of outstanding I/O 1764 * we could have and the distance to the tail to clear out. 1765 * We take the smaller so that we don't overwrite the tail and 1766 * we don't waste all day writing from the head to the tail 1767 * for no reason. 1768 */ 1769 max_distance = MIN(max_distance, tail_distance); 1770 1771 if ((head_block + max_distance) <= log->l_logBBsize) { 1772 /* 1773 * We can stomp all the blocks we need to without 1774 * wrapping around the end of the log. Just do it 1775 * in a single write. Use the cycle number of the 1776 * current cycle minus one so that the log will look like: 1777 * n ... | n - 1 ... 1778 */ 1779 error = xlog_write_log_records(log, (head_cycle - 1), 1780 head_block, max_distance, tail_cycle, 1781 tail_block); 1782 if (error) 1783 return error; 1784 } else { 1785 /* 1786 * We need to wrap around the end of the physical log in 1787 * order to clear all the blocks. Do it in two separate 1788 * I/Os. The first write should be from the head to the 1789 * end of the physical log, and it should use the current 1790 * cycle number minus one just like above. 1791 */ 1792 distance = log->l_logBBsize - head_block; 1793 error = xlog_write_log_records(log, (head_cycle - 1), 1794 head_block, distance, tail_cycle, 1795 tail_block); 1796 1797 if (error) 1798 return error; 1799 1800 /* 1801 * Now write the blocks at the start of the physical log. 1802 * This writes the remainder of the blocks we want to clear. 1803 * It uses the current cycle number since we're now on the 1804 * same cycle as the head so that we get: 1805 * n ... n ... | n - 1 ... 1806 * ^^^^^ blocks we're writing 1807 */ 1808 distance = max_distance - (log->l_logBBsize - head_block); 1809 error = xlog_write_log_records(log, head_cycle, 0, distance, 1810 tail_cycle, tail_block); 1811 if (error) 1812 return error; 1813 } 1814 1815 return 0; 1816 } 1817 1818 /****************************************************************************** 1819 * 1820 * Log recover routines 1821 * 1822 ****************************************************************************** 1823 */ 1824 1825 /* 1826 * Sort the log items in the transaction. 1827 * 1828 * The ordering constraints are defined by the inode allocation and unlink 1829 * behaviour. The rules are: 1830 * 1831 * 1. Every item is only logged once in a given transaction. Hence it 1832 * represents the last logged state of the item. Hence ordering is 1833 * dependent on the order in which operations need to be performed so 1834 * required initial conditions are always met. 1835 * 1836 * 2. Cancelled buffers are recorded in pass 1 in a separate table and 1837 * there's nothing to replay from them so we can simply cull them 1838 * from the transaction. However, we can't do that until after we've 1839 * replayed all the other items because they may be dependent on the 1840 * cancelled buffer and replaying the cancelled buffer can remove it 1841 * form the cancelled buffer table. Hence they have tobe done last. 1842 * 1843 * 3. Inode allocation buffers must be replayed before inode items that 1844 * read the buffer and replay changes into it. For filesystems using the 1845 * ICREATE transactions, this means XFS_LI_ICREATE objects need to get 1846 * treated the same as inode allocation buffers as they create and 1847 * initialise the buffers directly. 1848 * 1849 * 4. 
Inode unlink buffers must be replayed after inode items are replayed. 1850 * This ensures that inodes are completely flushed to the inode buffer 1851 * in a "free" state before we remove the unlinked inode list pointer. 1852 * 1853 * Hence the ordering needs to be inode allocation buffers first, inode items 1854 * second, inode unlink buffers third and cancelled buffers last. 1855 * 1856 * But there's a problem with that - we can't tell an inode allocation buffer 1857 * apart from a regular buffer, so we can't separate them. We can, however, 1858 * tell an inode unlink buffer from the others, and so we can separate them out 1859 * from all the other buffers and move them to last. 1860 * 1861 * Hence, 4 lists, in order from head to tail: 1862 * - buffer_list for all buffers except cancelled/inode unlink buffers 1863 * - item_list for all non-buffer items 1864 * - inode_buffer_list for inode unlink buffers 1865 * - cancel_list for the cancelled buffers 1866 * 1867 * Note that we add objects to the tail of the lists so that first-to-last 1868 * ordering is preserved within the lists. Adding objects to the head of the 1869 * list means when we traverse from the head we walk them in last-to-first 1870 * order. For cancelled buffers and inode unlink buffers this doesn't matter, 1871 * but for all other items there may be specific ordering that we need to 1872 * preserve. 1873 */ 1874 STATIC int 1875 xlog_recover_reorder_trans( 1876 struct xlog *log, 1877 struct xlog_recover *trans, 1878 int pass) 1879 { 1880 xlog_recover_item_t *item, *n; 1881 int error = 0; 1882 LIST_HEAD(sort_list); 1883 LIST_HEAD(cancel_list); 1884 LIST_HEAD(buffer_list); 1885 LIST_HEAD(inode_buffer_list); 1886 LIST_HEAD(inode_list); 1887 1888 list_splice_init(&trans->r_itemq, &sort_list); 1889 list_for_each_entry_safe(item, n, &sort_list, ri_list) { 1890 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1891 1892 switch (ITEM_TYPE(item)) { 1893 case XFS_LI_ICREATE: 1894 list_move_tail(&item->ri_list, &buffer_list); 1895 break; 1896 case XFS_LI_BUF: 1897 if (buf_f->blf_flags & XFS_BLF_CANCEL) { 1898 trace_xfs_log_recover_item_reorder_head(log, 1899 trans, item, pass); 1900 list_move(&item->ri_list, &cancel_list); 1901 break; 1902 } 1903 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 1904 list_move(&item->ri_list, &inode_buffer_list); 1905 break; 1906 } 1907 list_move_tail(&item->ri_list, &buffer_list); 1908 break; 1909 case XFS_LI_INODE: 1910 case XFS_LI_DQUOT: 1911 case XFS_LI_QUOTAOFF: 1912 case XFS_LI_EFD: 1913 case XFS_LI_EFI: 1914 trace_xfs_log_recover_item_reorder_tail(log, 1915 trans, item, pass); 1916 list_move_tail(&item->ri_list, &inode_list); 1917 break; 1918 default: 1919 xfs_warn(log->l_mp, 1920 "%s: unrecognized type of log operation", 1921 __func__); 1922 ASSERT(0); 1923 /* 1924 * return the remaining items back to the transaction 1925 * item list so they can be freed in caller. 
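 * Any items already moved onto the per-type lists are spliced back onto the transaction item list at the out: label below, so nothing is lost on this error path.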
1926 */ 1927 if (!list_empty(&sort_list)) 1928 list_splice_init(&sort_list, &trans->r_itemq); 1929 error = -EIO; 1930 goto out; 1931 } 1932 } 1933 out: 1934 ASSERT(list_empty(&sort_list)); 1935 if (!list_empty(&buffer_list)) 1936 list_splice(&buffer_list, &trans->r_itemq); 1937 if (!list_empty(&inode_list)) 1938 list_splice_tail(&inode_list, &trans->r_itemq); 1939 if (!list_empty(&inode_buffer_list)) 1940 list_splice_tail(&inode_buffer_list, &trans->r_itemq); 1941 if (!list_empty(&cancel_list)) 1942 list_splice_tail(&cancel_list, &trans->r_itemq); 1943 return error; 1944 } 1945 1946 /* 1947 * Build up the table of buf cancel records so that we don't replay 1948 * cancelled data in the second pass. For buffer records that are 1949 * not cancel records, there is nothing to do here so we just return. 1950 * 1951 * If we get a cancel record which is already in the table, this indicates 1952 * that the buffer was cancelled multiple times. In order to ensure 1953 * that during pass 2 we keep the record in the table until we reach its 1954 * last occurrence in the log, we keep a reference count in the cancel 1955 * record in the table to tell us how many times we expect to see this 1956 * record during the second pass. 1957 */ 1958 STATIC int 1959 xlog_recover_buffer_pass1( 1960 struct xlog *log, 1961 struct xlog_recover_item *item) 1962 { 1963 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1964 struct list_head *bucket; 1965 struct xfs_buf_cancel *bcp; 1966 1967 /* 1968 * If this isn't a cancel buffer item, then just return. 1969 */ 1970 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { 1971 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1972 return 0; 1973 } 1974 1975 /* 1976 * Insert an xfs_buf_cancel record into the hash table of them. 1977 * If there is already an identical record, bump its reference count. 1978 */ 1979 bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); 1980 list_for_each_entry(bcp, bucket, bc_list) { 1981 if (bcp->bc_blkno == buf_f->blf_blkno && 1982 bcp->bc_len == buf_f->blf_len) { 1983 bcp->bc_refcount++; 1984 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); 1985 return 0; 1986 } 1987 } 1988 1989 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); 1990 bcp->bc_blkno = buf_f->blf_blkno; 1991 bcp->bc_len = buf_f->blf_len; 1992 bcp->bc_refcount = 1; 1993 list_add_tail(&bcp->bc_list, bucket); 1994 1995 trace_xfs_log_recover_buf_cancel_add(log, buf_f); 1996 return 0; 1997 } 1998 1999 /* 2000 * Check to see whether the buffer being recovered has a corresponding 2001 * entry in the buffer cancel record table. If it is, return the cancel 2002 * buffer structure to the caller. 2003 */ 2004 STATIC struct xfs_buf_cancel * 2005 xlog_peek_buffer_cancelled( 2006 struct xlog *log, 2007 xfs_daddr_t blkno, 2008 uint len, 2009 ushort flags) 2010 { 2011 struct list_head *bucket; 2012 struct xfs_buf_cancel *bcp; 2013 2014 if (!log->l_buf_cancel_table) { 2015 /* empty table means no cancelled buffers in the log */ 2016 ASSERT(!(flags & XFS_BLF_CANCEL)); 2017 return NULL; 2018 } 2019 2020 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); 2021 list_for_each_entry(bcp, bucket, bc_list) { 2022 if (bcp->bc_blkno == blkno && bcp->bc_len == len) 2023 return bcp; 2024 } 2025 2026 /* 2027 * We didn't find a corresponding entry in the table, so return 0 so 2028 * that the buffer is NOT cancelled. 
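 * (Returning NULL here tells the caller to replay the buffer normally; the assert below catches a cancel item that never made it into the table during pass 1.)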
2029 */ 2030 ASSERT(!(flags & XFS_BLF_CANCEL)); 2031 return NULL; 2032 } 2033 2034 /* 2035 * If the buffer is being cancelled then return 1 so that it will be cancelled, 2036 * otherwise return 0. If the buffer is actually a buffer cancel item 2037 * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the 2038 * table and remove it from the table if this is the last reference. 2039 * 2040 * We remove the cancel record from the table when we encounter its last 2041 * occurrence in the log so that if the same buffer is re-used again after its 2042 * last cancellation we actually replay the changes made at that point. 2043 */ 2044 STATIC int 2045 xlog_check_buffer_cancelled( 2046 struct xlog *log, 2047 xfs_daddr_t blkno, 2048 uint len, 2049 ushort flags) 2050 { 2051 struct xfs_buf_cancel *bcp; 2052 2053 bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); 2054 if (!bcp) 2055 return 0; 2056 2057 /* 2058 * We've got a match, so return 1 so that the recovery of this buffer 2059 * is cancelled. If this buffer is actually a buffer cancel log 2060 * item, then decrement the refcount on the one in the table and 2061 * remove it if this is the last reference. 2062 */ 2063 if (flags & XFS_BLF_CANCEL) { 2064 if (--bcp->bc_refcount == 0) { 2065 list_del(&bcp->bc_list); 2066 kmem_free(bcp); 2067 } 2068 } 2069 return 1; 2070 } 2071 2072 /* 2073 * Perform recovery for a buffer full of inodes. In these buffers, the only 2074 * data which should be recovered is that which corresponds to the 2075 * di_next_unlinked pointers in the on disk inode structures. The rest of the 2076 * data for the inodes is always logged through the inodes themselves rather 2077 * than the inode buffer and is recovered in xlog_recover_inode_pass2(). 2078 * 2079 * The only time when buffers full of inodes are fully recovered is when the 2080 * buffer is full of newly allocated inodes. In this case the buffer will 2081 * not be marked as an inode buffer and so will be sent to 2082 * xlog_recover_do_reg_buffer() below during recovery. 2083 */ 2084 STATIC int 2085 xlog_recover_do_inode_buffer( 2086 struct xfs_mount *mp, 2087 xlog_recover_item_t *item, 2088 struct xfs_buf *bp, 2089 xfs_buf_log_format_t *buf_f) 2090 { 2091 int i; 2092 int item_index = 0; 2093 int bit = 0; 2094 int nbits = 0; 2095 int reg_buf_offset = 0; 2096 int reg_buf_bytes = 0; 2097 int next_unlinked_offset; 2098 int inodes_per_buf; 2099 xfs_agino_t *logged_nextp; 2100 xfs_agino_t *buffer_nextp; 2101 2102 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 2103 2104 /* 2105 * Post recovery validation only works properly on CRC enabled 2106 * filesystems. 2107 */ 2108 if (xfs_sb_version_hascrc(&mp->m_sb)) 2109 bp->b_ops = &xfs_inode_buf_ops; 2110 2111 inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; 2112 for (i = 0; i < inodes_per_buf; i++) { 2113 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 2114 offsetof(xfs_dinode_t, di_next_unlinked); 2115 2116 while (next_unlinked_offset >= 2117 (reg_buf_offset + reg_buf_bytes)) { 2118 /* 2119 * The next di_next_unlinked field is beyond 2120 * the current logged region. Find the next 2121 * logged region that contains or is beyond 2122 * the current di_next_unlinked field. 2123 */ 2124 bit += nbits; 2125 bit = xfs_next_bit(buf_f->blf_data_map, 2126 buf_f->blf_map_size, bit); 2127 2128 /* 2129 * If there are no more logged regions in the 2130 * buffer, then we're done.
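 * Each bit in blf_data_map covers one XFS_BLF_CHUNK sized region of the buffer, which is why bit << XFS_BLF_SHIFT below converts a bit number into a byte offset.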
2131 */ 2132 if (bit == -1) 2133 return 0; 2134 2135 nbits = xfs_contig_bits(buf_f->blf_data_map, 2136 buf_f->blf_map_size, bit); 2137 ASSERT(nbits > 0); 2138 reg_buf_offset = bit << XFS_BLF_SHIFT; 2139 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 2140 item_index++; 2141 } 2142 2143 /* 2144 * If the current logged region starts after the current 2145 * di_next_unlinked field, then move on to the next 2146 * di_next_unlinked field. 2147 */ 2148 if (next_unlinked_offset < reg_buf_offset) 2149 continue; 2150 2151 ASSERT(item->ri_buf[item_index].i_addr != NULL); 2152 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 2153 ASSERT((reg_buf_offset + reg_buf_bytes) <= 2154 BBTOB(bp->b_io_length)); 2155 2156 /* 2157 * The current logged region contains a copy of the 2158 * current di_next_unlinked field. Extract its value 2159 * and copy it to the buffer copy. 2160 */ 2161 logged_nextp = item->ri_buf[item_index].i_addr + 2162 next_unlinked_offset - reg_buf_offset; 2163 if (unlikely(*logged_nextp == 0)) { 2164 xfs_alert(mp, 2165 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " 2166 "Trying to replay bad (0) inode di_next_unlinked field.", 2167 item, bp); 2168 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 2169 XFS_ERRLEVEL_LOW, mp); 2170 return -EFSCORRUPTED; 2171 } 2172 2173 buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); 2174 *buffer_nextp = *logged_nextp; 2175 2176 /* 2177 * If necessary, recalculate the CRC in the on-disk inode. We 2178 * have to leave the inode in a consistent state for whoever 2179 * reads it next.... 2180 */ 2181 xfs_dinode_calc_crc(mp, 2182 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); 2183 2184 } 2185 2186 return 0; 2187 } 2188 2189 /* 2190 * V5 filesystems know the age of the buffer on disk being recovered. We can 2191 * have newer objects on disk than we are replaying, and so for these cases we 2192 * don't want to replay the current change as that will make the buffer contents 2193 * temporarily invalid on disk. 2194 * 2195 * The magic number might not match the buffer type we are going to recover 2196 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence 2197 * extract the LSN of the existing object in the buffer based on its current 2198 * magic number. If we don't recognise the magic number in the buffer, then 2199 * return an LSN of -1 so that the caller knows it was an unrecognised block and 2200 * so can recover the buffer. 2201 * 2202 * Note: we cannot rely solely on magic number matches to determine that the 2203 * buffer has a valid LSN - we also need to verify that it belongs to this 2204 * filesystem, so we also extract the object's UUID and compare it to the one 2205 * we read from the superblock. If the UUIDs don't match, then we've got a 2206 * stale metadata block from an old filesystem instance that we need to recover 2207 * over the top of.
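 * In short: a recognised magic number with a matching UUID yields the object's LSN for the caller to compare against the transaction being replayed; anything else returns -1 and the buffer is recovered unconditionally.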
2208 */ 2209 static xfs_lsn_t 2210 xlog_recover_get_buf_lsn( 2211 struct xfs_mount *mp, 2212 struct xfs_buf *bp) 2213 { 2214 __uint32_t magic32; 2215 __uint16_t magic16; 2216 __uint16_t magicda; 2217 void *blk = bp->b_addr; 2218 uuid_t *uuid; 2219 xfs_lsn_t lsn = -1; 2220 2221 /* v4 filesystems always recover immediately */ 2222 if (!xfs_sb_version_hascrc(&mp->m_sb)) 2223 goto recover_immediately; 2224 2225 magic32 = be32_to_cpu(*(__be32 *)blk); 2226 switch (magic32) { 2227 case XFS_ABTB_CRC_MAGIC: 2228 case XFS_ABTC_CRC_MAGIC: 2229 case XFS_ABTB_MAGIC: 2230 case XFS_ABTC_MAGIC: 2231 case XFS_IBT_CRC_MAGIC: 2232 case XFS_IBT_MAGIC: { 2233 struct xfs_btree_block *btb = blk; 2234 2235 lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); 2236 uuid = &btb->bb_u.s.bb_uuid; 2237 break; 2238 } 2239 case XFS_BMAP_CRC_MAGIC: 2240 case XFS_BMAP_MAGIC: { 2241 struct xfs_btree_block *btb = blk; 2242 2243 lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); 2244 uuid = &btb->bb_u.l.bb_uuid; 2245 break; 2246 } 2247 case XFS_AGF_MAGIC: 2248 lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); 2249 uuid = &((struct xfs_agf *)blk)->agf_uuid; 2250 break; 2251 case XFS_AGFL_MAGIC: 2252 lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); 2253 uuid = &((struct xfs_agfl *)blk)->agfl_uuid; 2254 break; 2255 case XFS_AGI_MAGIC: 2256 lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); 2257 uuid = &((struct xfs_agi *)blk)->agi_uuid; 2258 break; 2259 case XFS_SYMLINK_MAGIC: 2260 lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); 2261 uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; 2262 break; 2263 case XFS_DIR3_BLOCK_MAGIC: 2264 case XFS_DIR3_DATA_MAGIC: 2265 case XFS_DIR3_FREE_MAGIC: 2266 lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); 2267 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; 2268 break; 2269 case XFS_ATTR3_RMT_MAGIC: 2270 /* 2271 * Remote attr blocks are written synchronously, rather than 2272 * being logged. That means they do not contain a valid LSN 2273 * (i.e. transactionally ordered) in them, and hence any time we 2274 * see a buffer to replay over the top of a remote attribute 2275 * block we should simply do so. 2276 */ 2277 goto recover_immediately; 2278 case XFS_SB_MAGIC: 2279 /* 2280 * superblock uuids are magic. We may or may not have a 2281 * sb_meta_uuid on disk, but it will be set in the in-core 2282 * superblock. We set the uuid pointer for verification 2283 * according to the superblock feature mask to ensure we check 2284 * the relevant UUID in the superblock. 2285 */ 2286 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); 2287 if (xfs_sb_version_hasmetauuid(&mp->m_sb)) 2288 uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; 2289 else 2290 uuid = &((struct xfs_dsb *)blk)->sb_uuid; 2291 break; 2292 default: 2293 break; 2294 } 2295 2296 if (lsn != (xfs_lsn_t)-1) { 2297 if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) 2298 goto recover_immediately; 2299 return lsn; 2300 } 2301 2302 magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); 2303 switch (magicda) { 2304 case XFS_DIR3_LEAF1_MAGIC: 2305 case XFS_DIR3_LEAFN_MAGIC: 2306 case XFS_DA3_NODE_MAGIC: 2307 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); 2308 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; 2309 break; 2310 default: 2311 break; 2312 } 2313 2314 if (lsn != (xfs_lsn_t)-1) { 2315 if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) 2316 goto recover_immediately; 2317 return lsn; 2318 } 2319 2320 /* 2321 * We do individual object checks on dquot and inode buffers as they 2322 * have their own individual LSN records. 
Also, we could have a stale 2323 * buffer here, so we have to at least recognise these buffer types. 2324 * 2325 * A noted complexity here is inode unlinked list processing - it logs 2326 * the inode directly in the buffer, but we don't know which inodes have 2327 * been modified, and there is no global buffer LSN. Hence we need to 2328 * recover all inode buffer types immediately. This problem will be 2329 * fixed by logical logging of the unlinked list modifications. 2330 */ 2331 magic16 = be16_to_cpu(*(__be16 *)blk); 2332 switch (magic16) { 2333 case XFS_DQUOT_MAGIC: 2334 case XFS_DINODE_MAGIC: 2335 goto recover_immediately; 2336 default: 2337 break; 2338 } 2339 2340 /* unknown buffer contents, recover immediately */ 2341 2342 recover_immediately: 2343 return (xfs_lsn_t)-1; 2344 2345 } 2346 2347 /* 2348 * Validate the recovered buffer is of the correct type and attach the 2349 * appropriate buffer operations to it for writeback. Magic numbers are in a 2350 * few places: 2351 * the first 16 bits of the buffer (inode buffer, dquot buffer), 2352 * the first 32 bits of the buffer (most blocks), 2353 * inside a struct xfs_da_blkinfo at the start of the buffer. 2354 */ 2355 static void 2356 xlog_recover_validate_buf_type( 2357 struct xfs_mount *mp, 2358 struct xfs_buf *bp, 2359 xfs_buf_log_format_t *buf_f) 2360 { 2361 struct xfs_da_blkinfo *info = bp->b_addr; 2362 __uint32_t magic32; 2363 __uint16_t magic16; 2364 __uint16_t magicda; 2365 2366 /* 2367 * We can only do post recovery validation on items on CRC enabled 2368 * filesystems as we need to know when the buffer was written to be able 2369 * to determine if we should have replayed the item. If we replay old 2370 * metadata over a newer buffer, then it will enter a temporarily 2371 * inconsistent state resulting in verification failures.
Hence for now 2372 * just avoid the verification stage for non-crc filesystems 2373 */ 2374 if (!xfs_sb_version_hascrc(&mp->m_sb)) 2375 return; 2376 2377 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); 2378 magic16 = be16_to_cpu(*(__be16*)bp->b_addr); 2379 magicda = be16_to_cpu(info->magic); 2380 switch (xfs_blft_from_flags(buf_f)) { 2381 case XFS_BLFT_BTREE_BUF: 2382 switch (magic32) { 2383 case XFS_ABTB_CRC_MAGIC: 2384 case XFS_ABTC_CRC_MAGIC: 2385 case XFS_ABTB_MAGIC: 2386 case XFS_ABTC_MAGIC: 2387 bp->b_ops = &xfs_allocbt_buf_ops; 2388 break; 2389 case XFS_IBT_CRC_MAGIC: 2390 case XFS_FIBT_CRC_MAGIC: 2391 case XFS_IBT_MAGIC: 2392 case XFS_FIBT_MAGIC: 2393 bp->b_ops = &xfs_inobt_buf_ops; 2394 break; 2395 case XFS_BMAP_CRC_MAGIC: 2396 case XFS_BMAP_MAGIC: 2397 bp->b_ops = &xfs_bmbt_buf_ops; 2398 break; 2399 default: 2400 xfs_warn(mp, "Bad btree block magic!"); 2401 ASSERT(0); 2402 break; 2403 } 2404 break; 2405 case XFS_BLFT_AGF_BUF: 2406 if (magic32 != XFS_AGF_MAGIC) { 2407 xfs_warn(mp, "Bad AGF block magic!"); 2408 ASSERT(0); 2409 break; 2410 } 2411 bp->b_ops = &xfs_agf_buf_ops; 2412 break; 2413 case XFS_BLFT_AGFL_BUF: 2414 if (magic32 != XFS_AGFL_MAGIC) { 2415 xfs_warn(mp, "Bad AGFL block magic!"); 2416 ASSERT(0); 2417 break; 2418 } 2419 bp->b_ops = &xfs_agfl_buf_ops; 2420 break; 2421 case XFS_BLFT_AGI_BUF: 2422 if (magic32 != XFS_AGI_MAGIC) { 2423 xfs_warn(mp, "Bad AGI block magic!"); 2424 ASSERT(0); 2425 break; 2426 } 2427 bp->b_ops = &xfs_agi_buf_ops; 2428 break; 2429 case XFS_BLFT_UDQUOT_BUF: 2430 case XFS_BLFT_PDQUOT_BUF: 2431 case XFS_BLFT_GDQUOT_BUF: 2432 #ifdef CONFIG_XFS_QUOTA 2433 if (magic16 != XFS_DQUOT_MAGIC) { 2434 xfs_warn(mp, "Bad DQUOT block magic!"); 2435 ASSERT(0); 2436 break; 2437 } 2438 bp->b_ops = &xfs_dquot_buf_ops; 2439 #else 2440 xfs_alert(mp, 2441 "Trying to recover dquots without QUOTA support built in!"); 2442 ASSERT(0); 2443 #endif 2444 break; 2445 case XFS_BLFT_DINO_BUF: 2446 if (magic16 != XFS_DINODE_MAGIC) { 2447 xfs_warn(mp, "Bad INODE block magic!"); 2448 ASSERT(0); 2449 break; 2450 } 2451 bp->b_ops = &xfs_inode_buf_ops; 2452 break; 2453 case XFS_BLFT_SYMLINK_BUF: 2454 if (magic32 != XFS_SYMLINK_MAGIC) { 2455 xfs_warn(mp, "Bad symlink block magic!"); 2456 ASSERT(0); 2457 break; 2458 } 2459 bp->b_ops = &xfs_symlink_buf_ops; 2460 break; 2461 case XFS_BLFT_DIR_BLOCK_BUF: 2462 if (magic32 != XFS_DIR2_BLOCK_MAGIC && 2463 magic32 != XFS_DIR3_BLOCK_MAGIC) { 2464 xfs_warn(mp, "Bad dir block magic!"); 2465 ASSERT(0); 2466 break; 2467 } 2468 bp->b_ops = &xfs_dir3_block_buf_ops; 2469 break; 2470 case XFS_BLFT_DIR_DATA_BUF: 2471 if (magic32 != XFS_DIR2_DATA_MAGIC && 2472 magic32 != XFS_DIR3_DATA_MAGIC) { 2473 xfs_warn(mp, "Bad dir data magic!"); 2474 ASSERT(0); 2475 break; 2476 } 2477 bp->b_ops = &xfs_dir3_data_buf_ops; 2478 break; 2479 case XFS_BLFT_DIR_FREE_BUF: 2480 if (magic32 != XFS_DIR2_FREE_MAGIC && 2481 magic32 != XFS_DIR3_FREE_MAGIC) { 2482 xfs_warn(mp, "Bad dir3 free magic!"); 2483 ASSERT(0); 2484 break; 2485 } 2486 bp->b_ops = &xfs_dir3_free_buf_ops; 2487 break; 2488 case XFS_BLFT_DIR_LEAF1_BUF: 2489 if (magicda != XFS_DIR2_LEAF1_MAGIC && 2490 magicda != XFS_DIR3_LEAF1_MAGIC) { 2491 xfs_warn(mp, "Bad dir leaf1 magic!"); 2492 ASSERT(0); 2493 break; 2494 } 2495 bp->b_ops = &xfs_dir3_leaf1_buf_ops; 2496 break; 2497 case XFS_BLFT_DIR_LEAFN_BUF: 2498 if (magicda != XFS_DIR2_LEAFN_MAGIC && 2499 magicda != XFS_DIR3_LEAFN_MAGIC) { 2500 xfs_warn(mp, "Bad dir leafn magic!"); 2501 ASSERT(0); 2502 break; 2503 } 2504 bp->b_ops = &xfs_dir3_leafn_buf_ops; 2505 break; 
2506 case XFS_BLFT_DA_NODE_BUF: 2507 if (magicda != XFS_DA_NODE_MAGIC && 2508 magicda != XFS_DA3_NODE_MAGIC) { 2509 xfs_warn(mp, "Bad da node magic!"); 2510 ASSERT(0); 2511 break; 2512 } 2513 bp->b_ops = &xfs_da3_node_buf_ops; 2514 break; 2515 case XFS_BLFT_ATTR_LEAF_BUF: 2516 if (magicda != XFS_ATTR_LEAF_MAGIC && 2517 magicda != XFS_ATTR3_LEAF_MAGIC) { 2518 xfs_warn(mp, "Bad attr leaf magic!"); 2519 ASSERT(0); 2520 break; 2521 } 2522 bp->b_ops = &xfs_attr3_leaf_buf_ops; 2523 break; 2524 case XFS_BLFT_ATTR_RMT_BUF: 2525 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 2526 xfs_warn(mp, "Bad attr remote magic!"); 2527 ASSERT(0); 2528 break; 2529 } 2530 bp->b_ops = &xfs_attr3_rmt_buf_ops; 2531 break; 2532 case XFS_BLFT_SB_BUF: 2533 if (magic32 != XFS_SB_MAGIC) { 2534 xfs_warn(mp, "Bad SB block magic!"); 2535 ASSERT(0); 2536 break; 2537 } 2538 bp->b_ops = &xfs_sb_buf_ops; 2539 break; 2540 #ifdef CONFIG_XFS_RT 2541 case XFS_BLFT_RTBITMAP_BUF: 2542 case XFS_BLFT_RTSUMMARY_BUF: 2543 /* no magic numbers for verification of RT buffers */ 2544 bp->b_ops = &xfs_rtbuf_ops; 2545 break; 2546 #endif /* CONFIG_XFS_RT */ 2547 default: 2548 xfs_warn(mp, "Unknown buffer type %d!", 2549 xfs_blft_from_flags(buf_f)); 2550 break; 2551 } 2552 } 2553 2554 /* 2555 * Perform a 'normal' buffer recovery. Each logged region of the 2556 * buffer should be copied over the corresponding region in the 2557 * given buffer. The bitmap in the buf log format structure indicates 2558 * where to place the logged data. 2559 */ 2560 STATIC void 2561 xlog_recover_do_reg_buffer( 2562 struct xfs_mount *mp, 2563 xlog_recover_item_t *item, 2564 struct xfs_buf *bp, 2565 xfs_buf_log_format_t *buf_f) 2566 { 2567 int i; 2568 int bit; 2569 int nbits; 2570 int error; 2571 2572 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 2573 2574 bit = 0; 2575 i = 1; /* 0 is the buf format structure */ 2576 while (1) { 2577 bit = xfs_next_bit(buf_f->blf_data_map, 2578 buf_f->blf_map_size, bit); 2579 if (bit == -1) 2580 break; 2581 nbits = xfs_contig_bits(buf_f->blf_data_map, 2582 buf_f->blf_map_size, bit); 2583 ASSERT(nbits > 0); 2584 ASSERT(item->ri_buf[i].i_addr != NULL); 2585 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 2586 ASSERT(BBTOB(bp->b_io_length) >= 2587 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); 2588 2589 /* 2590 * The dirty regions logged in the buffer, even though 2591 * contiguous, may span multiple chunks. This is because the 2592 * dirty region may span a physical page boundary in a buffer 2593 * and hence be split into two separate vectors for writing into 2594 * the log. Hence we need to trim nbits back to the length of 2595 * the current region being copied out of the log. 2596 */ 2597 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) 2598 nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; 2599 2600 /* 2601 * Do a sanity check if this is a dquot buffer. Just checking 2602 * the first dquot in the buffer should do. XXXThis is 2603 * probably a good thing to do for other buf types also. 
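 * Note that the check below runs against the copy of the dquot held in the log item, before it is copied over the corresponding region of the on-disk buffer.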
2604 */ 2605 error = 0; 2606 if (buf_f->blf_flags & 2607 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2608 if (item->ri_buf[i].i_addr == NULL) { 2609 xfs_alert(mp, 2610 "XFS: NULL dquot in %s.", __func__); 2611 goto next; 2612 } 2613 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 2614 xfs_alert(mp, 2615 "XFS: dquot too small (%d) in %s.", 2616 item->ri_buf[i].i_len, __func__); 2617 goto next; 2618 } 2619 error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, 2620 -1, 0, XFS_QMOPT_DOWARN, 2621 "dquot_buf_recover"); 2622 if (error) 2623 goto next; 2624 } 2625 2626 memcpy(xfs_buf_offset(bp, 2627 (uint)bit << XFS_BLF_SHIFT), /* dest */ 2628 item->ri_buf[i].i_addr, /* source */ 2629 nbits<<XFS_BLF_SHIFT); /* length */ 2630 next: 2631 i++; 2632 bit += nbits; 2633 } 2634 2635 /* Shouldn't be any more regions */ 2636 ASSERT(i == item->ri_total); 2637 2638 xlog_recover_validate_buf_type(mp, bp, buf_f); 2639 } 2640 2641 /* 2642 * Perform a dquot buffer recovery. 2643 * Simple algorithm: if we have found a QUOTAOFF log item of the same type 2644 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2645 * Else, treat it as a regular buffer and do recovery. 2646 * 2647 * Return false if the buffer was tossed and true if we recovered the buffer to 2648 * indicate to the caller if the buffer needs writing. 2649 */ 2650 STATIC bool 2651 xlog_recover_do_dquot_buffer( 2652 struct xfs_mount *mp, 2653 struct xlog *log, 2654 struct xlog_recover_item *item, 2655 struct xfs_buf *bp, 2656 struct xfs_buf_log_format *buf_f) 2657 { 2658 uint type; 2659 2660 trace_xfs_log_recover_buf_dquot_buf(log, buf_f); 2661 2662 /* 2663 * Filesystems are required to send in quota flags at mount time. 2664 */ 2665 if (!mp->m_qflags) 2666 return false; 2667 2668 type = 0; 2669 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) 2670 type |= XFS_DQ_USER; 2671 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) 2672 type |= XFS_DQ_PROJ; 2673 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) 2674 type |= XFS_DQ_GROUP; 2675 /* 2676 * This type of quotas was turned off, so ignore this buffer 2677 */ 2678 if (log->l_quotaoffs_flag & type) 2679 return false; 2680 2681 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2682 return true; 2683 } 2684 2685 /* 2686 * This routine replays a modification made to a buffer at runtime. 2687 * There are actually two types of buffer, regular and inode, which 2688 * are handled differently. Inode buffers are handled differently 2689 * in that we only recover a specific set of data from them, namely 2690 * the inode di_next_unlinked fields. This is because all other inode 2691 * data is actually logged via inode records and any data we replay 2692 * here which overlaps that may be stale. 2693 * 2694 * When meta-data buffers are freed at run time we log a buffer item 2695 * with the XFS_BLF_CANCEL bit set to indicate that previous copies 2696 * of the buffer in the log should not be replayed at recovery time. 2697 * This is so that if the blocks covered by the buffer are reused for 2698 * file data before we crash we don't end up replaying old, freed 2699 * meta-data into a user's file. 2700 * 2701 * To handle the cancellation of buffer log items, we make two passes 2702 * over the log during recovery. During the first we build a table of 2703 * those buffers which have been cancelled, and during the second we 2704 * only replay those buffers which do not have corresponding cancel 2705 * records in the table. 
See xlog_recover_buffer_pass[1,2] above 2706 * for more details on the implementation of the table of cancel records. 2707 */ 2708 STATIC int 2709 xlog_recover_buffer_pass2( 2710 struct xlog *log, 2711 struct list_head *buffer_list, 2712 struct xlog_recover_item *item, 2713 xfs_lsn_t current_lsn) 2714 { 2715 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2716 xfs_mount_t *mp = log->l_mp; 2717 xfs_buf_t *bp; 2718 int error; 2719 uint buf_flags; 2720 xfs_lsn_t lsn; 2721 2722 /* 2723 * In this pass we only want to recover all the buffers which have 2724 * not been cancelled and are not cancellation buffers themselves. 2725 */ 2726 if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, 2727 buf_f->blf_len, buf_f->blf_flags)) { 2728 trace_xfs_log_recover_buf_cancel(log, buf_f); 2729 return 0; 2730 } 2731 2732 trace_xfs_log_recover_buf_recover(log, buf_f); 2733 2734 buf_flags = 0; 2735 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 2736 buf_flags |= XBF_UNMAPPED; 2737 2738 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2739 buf_flags, NULL); 2740 if (!bp) 2741 return -ENOMEM; 2742 error = bp->b_error; 2743 if (error) { 2744 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); 2745 goto out_release; 2746 } 2747 2748 /* 2749 * Recover the buffer only if we get an LSN from it and it's less than 2750 * the lsn of the transaction we are replaying. 2751 * 2752 * Note that we have to be extremely careful of readahead here. 2753 * Readahead does not attach verifiers to the buffers, so if we don't 2754 * actually do any replay after readahead because the LSN we found 2755 * in the buffer is more recent than the current transaction, then we 2756 * need to attach the verifier directly. Failing to do so means that 2757 * future recovery actions (e.g. EFI and unlinked list recovery) can 2758 * operate on the buffers without the verifier attached. This 2759 * can lead to blocks on disk having the correct content but a stale 2760 * CRC. 2761 * 2762 * It is safe to assume these clean buffers are currently up to date. 2763 * If the buffer is dirtied by a later transaction being replayed, then 2764 * the verifier will be reset to match whatever recovery turns that 2765 * buffer into. 2766 */ 2767 lsn = xlog_recover_get_buf_lsn(mp, bp); 2768 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 2769 xlog_recover_validate_buf_type(mp, bp, buf_f); 2770 goto out_release; 2771 } 2772 2773 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 2774 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2775 if (error) 2776 goto out_release; 2777 } else if (buf_f->blf_flags & 2778 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2779 bool dirty; 2780 2781 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2782 if (!dirty) 2783 goto out_release; 2784 } else { 2785 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2786 } 2787 2788 /* 2789 * Perform delayed write on the buffer. Asynchronous writes will be 2790 * slower when taking into account all the buffers to be flushed. 2791 * 2792 * Also make sure that only inode buffers with good sizes stay in 2793 * the buffer cache. The kernel moves inodes in buffers of 1 block 2794 * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode 2795 * buffers in the log can be a different size if the log was generated 2796 * by an older kernel using unclustered inode buffers or a newer kernel 2797 * running with a different inode cluster size.
Regardless, if the 2798 * inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) 2799 * for *our* value of mp->m_inode_cluster_size, then we need to keep 2800 * the buffer out of the buffer cache so that the buffer won't 2801 * overlap with future reads of those inodes. 2802 */ 2803 if (XFS_DINODE_MAGIC == 2804 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 2805 (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, 2806 (__uint32_t)log->l_mp->m_inode_cluster_size))) { 2807 xfs_buf_stale(bp); 2808 error = xfs_bwrite(bp); 2809 } else { 2810 ASSERT(bp->b_target->bt_mount == mp); 2811 bp->b_iodone = xlog_recover_iodone; 2812 xfs_buf_delwri_queue(bp, buffer_list); 2813 } 2814 2815 out_release: 2816 xfs_buf_relse(bp); 2817 return error; 2818 } 2819 2820 /* 2821 * Inode fork owner changes 2822 * 2823 * If we have been told that we have to reparent the inode fork, it's because an 2824 * extent swap operation on a CRC enabled filesystem has been done and we are 2825 * replaying it. We need to walk the BMBT of the appropriate fork and change the 2826 * owners of it. 2827 * 2828 * The complexity here is that we don't have an inode context to work with, so 2829 * after we've replayed the inode we need to instantiate one. This is where the 2830 * fun begins. 2831 * 2832 * We are in the middle of log recovery, so we can't run transactions. That 2833 * means we cannot use cache coherent inode instantiation via xfs_iget(), as 2834 * that will result in the corresponding iput() running the inode through 2835 * xfs_inactive(). If we've just replayed an inode core that changes the link 2836 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run 2837 * transactions (bad!). 2838 * 2839 * So, to avoid this, we instantiate an inode directly from the inode core we've 2840 * just recovered. We have the buffer still locked, and all we really need to 2841 * instantiate is the inode core and the forks being modified. We can do this 2842 * manually, then run the inode btree owner change, and then tear down the 2843 * xfs_inode without having to run any transactions at all. 2844 * 2845 * Also, because we don't have a transaction context available here but still need to 2846 * gather all the buffers we modify for writeback, we pass the buffer_list 2847 * in for the operation to use.
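 * The inode built below via xfs_inode_alloc()/xfs_inode_from_disk() is never inserted into the inode cache; it lives only long enough for xfs_bmbt_change_owner() to walk the fork and is then torn down with xfs_inode_free().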
2848 */ 2849 2850 STATIC int 2851 xfs_recover_inode_owner_change( 2852 struct xfs_mount *mp, 2853 struct xfs_dinode *dip, 2854 struct xfs_inode_log_format *in_f, 2855 struct list_head *buffer_list) 2856 { 2857 struct xfs_inode *ip; 2858 int error; 2859 2860 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); 2861 2862 ip = xfs_inode_alloc(mp, in_f->ilf_ino); 2863 if (!ip) 2864 return -ENOMEM; 2865 2866 /* instantiate the inode */ 2867 xfs_inode_from_disk(ip, dip); 2868 ASSERT(ip->i_d.di_version >= 3); 2869 2870 error = xfs_iformat_fork(ip, dip); 2871 if (error) 2872 goto out_free_ip; 2873 2874 2875 if (in_f->ilf_fields & XFS_ILOG_DOWNER) { 2876 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); 2877 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, 2878 ip->i_ino, buffer_list); 2879 if (error) 2880 goto out_free_ip; 2881 } 2882 2883 if (in_f->ilf_fields & XFS_ILOG_AOWNER) { 2884 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); 2885 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, 2886 ip->i_ino, buffer_list); 2887 if (error) 2888 goto out_free_ip; 2889 } 2890 2891 out_free_ip: 2892 xfs_inode_free(ip); 2893 return error; 2894 } 2895 2896 STATIC int 2897 xlog_recover_inode_pass2( 2898 struct xlog *log, 2899 struct list_head *buffer_list, 2900 struct xlog_recover_item *item, 2901 xfs_lsn_t current_lsn) 2902 { 2903 xfs_inode_log_format_t *in_f; 2904 xfs_mount_t *mp = log->l_mp; 2905 xfs_buf_t *bp; 2906 xfs_dinode_t *dip; 2907 int len; 2908 char *src; 2909 char *dest; 2910 int error; 2911 int attr_index; 2912 uint fields; 2913 struct xfs_log_dinode *ldip; 2914 uint isize; 2915 int need_free = 0; 2916 2917 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2918 in_f = item->ri_buf[0].i_addr; 2919 } else { 2920 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); 2921 need_free = 1; 2922 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2923 if (error) 2924 goto error; 2925 } 2926 2927 /* 2928 * Inode buffers can be freed, look out for it, 2929 * and do not replay the inode. 2930 */ 2931 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2932 in_f->ilf_len, 0)) { 2933 error = 0; 2934 trace_xfs_log_recover_inode_cancel(log, in_f); 2935 goto error; 2936 } 2937 trace_xfs_log_recover_inode_recover(log, in_f); 2938 2939 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, 2940 &xfs_inode_buf_ops); 2941 if (!bp) { 2942 error = -ENOMEM; 2943 goto error; 2944 } 2945 error = bp->b_error; 2946 if (error) { 2947 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); 2948 goto out_release; 2949 } 2950 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2951 dip = xfs_buf_offset(bp, in_f->ilf_boffset); 2952 2953 /* 2954 * Make sure the place we're flushing out to really looks 2955 * like an inode! 
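 * Both the on-disk inode (dip->di_magic) and the inode core logged in the transaction (ldip->di_magic) are checked for XFS_DINODE_MAGIC below.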
2956 */ 2957 if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { 2958 xfs_alert(mp, 2959 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", 2960 __func__, dip, bp, in_f->ilf_ino); 2961 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2962 XFS_ERRLEVEL_LOW, mp); 2963 error = -EFSCORRUPTED; 2964 goto out_release; 2965 } 2966 ldip = item->ri_buf[1].i_addr; 2967 if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) { 2968 xfs_alert(mp, 2969 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", 2970 __func__, item, in_f->ilf_ino); 2971 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2972 XFS_ERRLEVEL_LOW, mp); 2973 error = -EFSCORRUPTED; 2974 goto out_release; 2975 } 2976 2977 /* 2978 * If the inode has an LSN in it, recover the inode only if it's less 2979 * than the lsn of the transaction we are replaying. Note: we still 2980 * need to replay an owner change even though the inode is more recent 2981 * than the transaction as there is no guarantee that all the btree 2982 * blocks are more recent than this transaction, too. 2983 */ 2984 if (dip->di_version >= 3) { 2985 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); 2986 2987 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 2988 trace_xfs_log_recover_inode_skip(log, in_f); 2989 error = 0; 2990 goto out_owner_change; 2991 } 2992 } 2993 2994 /* 2995 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes 2996 * are transactional and if ordering is necessary we can determine that 2997 * more accurately by the LSN field in the V3 inode core. Don't trust 2998 * the inode versions we might be changing them here - use the 2999 * superblock flag to determine whether we need to look at di_flushiter 3000 * to skip replay when the on disk inode is newer than the log one 3001 */ 3002 if (!xfs_sb_version_hascrc(&mp->m_sb) && 3003 ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 3004 /* 3005 * Deal with the wrap case, DI_MAX_FLUSH is less 3006 * than smaller numbers 3007 */ 3008 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && 3009 ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { 3010 /* do nothing */ 3011 } else { 3012 trace_xfs_log_recover_inode_skip(log, in_f); 3013 error = 0; 3014 goto out_release; 3015 } 3016 } 3017 3018 /* Take the opportunity to reset the flush iteration count */ 3019 ldip->di_flushiter = 0; 3020 3021 if (unlikely(S_ISREG(ldip->di_mode))) { 3022 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && 3023 (ldip->di_format != XFS_DINODE_FMT_BTREE)) { 3024 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 3025 XFS_ERRLEVEL_LOW, mp, ldip); 3026 xfs_alert(mp, 3027 "%s: Bad regular inode log record, rec ptr 0x%p, " 3028 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 3029 __func__, item, dip, bp, in_f->ilf_ino); 3030 error = -EFSCORRUPTED; 3031 goto out_release; 3032 } 3033 } else if (unlikely(S_ISDIR(ldip->di_mode))) { 3034 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && 3035 (ldip->di_format != XFS_DINODE_FMT_BTREE) && 3036 (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { 3037 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 3038 XFS_ERRLEVEL_LOW, mp, ldip); 3039 xfs_alert(mp, 3040 "%s: Bad dir inode log record, rec ptr 0x%p, " 3041 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 3042 __func__, item, dip, bp, in_f->ilf_ino); 3043 error = -EFSCORRUPTED; 3044 goto out_release; 3045 } 3046 } 3047 if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ 3048 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 3049 XFS_ERRLEVEL_LOW, mp, ldip); 3050 xfs_alert(mp, 3051 "%s: Bad inode log 
record, rec ptr 0x%p, dino ptr 0x%p, " 3052 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 3053 __func__, item, dip, bp, in_f->ilf_ino, 3054 ldip->di_nextents + ldip->di_anextents, 3055 ldip->di_nblocks); 3056 error = -EFSCORRUPTED; 3057 goto out_release; 3058 } 3059 if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { 3060 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 3061 XFS_ERRLEVEL_LOW, mp, ldip); 3062 xfs_alert(mp, 3063 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 3064 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, 3065 item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); 3066 error = -EFSCORRUPTED; 3067 goto out_release; 3068 } 3069 isize = xfs_log_dinode_size(ldip->di_version); 3070 if (unlikely(item->ri_buf[1].i_len > isize)) { 3071 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 3072 XFS_ERRLEVEL_LOW, mp, ldip); 3073 xfs_alert(mp, 3074 "%s: Bad inode log record length %d, rec ptr 0x%p", 3075 __func__, item->ri_buf[1].i_len, item); 3076 error = -EFSCORRUPTED; 3077 goto out_release; 3078 } 3079 3080 /* recover the log dinode inode into the on disk inode */ 3081 xfs_log_dinode_to_disk(ldip, dip); 3082 3083 /* the rest is in on-disk format */ 3084 if (item->ri_buf[1].i_len > isize) { 3085 memcpy((char *)dip + isize, 3086 item->ri_buf[1].i_addr + isize, 3087 item->ri_buf[1].i_len - isize); 3088 } 3089 3090 fields = in_f->ilf_fields; 3091 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { 3092 case XFS_ILOG_DEV: 3093 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); 3094 break; 3095 case XFS_ILOG_UUID: 3096 memcpy(XFS_DFORK_DPTR(dip), 3097 &in_f->ilf_u.ilfu_uuid, 3098 sizeof(uuid_t)); 3099 break; 3100 } 3101 3102 if (in_f->ilf_size == 2) 3103 goto out_owner_change; 3104 len = item->ri_buf[2].i_len; 3105 src = item->ri_buf[2].i_addr; 3106 ASSERT(in_f->ilf_size <= 4); 3107 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); 3108 ASSERT(!(fields & XFS_ILOG_DFORK) || 3109 (len == in_f->ilf_dsize)); 3110 3111 switch (fields & XFS_ILOG_DFORK) { 3112 case XFS_ILOG_DDATA: 3113 case XFS_ILOG_DEXT: 3114 memcpy(XFS_DFORK_DPTR(dip), src, len); 3115 break; 3116 3117 case XFS_ILOG_DBROOT: 3118 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, 3119 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), 3120 XFS_DFORK_DSIZE(dip, mp)); 3121 break; 3122 3123 default: 3124 /* 3125 * There are no data fork flags set. 3126 */ 3127 ASSERT((fields & XFS_ILOG_DFORK) == 0); 3128 break; 3129 } 3130 3131 /* 3132 * If we logged any attribute data, recover it. There may or 3133 * may not have been any other non-core data logged in this 3134 * transaction. 
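 * The attribute fork region lives in ri_buf[3] when a data fork region was also logged in this transaction, otherwise in ri_buf[2]; that is what the attr_index selection below implements.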
3135 */ 3136 if (in_f->ilf_fields & XFS_ILOG_AFORK) { 3137 if (in_f->ilf_fields & XFS_ILOG_DFORK) { 3138 attr_index = 3; 3139 } else { 3140 attr_index = 2; 3141 } 3142 len = item->ri_buf[attr_index].i_len; 3143 src = item->ri_buf[attr_index].i_addr; 3144 ASSERT(len == in_f->ilf_asize); 3145 3146 switch (in_f->ilf_fields & XFS_ILOG_AFORK) { 3147 case XFS_ILOG_ADATA: 3148 case XFS_ILOG_AEXT: 3149 dest = XFS_DFORK_APTR(dip); 3150 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); 3151 memcpy(dest, src, len); 3152 break; 3153 3154 case XFS_ILOG_ABROOT: 3155 dest = XFS_DFORK_APTR(dip); 3156 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, 3157 len, (xfs_bmdr_block_t*)dest, 3158 XFS_DFORK_ASIZE(dip, mp)); 3159 break; 3160 3161 default: 3162 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 3163 ASSERT(0); 3164 error = -EIO; 3165 goto out_release; 3166 } 3167 } 3168 3169 out_owner_change: 3170 if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) 3171 error = xfs_recover_inode_owner_change(mp, dip, in_f, 3172 buffer_list); 3173 /* re-generate the checksum. */ 3174 xfs_dinode_calc_crc(log->l_mp, dip); 3175 3176 ASSERT(bp->b_target->bt_mount == mp); 3177 bp->b_iodone = xlog_recover_iodone; 3178 xfs_buf_delwri_queue(bp, buffer_list); 3179 3180 out_release: 3181 xfs_buf_relse(bp); 3182 error: 3183 if (need_free) 3184 kmem_free(in_f); 3185 return error; 3186 } 3187 3188 /* 3189 * Recover QUOTAOFF records. We simply make a note of it in the xlog 3190 * structure, so that we know not to do any dquot item or dquot buffer recovery, 3191 * of that type. 3192 */ 3193 STATIC int 3194 xlog_recover_quotaoff_pass1( 3195 struct xlog *log, 3196 struct xlog_recover_item *item) 3197 { 3198 xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; 3199 ASSERT(qoff_f); 3200 3201 /* 3202 * The logitem format's flag tells us if this was user quotaoff, 3203 * group/project quotaoff or both. 3204 */ 3205 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) 3206 log->l_quotaoffs_flag |= XFS_DQ_USER; 3207 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) 3208 log->l_quotaoffs_flag |= XFS_DQ_PROJ; 3209 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 3210 log->l_quotaoffs_flag |= XFS_DQ_GROUP; 3211 3212 return 0; 3213 } 3214 3215 /* 3216 * Recover a dquot record 3217 */ 3218 STATIC int 3219 xlog_recover_dquot_pass2( 3220 struct xlog *log, 3221 struct list_head *buffer_list, 3222 struct xlog_recover_item *item, 3223 xfs_lsn_t current_lsn) 3224 { 3225 xfs_mount_t *mp = log->l_mp; 3226 xfs_buf_t *bp; 3227 struct xfs_disk_dquot *ddq, *recddq; 3228 int error; 3229 xfs_dq_logformat_t *dq_f; 3230 uint type; 3231 3232 3233 /* 3234 * Filesystems are required to send in quota flags at mount time. 3235 */ 3236 if (mp->m_qflags == 0) 3237 return 0; 3238 3239 recddq = item->ri_buf[1].i_addr; 3240 if (recddq == NULL) { 3241 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); 3242 return -EIO; 3243 } 3244 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 3245 xfs_alert(log->l_mp, "dquot too small (%d) in %s.", 3246 item->ri_buf[1].i_len, __func__); 3247 return -EIO; 3248 } 3249 3250 /* 3251 * This type of quotas was turned off, so ignore this record. 3252 */ 3253 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 3254 ASSERT(type); 3255 if (log->l_quotaoffs_flag & type) 3256 return 0; 3257 3258 /* 3259 * At this point we know that quota was _not_ turned off. 3260 * Since the mount flags are not indicating to us otherwise, this 3261 * must mean that quota is on, and the dquot needs to be replayed. 
3262 * Remember that we may not have fully recovered the superblock yet, 3263 * so we can't do the usual trick of looking at the SB quota bits. 3264 * 3265 * The other possibility, of course, is that the quota subsystem was 3266 * removed since the last mount - ENOSYS. 3267 */ 3268 dq_f = item->ri_buf[0].i_addr; 3269 ASSERT(dq_f); 3270 error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 3271 "xlog_recover_dquot_pass2 (log copy)"); 3272 if (error) 3273 return -EIO; 3274 ASSERT(dq_f->qlf_len == 1); 3275 3276 /* 3277 * At this point we are assuming that the dquots have been allocated 3278 * and hence the buffer has valid dquots stamped in it. It should, 3279 * therefore, pass verifier validation. If the dquot is bad, then the 3280 * we'll return an error here, so we don't need to specifically check 3281 * the dquot in the buffer after the verifier has run. 3282 */ 3283 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 3284 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, 3285 &xfs_dquot_buf_ops); 3286 if (error) 3287 return error; 3288 3289 ASSERT(bp); 3290 ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); 3291 3292 /* 3293 * If the dquot has an LSN in it, recover the dquot only if it's less 3294 * than the lsn of the transaction we are replaying. 3295 */ 3296 if (xfs_sb_version_hascrc(&mp->m_sb)) { 3297 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; 3298 xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); 3299 3300 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 3301 goto out_release; 3302 } 3303 } 3304 3305 memcpy(ddq, recddq, item->ri_buf[1].i_len); 3306 if (xfs_sb_version_hascrc(&mp->m_sb)) { 3307 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), 3308 XFS_DQUOT_CRC_OFF); 3309 } 3310 3311 ASSERT(dq_f->qlf_size == 2); 3312 ASSERT(bp->b_target->bt_mount == mp); 3313 bp->b_iodone = xlog_recover_iodone; 3314 xfs_buf_delwri_queue(bp, buffer_list); 3315 3316 out_release: 3317 xfs_buf_relse(bp); 3318 return 0; 3319 } 3320 3321 /* 3322 * This routine is called to create an in-core extent free intent 3323 * item from the efi format structure which was logged on disk. 3324 * It allocates an in-core efi, copies the extents from the format 3325 * structure into it, and adds the efi to the AIL with the given 3326 * LSN. 3327 */ 3328 STATIC int 3329 xlog_recover_efi_pass2( 3330 struct xlog *log, 3331 struct xlog_recover_item *item, 3332 xfs_lsn_t lsn) 3333 { 3334 int error; 3335 struct xfs_mount *mp = log->l_mp; 3336 struct xfs_efi_log_item *efip; 3337 struct xfs_efi_log_format *efi_formatp; 3338 3339 efi_formatp = item->ri_buf[0].i_addr; 3340 3341 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 3342 error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); 3343 if (error) { 3344 xfs_efi_item_free(efip); 3345 return error; 3346 } 3347 atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); 3348 3349 spin_lock(&log->l_ailp->xa_lock); 3350 /* 3351 * The EFI has two references. One for the EFD and one for EFI to ensure 3352 * it makes it into the AIL. Insert the EFI into the AIL directly and 3353 * drop the EFI reference. Note that xfs_trans_ail_update() drops the 3354 * AIL lock. 3355 */ 3356 xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); 3357 xfs_efi_release(efip); 3358 return 0; 3359 } 3360 3361 3362 /* 3363 * This routine is called when an EFD format structure is found in a committed 3364 * transaction in the log. Its purpose is to cancel the corresponding EFI if it 3365 * was still in the log. 
To do this it searches the AIL for the EFI with an id 3366 * equal to that in the EFD format structure. If we find it we drop the EFD 3367 * reference, which removes the EFI from the AIL and frees it. 3368 */ 3369 STATIC int 3370 xlog_recover_efd_pass2( 3371 struct xlog *log, 3372 struct xlog_recover_item *item) 3373 { 3374 xfs_efd_log_format_t *efd_formatp; 3375 xfs_efi_log_item_t *efip = NULL; 3376 xfs_log_item_t *lip; 3377 __uint64_t efi_id; 3378 struct xfs_ail_cursor cur; 3379 struct xfs_ail *ailp = log->l_ailp; 3380 3381 efd_formatp = item->ri_buf[0].i_addr; 3382 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 3383 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 3384 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + 3385 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); 3386 efi_id = efd_formatp->efd_efi_id; 3387 3388 /* 3389 * Search for the EFI with the id in the EFD format structure in the 3390 * AIL. 3391 */ 3392 spin_lock(&ailp->xa_lock); 3393 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 3394 while (lip != NULL) { 3395 if (lip->li_type == XFS_LI_EFI) { 3396 efip = (xfs_efi_log_item_t *)lip; 3397 if (efip->efi_format.efi_id == efi_id) { 3398 /* 3399 * Drop the EFD reference to the EFI. This 3400 * removes the EFI from the AIL and frees it. 3401 */ 3402 spin_unlock(&ailp->xa_lock); 3403 xfs_efi_release(efip); 3404 spin_lock(&ailp->xa_lock); 3405 break; 3406 } 3407 } 3408 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3409 } 3410 3411 xfs_trans_ail_cursor_done(&cur); 3412 spin_unlock(&ailp->xa_lock); 3413 3414 return 0; 3415 } 3416 3417 /* 3418 * This routine is called when an inode create format structure is found in a 3419 * committed transaction in the log. Its purpose is to initialise the inodes 3420 * being allocated on disk. This requires us to get inode cluster buffers that 3421 * match the range to be initialised, stamped with inode templates and written 3422 * by delayed write so that subsequent modifications will hit the cached buffer 3423 * and only need writing out at the end of recovery.
3424 */ 3425 STATIC int 3426 xlog_recover_do_icreate_pass2( 3427 struct xlog *log, 3428 struct list_head *buffer_list, 3429 xlog_recover_item_t *item) 3430 { 3431 struct xfs_mount *mp = log->l_mp; 3432 struct xfs_icreate_log *icl; 3433 xfs_agnumber_t agno; 3434 xfs_agblock_t agbno; 3435 unsigned int count; 3436 unsigned int isize; 3437 xfs_agblock_t length; 3438 int blks_per_cluster; 3439 int bb_per_cluster; 3440 int cancel_count; 3441 int nbufs; 3442 int i; 3443 3444 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; 3445 if (icl->icl_type != XFS_LI_ICREATE) { 3446 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); 3447 return -EINVAL; 3448 } 3449 3450 if (icl->icl_size != 1) { 3451 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); 3452 return -EINVAL; 3453 } 3454 3455 agno = be32_to_cpu(icl->icl_ag); 3456 if (agno >= mp->m_sb.sb_agcount) { 3457 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); 3458 return -EINVAL; 3459 } 3460 agbno = be32_to_cpu(icl->icl_agbno); 3461 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { 3462 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); 3463 return -EINVAL; 3464 } 3465 isize = be32_to_cpu(icl->icl_isize); 3466 if (isize != mp->m_sb.sb_inodesize) { 3467 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); 3468 return -EINVAL; 3469 } 3470 count = be32_to_cpu(icl->icl_count); 3471 if (!count) { 3472 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); 3473 return -EINVAL; 3474 } 3475 length = be32_to_cpu(icl->icl_length); 3476 if (!length || length >= mp->m_sb.sb_agblocks) { 3477 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); 3478 return -EINVAL; 3479 } 3480 3481 /* 3482 * The inode chunk is either full or sparse and we only support 3483 * m_ialloc_min_blks sized sparse allocations at this time. 3484 */ 3485 if (length != mp->m_ialloc_blks && 3486 length != mp->m_ialloc_min_blks) { 3487 xfs_warn(log->l_mp, 3488 "%s: unsupported chunk length", __FUNCTION__); 3489 return -EINVAL; 3490 } 3491 3492 /* verify inode count is consistent with extent length */ 3493 if ((count >> mp->m_sb.sb_inopblog) != length) { 3494 xfs_warn(log->l_mp, 3495 "%s: inconsistent inode count and chunk length", 3496 __FUNCTION__); 3497 return -EINVAL; 3498 } 3499 3500 /* 3501 * The icreate transaction can cover multiple cluster buffers and these 3502 * buffers could have been freed and reused. Check the individual 3503 * buffers for cancellation so we don't overwrite anything written after 3504 * a cancellation. 3505 */ 3506 blks_per_cluster = xfs_icluster_size_fsb(mp); 3507 bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster); 3508 nbufs = length / blks_per_cluster; 3509 for (i = 0, cancel_count = 0; i < nbufs; i++) { 3510 xfs_daddr_t daddr; 3511 3512 daddr = XFS_AGB_TO_DADDR(mp, agno, 3513 agbno + i * blks_per_cluster); 3514 if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) 3515 cancel_count++; 3516 } 3517 3518 /* 3519 * We currently only use icreate for a single allocation at a time. This 3520 * means we should expect either all or none of the buffers to be 3521 * cancelled. Be conservative and skip replay if at least one buffer is 3522 * cancelled, but warn the user that something is awry if the buffers 3523 * are not consistent. 3524 * 3525 * XXX: This must be refined to only skip cancelled clusters once we use 3526 * icreate for multiple chunk allocations. 
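 * For example (illustrative geometry only): a 64 inode chunk on a filesystem with 32 inodes per cluster buffer gives nbufs == 2, and both buffers must agree on whether they were cancelled.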
3527 */ 3528 ASSERT(!cancel_count || cancel_count == nbufs); 3529 if (cancel_count) { 3530 if (cancel_count != nbufs) 3531 xfs_warn(mp, 3532 "WARNING: partial inode chunk cancellation, skipped icreate."); 3533 trace_xfs_log_recover_icreate_cancel(log, icl); 3534 return 0; 3535 } 3536 3537 trace_xfs_log_recover_icreate_recover(log, icl); 3538 return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, 3539 length, be32_to_cpu(icl->icl_gen)); 3540 } 3541 3542 STATIC void 3543 xlog_recover_buffer_ra_pass2( 3544 struct xlog *log, 3545 struct xlog_recover_item *item) 3546 { 3547 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 3548 struct xfs_mount *mp = log->l_mp; 3549 3550 if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, 3551 buf_f->blf_len, buf_f->blf_flags)) { 3552 return; 3553 } 3554 3555 xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, 3556 buf_f->blf_len, NULL); 3557 } 3558 3559 STATIC void 3560 xlog_recover_inode_ra_pass2( 3561 struct xlog *log, 3562 struct xlog_recover_item *item) 3563 { 3564 struct xfs_inode_log_format ilf_buf; 3565 struct xfs_inode_log_format *ilfp; 3566 struct xfs_mount *mp = log->l_mp; 3567 int error; 3568 3569 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 3570 ilfp = item->ri_buf[0].i_addr; 3571 } else { 3572 ilfp = &ilf_buf; 3573 memset(ilfp, 0, sizeof(*ilfp)); 3574 error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); 3575 if (error) 3576 return; 3577 } 3578 3579 if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) 3580 return; 3581 3582 xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, 3583 ilfp->ilf_len, &xfs_inode_buf_ra_ops); 3584 } 3585 3586 STATIC void 3587 xlog_recover_dquot_ra_pass2( 3588 struct xlog *log, 3589 struct xlog_recover_item *item) 3590 { 3591 struct xfs_mount *mp = log->l_mp; 3592 struct xfs_disk_dquot *recddq; 3593 struct xfs_dq_logformat *dq_f; 3594 uint type; 3595 int len; 3596 3597 3598 if (mp->m_qflags == 0) 3599 return; 3600 3601 recddq = item->ri_buf[1].i_addr; 3602 if (recddq == NULL) 3603 return; 3604 if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) 3605 return; 3606 3607 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 3608 ASSERT(type); 3609 if (log->l_quotaoffs_flag & type) 3610 return; 3611 3612 dq_f = item->ri_buf[0].i_addr; 3613 ASSERT(dq_f); 3614 ASSERT(dq_f->qlf_len == 1); 3615 3616 len = XFS_FSB_TO_BB(mp, dq_f->qlf_len); 3617 if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) 3618 return; 3619 3620 xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, 3621 &xfs_dquot_buf_ra_ops); 3622 } 3623 3624 STATIC void 3625 xlog_recover_ra_pass2( 3626 struct xlog *log, 3627 struct xlog_recover_item *item) 3628 { 3629 switch (ITEM_TYPE(item)) { 3630 case XFS_LI_BUF: 3631 xlog_recover_buffer_ra_pass2(log, item); 3632 break; 3633 case XFS_LI_INODE: 3634 xlog_recover_inode_ra_pass2(log, item); 3635 break; 3636 case XFS_LI_DQUOT: 3637 xlog_recover_dquot_ra_pass2(log, item); 3638 break; 3639 case XFS_LI_EFI: 3640 case XFS_LI_EFD: 3641 case XFS_LI_QUOTAOFF: 3642 default: 3643 break; 3644 } 3645 } 3646 3647 STATIC int 3648 xlog_recover_commit_pass1( 3649 struct xlog *log, 3650 struct xlog_recover *trans, 3651 struct xlog_recover_item *item) 3652 { 3653 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); 3654 3655 switch (ITEM_TYPE(item)) { 3656 case XFS_LI_BUF: 3657 return xlog_recover_buffer_pass1(log, item); 3658 case XFS_LI_QUOTAOFF: 3659 return xlog_recover_quotaoff_pass1(log, item); 3660 
case XFS_LI_INODE: 3661 case XFS_LI_EFI: 3662 case XFS_LI_EFD: 3663 case XFS_LI_DQUOT: 3664 case XFS_LI_ICREATE: 3665 /* nothing to do in pass 1 */ 3666 return 0; 3667 default: 3668 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 3669 __func__, ITEM_TYPE(item)); 3670 ASSERT(0); 3671 return -EIO; 3672 } 3673 } 3674 3675 STATIC int 3676 xlog_recover_commit_pass2( 3677 struct xlog *log, 3678 struct xlog_recover *trans, 3679 struct list_head *buffer_list, 3680 struct xlog_recover_item *item) 3681 { 3682 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); 3683 3684 switch (ITEM_TYPE(item)) { 3685 case XFS_LI_BUF: 3686 return xlog_recover_buffer_pass2(log, buffer_list, item, 3687 trans->r_lsn); 3688 case XFS_LI_INODE: 3689 return xlog_recover_inode_pass2(log, buffer_list, item, 3690 trans->r_lsn); 3691 case XFS_LI_EFI: 3692 return xlog_recover_efi_pass2(log, item, trans->r_lsn); 3693 case XFS_LI_EFD: 3694 return xlog_recover_efd_pass2(log, item); 3695 case XFS_LI_DQUOT: 3696 return xlog_recover_dquot_pass2(log, buffer_list, item, 3697 trans->r_lsn); 3698 case XFS_LI_ICREATE: 3699 return xlog_recover_do_icreate_pass2(log, buffer_list, item); 3700 case XFS_LI_QUOTAOFF: 3701 /* nothing to do in pass2 */ 3702 return 0; 3703 default: 3704 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 3705 __func__, ITEM_TYPE(item)); 3706 ASSERT(0); 3707 return -EIO; 3708 } 3709 } 3710 3711 STATIC int 3712 xlog_recover_items_pass2( 3713 struct xlog *log, 3714 struct xlog_recover *trans, 3715 struct list_head *buffer_list, 3716 struct list_head *item_list) 3717 { 3718 struct xlog_recover_item *item; 3719 int error = 0; 3720 3721 list_for_each_entry(item, item_list, ri_list) { 3722 error = xlog_recover_commit_pass2(log, trans, 3723 buffer_list, item); 3724 if (error) 3725 return error; 3726 } 3727 3728 return error; 3729 } 3730 3731 /* 3732 * Perform the transaction. 3733 * 3734 * If the transaction modifies a buffer or inode, do it now. Otherwise, 3735 * EFIs and EFDs get queued up by adding entries into the AIL for them. 
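 * In pass 2 the items are first queued for readahead and then committed in batches of XLOG_RECOVER_COMMIT_QUEUE_MAX, so that the metadata buffer I/O can be issued with a single delwri list submission at the end of the transaction.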
3736 */ 3737 STATIC int 3738 xlog_recover_commit_trans( 3739 struct xlog *log, 3740 struct xlog_recover *trans, 3741 int pass) 3742 { 3743 int error = 0; 3744 int error2; 3745 int items_queued = 0; 3746 struct xlog_recover_item *item; 3747 struct xlog_recover_item *next; 3748 LIST_HEAD (buffer_list); 3749 LIST_HEAD (ra_list); 3750 LIST_HEAD (done_list); 3751 3752 #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 3753 3754 hlist_del(&trans->r_list); 3755 3756 error = xlog_recover_reorder_trans(log, trans, pass); 3757 if (error) 3758 return error; 3759 3760 list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { 3761 switch (pass) { 3762 case XLOG_RECOVER_PASS1: 3763 error = xlog_recover_commit_pass1(log, trans, item); 3764 break; 3765 case XLOG_RECOVER_PASS2: 3766 xlog_recover_ra_pass2(log, item); 3767 list_move_tail(&item->ri_list, &ra_list); 3768 items_queued++; 3769 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { 3770 error = xlog_recover_items_pass2(log, trans, 3771 &buffer_list, &ra_list); 3772 list_splice_tail_init(&ra_list, &done_list); 3773 items_queued = 0; 3774 } 3775 3776 break; 3777 default: 3778 ASSERT(0); 3779 } 3780 3781 if (error) 3782 goto out; 3783 } 3784 3785 out: 3786 if (!list_empty(&ra_list)) { 3787 if (!error) 3788 error = xlog_recover_items_pass2(log, trans, 3789 &buffer_list, &ra_list); 3790 list_splice_tail_init(&ra_list, &done_list); 3791 } 3792 3793 if (!list_empty(&done_list)) 3794 list_splice_init(&done_list, &trans->r_itemq); 3795 3796 error2 = xfs_buf_delwri_submit(&buffer_list); 3797 return error ? error : error2; 3798 } 3799 3800 STATIC void 3801 xlog_recover_add_item( 3802 struct list_head *head) 3803 { 3804 xlog_recover_item_t *item; 3805 3806 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 3807 INIT_LIST_HEAD(&item->ri_list); 3808 list_add_tail(&item->ri_list, head); 3809 } 3810 3811 STATIC int 3812 xlog_recover_add_to_cont_trans( 3813 struct xlog *log, 3814 struct xlog_recover *trans, 3815 char *dp, 3816 int len) 3817 { 3818 xlog_recover_item_t *item; 3819 char *ptr, *old_ptr; 3820 int old_len; 3821 3822 /* 3823 * If the transaction is empty, the header was split across this and the 3824 * previous record. Copy the rest of the header. 3825 */ 3826 if (list_empty(&trans->r_itemq)) { 3827 ASSERT(len <= sizeof(struct xfs_trans_header)); 3828 if (len > sizeof(struct xfs_trans_header)) { 3829 xfs_warn(log->l_mp, "%s: bad header length", __func__); 3830 return -EIO; 3831 } 3832 3833 xlog_recover_add_item(&trans->r_itemq); 3834 ptr = (char *)&trans->r_theader + 3835 sizeof(struct xfs_trans_header) - len; 3836 memcpy(ptr, dp, len); 3837 return 0; 3838 } 3839 3840 /* take the tail entry */ 3841 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); 3842 3843 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 3844 old_len = item->ri_buf[item->ri_cnt-1].i_len; 3845 3846 ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); 3847 memcpy(&ptr[old_len], dp, len); 3848 item->ri_buf[item->ri_cnt-1].i_len += len; 3849 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 3850 trace_xfs_log_recover_item_add_cont(log, trans, item, 0); 3851 return 0; 3852 } 3853 3854 /* 3855 * The next region to add is the start of a new region. It could be 3856 * a whole region or it could be the first part of a new region. Because 3857 * of this, the assumption here is that the type and size fields of all 3858 * format structures fit into the first 32 bits of the structure. 3859 * 3860 * This works because all regions must be 32 bit aligned. 
Therefore, we 3861 * either have both fields or we have neither field. In the case we have 3862 * neither field, the data part of the region is zero length. We only have 3863 * a log_op_header and can throw away the header since a new one will appear 3864 * later. If we have at least 4 bytes, then we can determine how many regions 3865 * will appear in the current log item. 3866 */ 3867 STATIC int 3868 xlog_recover_add_to_trans( 3869 struct xlog *log, 3870 struct xlog_recover *trans, 3871 char *dp, 3872 int len) 3873 { 3874 xfs_inode_log_format_t *in_f; /* any will do */ 3875 xlog_recover_item_t *item; 3876 char *ptr; 3877 3878 if (!len) 3879 return 0; 3880 if (list_empty(&trans->r_itemq)) { 3881 /* we need to catch log corruptions here */ 3882 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 3883 xfs_warn(log->l_mp, "%s: bad header magic number", 3884 __func__); 3885 ASSERT(0); 3886 return -EIO; 3887 } 3888 3889 if (len > sizeof(struct xfs_trans_header)) { 3890 xfs_warn(log->l_mp, "%s: bad header length", __func__); 3891 ASSERT(0); 3892 return -EIO; 3893 } 3894 3895 /* 3896 * The transaction header can be arbitrarily split across op 3897 * records. If we don't have the whole thing here, copy what we 3898 * do have and handle the rest in the next record. 3899 */ 3900 if (len == sizeof(struct xfs_trans_header)) 3901 xlog_recover_add_item(&trans->r_itemq); 3902 memcpy(&trans->r_theader, dp, len); 3903 return 0; 3904 } 3905 3906 ptr = kmem_alloc(len, KM_SLEEP); 3907 memcpy(ptr, dp, len); 3908 in_f = (xfs_inode_log_format_t *)ptr; 3909 3910 /* take the tail entry */ 3911 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); 3912 if (item->ri_total != 0 && 3913 item->ri_total == item->ri_cnt) { 3914 /* tail item is in use, get a new one */ 3915 xlog_recover_add_item(&trans->r_itemq); 3916 item = list_entry(trans->r_itemq.prev, 3917 xlog_recover_item_t, ri_list); 3918 } 3919 3920 if (item->ri_total == 0) { /* first region to be added */ 3921 if (in_f->ilf_size == 0 || 3922 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 3923 xfs_warn(log->l_mp, 3924 "bad number of regions (%d) in inode log format", 3925 in_f->ilf_size); 3926 ASSERT(0); 3927 kmem_free(ptr); 3928 return -EIO; 3929 } 3930 3931 item->ri_total = in_f->ilf_size; 3932 item->ri_buf = 3933 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), 3934 KM_SLEEP); 3935 } 3936 ASSERT(item->ri_total > item->ri_cnt); 3937 /* Description region is ri_buf[0] */ 3938 item->ri_buf[item->ri_cnt].i_addr = ptr; 3939 item->ri_buf[item->ri_cnt].i_len = len; 3940 item->ri_cnt++; 3941 trace_xfs_log_recover_item_add(log, trans, item, 0); 3942 return 0; 3943 } 3944 3945 /* 3946 * Free up any resources allocated by the transaction 3947 * 3948 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 3949 */ 3950 STATIC void 3951 xlog_recover_free_trans( 3952 struct xlog_recover *trans) 3953 { 3954 xlog_recover_item_t *item, *n; 3955 int i; 3956 3957 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { 3958 /* Free the regions in the item. */ 3959 list_del(&item->ri_list); 3960 for (i = 0; i < item->ri_cnt; i++) 3961 kmem_free(item->ri_buf[i].i_addr); 3962 /* Free the item itself */ 3963 kmem_free(item->ri_buf); 3964 kmem_free(item); 3965 } 3966 /* Free the transaction recover structure */ 3967 kmem_free(trans); 3968 } 3969 3970 /* 3971 * On error or completion, trans is freed. 
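 * A commit frees it unconditionally (success or failure), as does any error returned by the add/continue paths; otherwise the structure stays live for the ophdrs that follow.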
3972 */ 3973 STATIC int 3974 xlog_recovery_process_trans( 3975 struct xlog *log, 3976 struct xlog_recover *trans, 3977 char *dp, 3978 unsigned int len, 3979 unsigned int flags, 3980 int pass) 3981 { 3982 int error = 0; 3983 bool freeit = false; 3984 3985 /* mask off ophdr transaction container flags */ 3986 flags &= ~XLOG_END_TRANS; 3987 if (flags & XLOG_WAS_CONT_TRANS) 3988 flags &= ~XLOG_CONTINUE_TRANS; 3989 3990 /* 3991 * Callees must not free the trans structure. We'll decide if we need to 3992 * free it or not based on the operation being done and its result. 3993 */ 3994 switch (flags) { 3995 /* expected flag values */ 3996 case 0: 3997 case XLOG_CONTINUE_TRANS: 3998 error = xlog_recover_add_to_trans(log, trans, dp, len); 3999 break; 4000 case XLOG_WAS_CONT_TRANS: 4001 error = xlog_recover_add_to_cont_trans(log, trans, dp, len); 4002 break; 4003 case XLOG_COMMIT_TRANS: 4004 error = xlog_recover_commit_trans(log, trans, pass); 4005 /* success or fail, we are now done with this transaction. */ 4006 freeit = true; 4007 break; 4008 4009 /* unexpected flag values */ 4010 case XLOG_UNMOUNT_TRANS: 4011 /* just skip trans */ 4012 xfs_warn(log->l_mp, "%s: Unmount LR", __func__); 4013 freeit = true; 4014 break; 4015 case XLOG_START_TRANS: 4016 default: 4017 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); 4018 ASSERT(0); 4019 error = -EIO; 4020 break; 4021 } 4022 if (error || freeit) 4023 xlog_recover_free_trans(trans); 4024 return error; 4025 } 4026 4027 /* 4028 * Lookup the transaction recovery structure associated with the ID in the 4029 * current ophdr. If the transaction doesn't exist and the start flag is set in 4030 * the ophdr, then allocate a new transaction for future ID matches to find. 4031 * Either way, return what we found during the lookup - an existing transaction 4032 * or nothing. 4033 */ 4034 STATIC struct xlog_recover * 4035 xlog_recover_ophdr_to_trans( 4036 struct hlist_head rhash[], 4037 struct xlog_rec_header *rhead, 4038 struct xlog_op_header *ohead) 4039 { 4040 struct xlog_recover *trans; 4041 xlog_tid_t tid; 4042 struct hlist_head *rhp; 4043 4044 tid = be32_to_cpu(ohead->oh_tid); 4045 rhp = &rhash[XLOG_RHASH(tid)]; 4046 hlist_for_each_entry(trans, rhp, r_list) { 4047 if (trans->r_log_tid == tid) 4048 return trans; 4049 } 4050 4051 /* 4052 * skip over non-start transaction headers - we could be 4053 * processing slack space before the next transaction starts 4054 */ 4055 if (!(ohead->oh_flags & XLOG_START_TRANS)) 4056 return NULL; 4057 4058 ASSERT(be32_to_cpu(ohead->oh_len) == 0); 4059 4060 /* 4061 * This is a new transaction so allocate a new recovery container to 4062 * hold the recovery ops that will follow. 4063 */ 4064 trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); 4065 trans->r_log_tid = tid; 4066 trans->r_lsn = be64_to_cpu(rhead->h_lsn); 4067 INIT_LIST_HEAD(&trans->r_itemq); 4068 INIT_HLIST_NODE(&trans->r_list); 4069 hlist_add_head(&trans->r_list, rhp); 4070 4071 /* 4072 * Nothing more to do for this ophdr. Items to be added to this new 4073 * transaction will be in subsequent ophdr containers. 4074 */ 4075 return NULL; 4076 } 4077 4078 STATIC int 4079 xlog_recover_process_ophdr( 4080 struct xlog *log, 4081 struct hlist_head rhash[], 4082 struct xlog_rec_header *rhead, 4083 struct xlog_op_header *ohead, 4084 char *dp, 4085 char *end, 4086 int pass) 4087 { 4088 struct xlog_recover *trans; 4089 unsigned int len; 4090 4091 /* Do we understand who wrote this op?
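 * Only XFS_TRANSACTION and XFS_LOG client IDs are ever written by the kernel, so anything else means the ophdr is garbage.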
*/ 4092 if (ohead->oh_clientid != XFS_TRANSACTION && 4093 ohead->oh_clientid != XFS_LOG) { 4094 xfs_warn(log->l_mp, "%s: bad clientid 0x%x", 4095 __func__, ohead->oh_clientid); 4096 ASSERT(0); 4097 return -EIO; 4098 } 4099 4100 /* 4101 * Check the ophdr contains all the data it is supposed to contain. 4102 */ 4103 len = be32_to_cpu(ohead->oh_len); 4104 if (dp + len > end) { 4105 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); 4106 WARN_ON(1); 4107 return -EIO; 4108 } 4109 4110 trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); 4111 if (!trans) { 4112 /* nothing to do, so skip over this ophdr */ 4113 return 0; 4114 } 4115 4116 return xlog_recovery_process_trans(log, trans, dp, len, 4117 ohead->oh_flags, pass); 4118 } 4119 4120 /* 4121 * There are two valid states of the r_state field. 0 indicates that the 4122 * transaction structure is in a normal state. We have either seen the 4123 * start of the transaction or the last operation we added was not a partial 4124 * operation. If the last operation we added to the transaction was a 4125 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. 4126 * 4127 * NOTE: skip LRs with 0 data length. 4128 */ 4129 STATIC int 4130 xlog_recover_process_data( 4131 struct xlog *log, 4132 struct hlist_head rhash[], 4133 struct xlog_rec_header *rhead, 4134 char *dp, 4135 int pass) 4136 { 4137 struct xlog_op_header *ohead; 4138 char *end; 4139 int num_logops; 4140 int error; 4141 4142 end = dp + be32_to_cpu(rhead->h_len); 4143 num_logops = be32_to_cpu(rhead->h_num_logops); 4144 4145 /* check the log format matches our own - else we can't recover */ 4146 if (xlog_header_check_recover(log->l_mp, rhead)) 4147 return -EIO; 4148 4149 while ((dp < end) && num_logops) { 4150 4151 ohead = (struct xlog_op_header *)dp; 4152 dp += sizeof(*ohead); 4153 ASSERT(dp <= end); 4154 4155 /* errors will abort recovery */ 4156 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, 4157 dp, end, pass); 4158 if (error) 4159 return error; 4160 4161 dp += be32_to_cpu(ohead->oh_len); 4162 num_logops--; 4163 } 4164 return 0; 4165 } 4166 4167 /* 4168 * Process an extent free intent item that was recovered from 4169 * the log. We need to free the extents that it describes. 4170 */ 4171 STATIC int 4172 xlog_recover_process_efi( 4173 xfs_mount_t *mp, 4174 xfs_efi_log_item_t *efip) 4175 { 4176 xfs_efd_log_item_t *efdp; 4177 xfs_trans_t *tp; 4178 int i; 4179 int error = 0; 4180 xfs_extent_t *extp; 4181 xfs_fsblock_t startblock_fsb; 4182 4183 ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); 4184 4185 /* 4186 * First check the validity of the extents described by the 4187 * EFI. If any are bad, then assume that all are bad and 4188 * just toss the EFI. 4189 */ 4190 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 4191 extp = &(efip->efi_format.efi_extents[i]); 4192 startblock_fsb = XFS_BB_TO_FSB(mp, 4193 XFS_FSB_TO_DADDR(mp, extp->ext_start)); 4194 if ((startblock_fsb == 0) || 4195 (extp->ext_len == 0) || 4196 (startblock_fsb >= mp->m_sb.sb_dblocks) || 4197 (extp->ext_len >= mp->m_sb.sb_agblocks)) { 4198 /* 4199 * This will pull the EFI from the AIL and 4200 * free the memory associated with it. 
4201 */ 4202 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); 4203 xfs_efi_release(efip); 4204 return -EIO; 4205 } 4206 } 4207 4208 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); 4209 if (error) 4210 return error; 4211 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); 4212 4213 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 4214 extp = &(efip->efi_format.efi_extents[i]); 4215 error = xfs_trans_free_extent(tp, efdp, extp->ext_start, 4216 extp->ext_len); 4217 if (error) 4218 goto abort_error; 4219 4220 } 4221 4222 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); 4223 error = xfs_trans_commit(tp); 4224 return error; 4225 4226 abort_error: 4227 xfs_trans_cancel(tp); 4228 return error; 4229 } 4230 4231 /* 4232 * When this is called, all of the EFIs which did not have 4233 * corresponding EFDs should be in the AIL. What we do now 4234 * is free the extents associated with each one. 4235 * 4236 * Since we process the EFIs in normal transactions, they 4237 * will be removed at some point after the commit. This prevents 4238 * us from just walking down the list processing each one. 4239 * We'll use a flag in the EFI to skip those that we've already 4240 * processed and use the AIL iteration mechanism's generation 4241 * count to try to speed this up at least a bit. 4242 * 4243 * When we start, we know that the EFIs are the only things in 4244 * the AIL. As we process them, however, other items are added 4245 * to the AIL. Since everything added to the AIL must come after 4246 * everything already in the AIL, we stop processing as soon as 4247 * we see something other than an EFI in the AIL. 4248 */ 4249 STATIC int 4250 xlog_recover_process_efis( 4251 struct xlog *log) 4252 { 4253 struct xfs_log_item *lip; 4254 struct xfs_efi_log_item *efip; 4255 int error = 0; 4256 struct xfs_ail_cursor cur; 4257 struct xfs_ail *ailp; 4258 4259 ailp = log->l_ailp; 4260 spin_lock(&ailp->xa_lock); 4261 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 4262 while (lip != NULL) { 4263 /* 4264 * We're done when we see something other than an EFI. 4265 * There should be no EFIs left in the AIL now. 4266 */ 4267 if (lip->li_type != XFS_LI_EFI) { 4268 #ifdef DEBUG 4269 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) 4270 ASSERT(lip->li_type != XFS_LI_EFI); 4271 #endif 4272 break; 4273 } 4274 4275 /* 4276 * Skip EFIs that we've already processed. 4277 */ 4278 efip = container_of(lip, struct xfs_efi_log_item, efi_item); 4279 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { 4280 lip = xfs_trans_ail_cursor_next(ailp, &cur); 4281 continue; 4282 } 4283 4284 spin_unlock(&ailp->xa_lock); 4285 error = xlog_recover_process_efi(log->l_mp, efip); 4286 spin_lock(&ailp->xa_lock); 4287 if (error) 4288 goto out; 4289 lip = xfs_trans_ail_cursor_next(ailp, &cur); 4290 } 4291 out: 4292 xfs_trans_ail_cursor_done(&cur); 4293 spin_unlock(&ailp->xa_lock); 4294 return error; 4295 } 4296 4297 /* 4298 * A cancel occurs when the mount has failed and we're bailing out. Release all 4299 * pending EFIs so they don't pin the AIL. 4300 */ 4301 STATIC int 4302 xlog_recover_cancel_efis( 4303 struct xlog *log) 4304 { 4305 struct xfs_log_item *lip; 4306 struct xfs_efi_log_item *efip; 4307 int error = 0; 4308 struct xfs_ail_cursor cur; 4309 struct xfs_ail *ailp; 4310 4311 ailp = log->l_ailp; 4312 spin_lock(&ailp->xa_lock); 4313 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 4314 while (lip != NULL) { 4315 /* 4316 * We're done when we see something other than an EFI. 
4317 * There should be no EFIs left in the AIL now. 4318 */ 4319 if (lip->li_type != XFS_LI_EFI) { 4320 #ifdef DEBUG 4321 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) 4322 ASSERT(lip->li_type != XFS_LI_EFI); 4323 #endif 4324 break; 4325 } 4326 4327 efip = container_of(lip, struct xfs_efi_log_item, efi_item); 4328 4329 spin_unlock(&ailp->xa_lock); 4330 xfs_efi_release(efip); 4331 spin_lock(&ailp->xa_lock); 4332 4333 lip = xfs_trans_ail_cursor_next(ailp, &cur); 4334 } 4335 4336 xfs_trans_ail_cursor_done(&cur); 4337 spin_unlock(&ailp->xa_lock); 4338 return error; 4339 } 4340 4341 /* 4342 * This routine performs a transaction to null out a bad inode pointer 4343 * in an agi unlinked inode hash bucket. 4344 */ 4345 STATIC void 4346 xlog_recover_clear_agi_bucket( 4347 xfs_mount_t *mp, 4348 xfs_agnumber_t agno, 4349 int bucket) 4350 { 4351 xfs_trans_t *tp; 4352 xfs_agi_t *agi; 4353 xfs_buf_t *agibp; 4354 int offset; 4355 int error; 4356 4357 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp); 4358 if (error) 4359 goto out_error; 4360 4361 error = xfs_read_agi(mp, tp, agno, &agibp); 4362 if (error) 4363 goto out_abort; 4364 4365 agi = XFS_BUF_TO_AGI(agibp); 4366 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 4367 offset = offsetof(xfs_agi_t, agi_unlinked) + 4368 (sizeof(xfs_agino_t) * bucket); 4369 xfs_trans_log_buf(tp, agibp, offset, 4370 (offset + sizeof(xfs_agino_t) - 1)); 4371 4372 error = xfs_trans_commit(tp); 4373 if (error) 4374 goto out_error; 4375 return; 4376 4377 out_abort: 4378 xfs_trans_cancel(tp); 4379 out_error: 4380 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); 4381 return; 4382 } 4383 4384 STATIC xfs_agino_t 4385 xlog_recover_process_one_iunlink( 4386 struct xfs_mount *mp, 4387 xfs_agnumber_t agno, 4388 xfs_agino_t agino, 4389 int bucket) 4390 { 4391 struct xfs_buf *ibp; 4392 struct xfs_dinode *dip; 4393 struct xfs_inode *ip; 4394 xfs_ino_t ino; 4395 int error; 4396 4397 ino = XFS_AGINO_TO_INO(mp, agno, agino); 4398 error = xfs_iget(mp, NULL, ino, 0, 0, &ip); 4399 if (error) 4400 goto fail; 4401 4402 /* 4403 * Get the on disk inode to find the next inode in the bucket. 4404 */ 4405 error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); 4406 if (error) 4407 goto fail_iput; 4408 4409 ASSERT(VFS_I(ip)->i_nlink == 0); 4410 ASSERT(VFS_I(ip)->i_mode != 0); 4411 4412 /* setup for the next pass */ 4413 agino = be32_to_cpu(dip->di_next_unlinked); 4414 xfs_buf_relse(ibp); 4415 4416 /* 4417 * Prevent any DMAPI event from being sent when the reference on 4418 * the inode is dropped. 4419 */ 4420 ip->i_d.di_dmevmask = 0; 4421 4422 IRELE(ip); 4423 return agino; 4424 4425 fail_iput: 4426 IRELE(ip); 4427 fail: 4428 /* 4429 * We can't read in the inode this bucket points to, or this inode 4430 * is messed up. Just ditch this bucket of inodes. We will lose 4431 * some inodes and space, but at least we won't hang. 4432 * 4433 * Call xlog_recover_clear_agi_bucket() to perform a transaction to 4434 * clear the inode pointer in the bucket. 4435 */ 4436 xlog_recover_clear_agi_bucket(mp, agno, bucket); 4437 return NULLAGINO; 4438 } 4439 4440 /* 4441 * xlog_iunlink_recover 4442 * 4443 * This is called during recovery to process any inodes which 4444 * we unlinked but not freed when the system crashed. These 4445 * inodes will be on the lists in the AGI blocks. What we do 4446 * here is scan all the AGIs and fully truncate and free any 4447 * inodes found on the lists. 
Each inode is removed from the 4448 * lists when it has been fully truncated and is freed. The 4449 * freeing of the inode and its removal from the list must be 4450 * atomic. 4451 */ 4452 STATIC void 4453 xlog_recover_process_iunlinks( 4454 struct xlog *log) 4455 { 4456 xfs_mount_t *mp; 4457 xfs_agnumber_t agno; 4458 xfs_agi_t *agi; 4459 xfs_buf_t *agibp; 4460 xfs_agino_t agino; 4461 int bucket; 4462 int error; 4463 uint mp_dmevmask; 4464 4465 mp = log->l_mp; 4466 4467 /* 4468 * Prevent any DMAPI event from being sent while in this function. 4469 */ 4470 mp_dmevmask = mp->m_dmevmask; 4471 mp->m_dmevmask = 0; 4472 4473 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 4474 /* 4475 * Find the agi for this ag. 4476 */ 4477 error = xfs_read_agi(mp, NULL, agno, &agibp); 4478 if (error) { 4479 /* 4480 * AGI is b0rked. Don't process it. 4481 * 4482 * We should probably mark the filesystem as corrupt 4483 * after we've recovered all the ag's we can.... 4484 */ 4485 continue; 4486 } 4487 /* 4488 * Unlock the buffer so that it can be acquired in the normal 4489 * course of the transaction to truncate and free each inode. 4490 * Because we are not racing with anyone else here for the AGI 4491 * buffer, we don't even need to hold it locked to read the 4492 * initial unlinked bucket entries out of the buffer. We keep 4493 * a buffer reference though, so that it stays pinned in memory 4494 * while we need the buffer. 4495 */ 4496 agi = XFS_BUF_TO_AGI(agibp); 4497 xfs_buf_unlock(agibp); 4498 4499 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 4500 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 4501 while (agino != NULLAGINO) { 4502 agino = xlog_recover_process_one_iunlink(mp, 4503 agno, agino, bucket); 4504 } 4505 } 4506 xfs_buf_rele(agibp); 4507 } 4508 4509 mp->m_dmevmask = mp_dmevmask; 4510 } 4511 4512 STATIC int 4513 xlog_unpack_data( 4514 struct xlog_rec_header *rhead, 4515 char *dp, 4516 struct xlog *log) 4517 { 4518 int i, j, k; 4519 4520 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 4521 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 4522 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; 4523 dp += BBSIZE; 4524 } 4525 4526 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 4527 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; 4528 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 4529 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 4530 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 4531 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; 4532 dp += BBSIZE; 4533 } 4534 } 4535 4536 return 0; 4537 } 4538 4539 /* 4540 * CRC check, unpack and process a log record. 4541 */ 4542 STATIC int 4543 xlog_recover_process( 4544 struct xlog *log, 4545 struct hlist_head rhash[], 4546 struct xlog_rec_header *rhead, 4547 char *dp, 4548 int pass) 4549 { 4550 int error; 4551 __le32 crc; 4552 4553 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); 4554 4555 /* 4556 * Nothing else to do if this is a CRC verification pass. Just return 4557 * if this is a record with a non-zero crc. Unfortunately, mkfs always 4558 * sets h_crc to 0 so we must consider this valid even on v5 supers. 4559 * Otherwise, return EFSBADCRC on failure so the callers up the stack 4560 * know precisely what failed. 4561 */ 4562 if (pass == XLOG_RECOVER_CRCPASS) { 4563 if (rhead->h_crc && crc != rhead->h_crc) 4564 return -EFSBADCRC; 4565 return 0; 4566 } 4567 4568 /* 4569 * We're in the normal recovery path. Issue a warning if and only if the 4570 * CRC in the header is non-zero.
This is an advisory warning and the 4571 * zero CRC check prevents warnings from being emitted when upgrading 4572 * the kernel from one that does not add CRCs by default. 4573 */ 4574 if (crc != rhead->h_crc) { 4575 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { 4576 xfs_alert(log->l_mp, 4577 "log record CRC mismatch: found 0x%x, expected 0x%x.", 4578 le32_to_cpu(rhead->h_crc), 4579 le32_to_cpu(crc)); 4580 xfs_hex_dump(dp, 32); 4581 } 4582 4583 /* 4584 * If the filesystem is CRC enabled, this mismatch becomes a 4585 * fatal log corruption failure. 4586 */ 4587 if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) 4588 return -EFSCORRUPTED; 4589 } 4590 4591 error = xlog_unpack_data(rhead, dp, log); 4592 if (error) 4593 return error; 4594 4595 return xlog_recover_process_data(log, rhash, rhead, dp, pass); 4596 } 4597 4598 STATIC int 4599 xlog_valid_rec_header( 4600 struct xlog *log, 4601 struct xlog_rec_header *rhead, 4602 xfs_daddr_t blkno) 4603 { 4604 int hlen; 4605 4606 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { 4607 XFS_ERROR_REPORT("xlog_valid_rec_header(1)", 4608 XFS_ERRLEVEL_LOW, log->l_mp); 4609 return -EFSCORRUPTED; 4610 } 4611 if (unlikely( 4612 (!rhead->h_version || 4613 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 4614 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", 4615 __func__, be32_to_cpu(rhead->h_version)); 4616 return -EIO; 4617 } 4618 4619 /* LR body must have data or it wouldn't have been written */ 4620 hlen = be32_to_cpu(rhead->h_len); 4621 if (unlikely( hlen <= 0 || hlen > INT_MAX )) { 4622 XFS_ERROR_REPORT("xlog_valid_rec_header(2)", 4623 XFS_ERRLEVEL_LOW, log->l_mp); 4624 return -EFSCORRUPTED; 4625 } 4626 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { 4627 XFS_ERROR_REPORT("xlog_valid_rec_header(3)", 4628 XFS_ERRLEVEL_LOW, log->l_mp); 4629 return -EFSCORRUPTED; 4630 } 4631 return 0; 4632 } 4633 4634 /* 4635 * Read the log from tail to head and process the log records found. 4636 * Handle the two cases where the tail and head are in the same cycle 4637 * and where the active portion of the log wraps around the end of 4638 * the physical log separately. The pass parameter is passed through 4639 * to the routines called to process the data and is not looked at 4640 * here. 4641 */ 4642 STATIC int 4643 xlog_do_recovery_pass( 4644 struct xlog *log, 4645 xfs_daddr_t head_blk, 4646 xfs_daddr_t tail_blk, 4647 int pass, 4648 xfs_daddr_t *first_bad) /* out: first bad log rec */ 4649 { 4650 xlog_rec_header_t *rhead; 4651 xfs_daddr_t blk_no; 4652 xfs_daddr_t rhead_blk; 4653 char *offset; 4654 xfs_buf_t *hbp, *dbp; 4655 int error = 0, h_size, h_len; 4656 int bblks, split_bblks; 4657 int hblks, split_hblks, wrapped_hblks; 4658 struct hlist_head rhash[XLOG_RHASH_SIZE]; 4659 4660 ASSERT(head_blk != tail_blk); 4661 rhead_blk = 0; 4662 4663 /* 4664 * Read the header of the tail block and get the iclog buffer size from 4665 * h_size. Use this to tell how many sectors make up the log header. 4666 */ 4667 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 4668 /* 4669 * When using variable length iclogs, read first sector of 4670 * iclog header and extract the header size from it. Get a 4671 * new hbp that is the correct size. 
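 * The record header is validated before h_size is used, and h_size may still be overridden below with the mount's log buffer size to cope with the xfsprogs sizing bug described there.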
4672 */ 4673 hbp = xlog_get_bp(log, 1); 4674 if (!hbp) 4675 return -ENOMEM; 4676 4677 error = xlog_bread(log, tail_blk, 1, hbp, &offset); 4678 if (error) 4679 goto bread_err1; 4680 4681 rhead = (xlog_rec_header_t *)offset; 4682 error = xlog_valid_rec_header(log, rhead, tail_blk); 4683 if (error) 4684 goto bread_err1; 4685 4686 /* 4687 * xfsprogs has a bug where record length is based on lsunit but 4688 * h_size (iclog size) is hardcoded to 32k. Now that we 4689 * unconditionally CRC verify the unmount record, this means the 4690 * log buffer can be too small for the record and cause an 4691 * overrun. 4692 * 4693 * Detect this condition here. Use lsunit for the buffer size as 4694 * long as this looks like the mkfs case. Otherwise, return an 4695 * error to avoid a buffer overrun. 4696 */ 4697 h_size = be32_to_cpu(rhead->h_size); 4698 h_len = be32_to_cpu(rhead->h_len); 4699 if (h_len > h_size) { 4700 if (h_len <= log->l_mp->m_logbsize && 4701 be32_to_cpu(rhead->h_num_logops) == 1) { 4702 xfs_warn(log->l_mp, 4703 "invalid iclog size (%d bytes), using lsunit (%d bytes)", 4704 h_size, log->l_mp->m_logbsize); 4705 h_size = log->l_mp->m_logbsize; 4706 } else 4707 return -EFSCORRUPTED; 4708 } 4709 4710 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && 4711 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 4712 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 4713 if (h_size % XLOG_HEADER_CYCLE_SIZE) 4714 hblks++; 4715 xlog_put_bp(hbp); 4716 hbp = xlog_get_bp(log, hblks); 4717 } else { 4718 hblks = 1; 4719 } 4720 } else { 4721 ASSERT(log->l_sectBBsize == 1); 4722 hblks = 1; 4723 hbp = xlog_get_bp(log, 1); 4724 h_size = XLOG_BIG_RECORD_BSIZE; 4725 } 4726 4727 if (!hbp) 4728 return -ENOMEM; 4729 dbp = xlog_get_bp(log, BTOBB(h_size)); 4730 if (!dbp) { 4731 xlog_put_bp(hbp); 4732 return -ENOMEM; 4733 } 4734 4735 memset(rhash, 0, sizeof(rhash)); 4736 blk_no = rhead_blk = tail_blk; 4737 if (tail_blk > head_blk) { 4738 /* 4739 * Perform recovery around the end of the physical log. 4740 * When the head is not on the same cycle number as the tail, 4741 * we can't do a sequential recovery. 4742 */ 4743 while (blk_no < log->l_logBBsize) { 4744 /* 4745 * Check for header wrapping around physical end-of-log 4746 */ 4747 offset = hbp->b_addr; 4748 split_hblks = 0; 4749 wrapped_hblks = 0; 4750 if (blk_no + hblks <= log->l_logBBsize) { 4751 /* Read header in one read */ 4752 error = xlog_bread(log, blk_no, hblks, hbp, 4753 &offset); 4754 if (error) 4755 goto bread_err2; 4756 } else { 4757 /* This LR is split across physical log end */ 4758 if (blk_no != log->l_logBBsize) { 4759 /* some data before physical log end */ 4760 ASSERT(blk_no <= INT_MAX); 4761 split_hblks = log->l_logBBsize - (int)blk_no; 4762 ASSERT(split_hblks > 0); 4763 error = xlog_bread(log, blk_no, 4764 split_hblks, hbp, 4765 &offset); 4766 if (error) 4767 goto bread_err2; 4768 } 4769 4770 /* 4771 * Note: this black magic still works with 4772 * large sector sizes (non-512) only because: 4773 * - we increased the buffer size originally 4774 * by 1 sector giving us enough extra space 4775 * for the second read; 4776 * - the log start is guaranteed to be sector 4777 * aligned; 4778 * - we read the log end (LR header start) 4779 * _first_, then the log start (LR header end) 4780 * - order is important. 
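 * The same wrapped read is repeated further down for the record body when it straddles the physical end of the log.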
4781 */ 4782 wrapped_hblks = hblks - split_hblks; 4783 error = xlog_bread_offset(log, 0, 4784 wrapped_hblks, hbp, 4785 offset + BBTOB(split_hblks)); 4786 if (error) 4787 goto bread_err2; 4788 } 4789 rhead = (xlog_rec_header_t *)offset; 4790 error = xlog_valid_rec_header(log, rhead, 4791 split_hblks ? blk_no : 0); 4792 if (error) 4793 goto bread_err2; 4794 4795 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 4796 blk_no += hblks; 4797 4798 /* Read in data for log record */ 4799 if (blk_no + bblks <= log->l_logBBsize) { 4800 error = xlog_bread(log, blk_no, bblks, dbp, 4801 &offset); 4802 if (error) 4803 goto bread_err2; 4804 } else { 4805 /* This log record is split across the 4806 * physical end of log */ 4807 offset = dbp->b_addr; 4808 split_bblks = 0; 4809 if (blk_no != log->l_logBBsize) { 4810 /* some data is before the physical 4811 * end of log */ 4812 ASSERT(!wrapped_hblks); 4813 ASSERT(blk_no <= INT_MAX); 4814 split_bblks = 4815 log->l_logBBsize - (int)blk_no; 4816 ASSERT(split_bblks > 0); 4817 error = xlog_bread(log, blk_no, 4818 split_bblks, dbp, 4819 &offset); 4820 if (error) 4821 goto bread_err2; 4822 } 4823 4824 /* 4825 * Note: this black magic still works with 4826 * large sector sizes (non-512) only because: 4827 * - we increased the buffer size originally 4828 * by 1 sector giving us enough extra space 4829 * for the second read; 4830 * - the log start is guaranteed to be sector 4831 * aligned; 4832 * - we read the log end (LR header start) 4833 * _first_, then the log start (LR header end) 4834 * - order is important. 4835 */ 4836 error = xlog_bread_offset(log, 0, 4837 bblks - split_bblks, dbp, 4838 offset + BBTOB(split_bblks)); 4839 if (error) 4840 goto bread_err2; 4841 } 4842 4843 error = xlog_recover_process(log, rhash, rhead, offset, 4844 pass); 4845 if (error) 4846 goto bread_err2; 4847 4848 blk_no += bblks; 4849 rhead_blk = blk_no; 4850 } 4851 4852 ASSERT(blk_no >= log->l_logBBsize); 4853 blk_no -= log->l_logBBsize; 4854 rhead_blk = blk_no; 4855 } 4856 4857 /* read first part of physical log */ 4858 while (blk_no < head_blk) { 4859 error = xlog_bread(log, blk_no, hblks, hbp, &offset); 4860 if (error) 4861 goto bread_err2; 4862 4863 rhead = (xlog_rec_header_t *)offset; 4864 error = xlog_valid_rec_header(log, rhead, blk_no); 4865 if (error) 4866 goto bread_err2; 4867 4868 /* blocks in data section */ 4869 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 4870 error = xlog_bread(log, blk_no+hblks, bblks, dbp, 4871 &offset); 4872 if (error) 4873 goto bread_err2; 4874 4875 error = xlog_recover_process(log, rhash, rhead, offset, pass); 4876 if (error) 4877 goto bread_err2; 4878 4879 blk_no += bblks + hblks; 4880 rhead_blk = blk_no; 4881 } 4882 4883 bread_err2: 4884 xlog_put_bp(dbp); 4885 bread_err1: 4886 xlog_put_bp(hbp); 4887 4888 if (error && first_bad) 4889 *first_bad = rhead_blk; 4890 4891 return error; 4892 } 4893 4894 /* 4895 * Do the recovery of the log. We actually do this in two phases. 4896 * The two passes are necessary in order to implement the function 4897 * of cancelling a record written into the log. The first pass 4898 * determines those things which have been cancelled, and the 4899 * second pass replays log items normally except for those which 4900 * have been cancelled. The handling of the replay and cancellations 4901 * takes place in the log item type specific routines. 
4902 * 4903 * The table of items which have cancel records in the log is allocated 4904 * and freed at this level, since only here do we know when all of 4905 * the log recovery has been completed. 4906 */ 4907 STATIC int 4908 xlog_do_log_recovery( 4909 struct xlog *log, 4910 xfs_daddr_t head_blk, 4911 xfs_daddr_t tail_blk) 4912 { 4913 int error, i; 4914 4915 ASSERT(head_blk != tail_blk); 4916 4917 /* 4918 * First do a pass to find all of the cancelled buf log items. 4919 * Store them in the buf_cancel_table for use in the second pass. 4920 */ 4921 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * 4922 sizeof(struct list_head), 4923 KM_SLEEP); 4924 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 4925 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 4926 4927 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 4928 XLOG_RECOVER_PASS1, NULL); 4929 if (error != 0) { 4930 kmem_free(log->l_buf_cancel_table); 4931 log->l_buf_cancel_table = NULL; 4932 return error; 4933 } 4934 /* 4935 * Then do a second pass to actually recover the items in the log. 4936 * When it is complete free the table of buf cancel items. 4937 */ 4938 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 4939 XLOG_RECOVER_PASS2, NULL); 4940 #ifdef DEBUG 4941 if (!error) { 4942 int i; 4943 4944 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 4945 ASSERT(list_empty(&log->l_buf_cancel_table[i])); 4946 } 4947 #endif /* DEBUG */ 4948 4949 kmem_free(log->l_buf_cancel_table); 4950 log->l_buf_cancel_table = NULL; 4951 4952 return error; 4953 } 4954 4955 /* 4956 * Do the actual recovery 4957 */ 4958 STATIC int 4959 xlog_do_recover( 4960 struct xlog *log, 4961 xfs_daddr_t head_blk, 4962 xfs_daddr_t tail_blk) 4963 { 4964 struct xfs_mount *mp = log->l_mp; 4965 int error; 4966 xfs_buf_t *bp; 4967 xfs_sb_t *sbp; 4968 4969 /* 4970 * First replay the images in the log. 4971 */ 4972 error = xlog_do_log_recovery(log, head_blk, tail_blk); 4973 if (error) 4974 return error; 4975 4976 /* 4977 * If IO errors happened during recovery, bail out. 4978 */ 4979 if (XFS_FORCED_SHUTDOWN(mp)) { 4980 return -EIO; 4981 } 4982 4983 /* 4984 * We now update the tail_lsn since much of the recovery has completed 4985 * and there may be space available to use. If there were no extent frees 4986 * or iunlinks, we can free up the entire log and set the tail_lsn to 4987 * be the last_sync_lsn. This was set in xlog_find_tail to be the 4988 * lsn of the last known good LR on disk. If there are extent frees 4989 * or iunlinks they will have some entries in the AIL; so we look at 4990 * the AIL to determine how to set the tail_lsn. 4991 */ 4992 xlog_assign_tail_lsn(mp); 4993 4994 /* 4995 * Now that we've finished replaying all buffer and inode 4996 * updates, re-read in the superblock and reverify it.
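 * The read below goes through the normal superblock verifier (xfs_sb_buf_ops), so any corruption introduced by replay is caught before the in-core superblock and per-AG structures are rebuilt from it.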
4997 */ 4998 bp = xfs_getsb(mp, 0); 4999 bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); 5000 ASSERT(!(bp->b_flags & XBF_WRITE)); 5001 bp->b_flags |= XBF_READ; 5002 bp->b_ops = &xfs_sb_buf_ops; 5003 5004 error = xfs_buf_submit_wait(bp); 5005 if (error) { 5006 if (!XFS_FORCED_SHUTDOWN(mp)) { 5007 xfs_buf_ioerror_alert(bp, __func__); 5008 ASSERT(0); 5009 } 5010 xfs_buf_relse(bp); 5011 return error; 5012 } 5013 5014 /* Convert superblock from on-disk format */ 5015 sbp = &mp->m_sb; 5016 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 5017 xfs_buf_relse(bp); 5018 5019 /* re-initialise in-core superblock and geometry structures */ 5020 xfs_reinit_percpu_counters(mp); 5021 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 5022 if (error) { 5023 xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); 5024 return error; 5025 } 5026 5027 xlog_recover_check_summary(log); 5028 5029 /* Normal transactions can now occur */ 5030 log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 5031 return 0; 5032 } 5033 5034 /* 5035 * Perform recovery and re-initialize some log variables in xlog_find_tail. 5036 * 5037 * Return error or zero. 5038 */ 5039 int 5040 xlog_recover( 5041 struct xlog *log) 5042 { 5043 xfs_daddr_t head_blk, tail_blk; 5044 int error; 5045 5046 /* find the tail of the log */ 5047 error = xlog_find_tail(log, &head_blk, &tail_blk); 5048 if (error) 5049 return error; 5050 5051 /* 5052 * The superblock was read before the log was available and thus the LSN 5053 * could not be verified. Check the superblock LSN against the current 5054 * LSN now that it's known. 5055 */ 5056 if (xfs_sb_version_hascrc(&log->l_mp->m_sb) && 5057 !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn)) 5058 return -EINVAL; 5059 5060 if (tail_blk != head_blk) { 5061 /* There used to be a comment here: 5062 * 5063 * disallow recovery on read-only mounts. note -- mount 5064 * checks for ENOSPC and turns it into an intelligent 5065 * error message. 5066 * ...but this is no longer true. Now, unless you specify 5067 * NORECOVERY (in which case this function would never be 5068 * called), we just go ahead and recover. We do this all 5069 * under the vfs layer, so we can get away with it unless 5070 * the device itself is read-only, in which case we fail. 5071 */ 5072 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { 5073 return error; 5074 } 5075 5076 /* 5077 * Version 5 superblock log feature mask validation. We know the 5078 * log is dirty so check if there are any unknown log features 5079 * in what we need to recover. If there are unknown features 5080 * (e.g. unsupported transactions), then simply reject the 5081 * attempt at recovery before touching anything. 5082 */ 5083 if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && 5084 xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, 5085 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { 5086 xfs_warn(log->l_mp, 5087 "Superblock has unknown incompatible log features (0x%x) enabled.", 5088 (log->l_mp->m_sb.sb_features_log_incompat & 5089 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); 5090 xfs_warn(log->l_mp, 5091 "The log can not be fully and/or safely recovered by this kernel."); 5092 xfs_warn(log->l_mp, 5093 "Please recover the log on a kernel that supports the unknown features."); 5094 return -EINVAL; 5095 } 5096 5097 /* 5098 * Delay log recovery if the debug hook is set. This is debug 5099 * instrumentation to coordinate simulation of I/O failures with 5100 * log recovery.
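 * The delay value is in seconds; a notice is emitted and then we simply sleep for that long before starting replay.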
5101 */ 5102 if (xfs_globals.log_recovery_delay) { 5103 xfs_notice(log->l_mp, 5104 "Delaying log recovery for %d seconds.", 5105 xfs_globals.log_recovery_delay); 5106 msleep(xfs_globals.log_recovery_delay * 1000); 5107 } 5108 5109 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", 5110 log->l_mp->m_logname ? log->l_mp->m_logname 5111 : "internal"); 5112 5113 error = xlog_do_recover(log, head_blk, tail_blk); 5114 log->l_flags |= XLOG_RECOVERY_NEEDED; 5115 } 5116 return error; 5117 } 5118 5119 /* 5120 * In the first part of recovery we replay inodes and buffers and build 5121 * up the list of extent free items which need to be processed. Here 5122 * we process the extent free items and clean up the on disk unlinked 5123 * inode lists. This is separated from the first part of recovery so 5124 * that the root and real-time bitmap inodes can be read in from disk in 5125 * between the two stages. This is necessary so that we can free space 5126 * in the real-time portion of the file system. 5127 */ 5128 int 5129 xlog_recover_finish( 5130 struct xlog *log) 5131 { 5132 /* 5133 * Now we're ready to do the transactions needed for the 5134 * rest of recovery. Start with completing all the extent 5135 * free intent records and then process the unlinked inode 5136 * lists. At this point, we essentially run in normal mode 5137 * except that we're still performing recovery actions 5138 * rather than accepting new requests. 5139 */ 5140 if (log->l_flags & XLOG_RECOVERY_NEEDED) { 5141 int error; 5142 error = xlog_recover_process_efis(log); 5143 if (error) { 5144 xfs_alert(log->l_mp, "Failed to recover EFIs"); 5145 return error; 5146 } 5147 /* 5148 * Sync the log to get all the EFIs out of the AIL. 5149 * This isn't absolutely necessary, but it helps in 5150 * case the unlink transactions would have problems 5151 * pushing the EFIs out of the way. 5152 */ 5153 xfs_log_force(log->l_mp, XFS_LOG_SYNC); 5154 5155 xlog_recover_process_iunlinks(log); 5156 5157 xlog_recover_check_summary(log); 5158 5159 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", 5160 log->l_mp->m_logname ? log->l_mp->m_logname 5161 : "internal"); 5162 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 5163 } else { 5164 xfs_info(log->l_mp, "Ending clean mount"); 5165 } 5166 return 0; 5167 } 5168 5169 int 5170 xlog_recover_cancel( 5171 struct xlog *log) 5172 { 5173 int error = 0; 5174 5175 if (log->l_flags & XLOG_RECOVERY_NEEDED) 5176 error = xlog_recover_cancel_efis(log); 5177 5178 return error; 5179 } 5180 5181 #if defined(DEBUG) 5182 /* 5183 * Read all of the agf and agi counters and check that they 5184 * are consistent with the superblock counters. 
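 * (Note that the loop below only sums the per-AG values and warns about failed AGF/AGI reads; it does not explicitly compare the totals against the in-core superblock counters.)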
5185 */ 5186 void 5187 xlog_recover_check_summary( 5188 struct xlog *log) 5189 { 5190 xfs_mount_t *mp; 5191 xfs_agf_t *agfp; 5192 xfs_buf_t *agfbp; 5193 xfs_buf_t *agibp; 5194 xfs_agnumber_t agno; 5195 __uint64_t freeblks; 5196 __uint64_t itotal; 5197 __uint64_t ifree; 5198 int error; 5199 5200 mp = log->l_mp; 5201 5202 freeblks = 0LL; 5203 itotal = 0LL; 5204 ifree = 0LL; 5205 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 5206 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 5207 if (error) { 5208 xfs_alert(mp, "%s agf read failed agno %d error %d", 5209 __func__, agno, error); 5210 } else { 5211 agfp = XFS_BUF_TO_AGF(agfbp); 5212 freeblks += be32_to_cpu(agfp->agf_freeblks) + 5213 be32_to_cpu(agfp->agf_flcount); 5214 xfs_buf_relse(agfbp); 5215 } 5216 5217 error = xfs_read_agi(mp, NULL, agno, &agibp); 5218 if (error) { 5219 xfs_alert(mp, "%s agi read failed agno %d error %d", 5220 __func__, agno, error); 5221 } else { 5222 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 5223 5224 itotal += be32_to_cpu(agi->agi_count); 5225 ifree += be32_to_cpu(agi->agi_freecount); 5226 xfs_buf_relse(agibp); 5227 } 5228 } 5229 } 5230 #endif /* DEBUG */ 5231
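/*
 * Call-order sketch (illustrative only; the actual call sites live in
 * xfs_log.c): a mount first runs xlog_recover() to replay a dirty log,
 * later calls xlog_recover_finish() once the root and realtime inodes can
 * be read so that EFIs and unlinked inode lists can be processed, and uses
 * xlog_recover_cancel() to drop any pending EFIs if the mount is torn down
 * in between.
 */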