// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
#include "xfs_error.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_quota.h"
#include "xfs_alloc.h"
#include "xfs_ag.h"
#include "xfs_sb.h"
#include "xfs_rtgroup.h"
#include "xfs_rtbitmap.h"

/*
 * This is the number of entries in the l_buf_cancel_table used during
 * recovery.
 */
#define XLOG_BC_TABLE_SIZE	64

/* Map a disk address to its hash bucket (list head) in the cancel table. */
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
	((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))

/*
 * This structure is used during recovery to record the buf log items which
 * have been canceled and should not be replayed.
 */
struct xfs_buf_cancel {
	xfs_daddr_t		bc_blkno;	/* buffer start daddr */
	uint			bc_len;		/* buffer length */
	int			bc_refcount;	/* pass-2 hits still expected */
	struct list_head	bc_list;	/* hash bucket linkage */
};

/*
 * Look up the cancel record matching (blkno, len).  Returns NULL if the
 * cancel table has not been allocated or there is no matching record.
 */
static struct xfs_buf_cancel *
xlog_find_buffer_cancelled(
	struct xlog		*log,
	xfs_daddr_t		blkno,
	uint			len)
{
	struct list_head	*bucket;
	struct xfs_buf_cancel	*bcp;

	if (!log->l_buf_cancel_table)
		return NULL;

	bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
	list_for_each_entry(bcp, bucket, bc_list) {
		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
			return bcp;
	}

	return NULL;
}

/*
 * Record a buffer cancellation seen during pass 1.  Returns true if a new
 * record was added, false if an existing record's refcount was bumped.
 */
static bool
xlog_add_buffer_cancelled(
	struct xlog		*log,
	xfs_daddr_t		blkno,
	uint			len)
{
	struct xfs_buf_cancel	*bcp;

	/*
	 * If we find an existing cancel record, this indicates that the buffer
	 * was cancelled multiple times.  To ensure that during pass 2 we keep
	 * the record in the table until we reach its last occurrence in the
	 * log, a reference count is kept to tell how many times we expect to
	 * see this record during the second pass.
	 */
	bcp = xlog_find_buffer_cancelled(log, blkno, len);
	if (bcp) {
		bcp->bc_refcount++;
		return false;
	}

	/* __GFP_NOFAIL: recovery cannot back out of a half-built table. */
	bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL);
	bcp->bc_blkno = blkno;
	bcp->bc_len = len;
	bcp->bc_refcount = 1;
	list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno));
	return true;
}

/*
 * Check if there is an entry for blkno, len in the buffer cancel record table.
 */
bool
xlog_is_buffer_cancelled(
	struct xlog		*log,
	xfs_daddr_t		blkno,
	uint			len)
{
	return xlog_find_buffer_cancelled(log, blkno, len) != NULL;
}

/*
 * Check if there is an entry for blkno, len in the buffer cancel record table,
 * and decrement the reference count on it if there is one.
 *
 * Remove the cancel record once the refcount hits zero, so that if the same
 * buffer is re-used again after its last cancellation we actually replay the
 * changes made at that point.
 */
static bool
xlog_put_buffer_cancelled(
	struct xlog		*log,
	xfs_daddr_t		blkno,
	uint			len)
{
	struct xfs_buf_cancel	*bcp;

	bcp = xlog_find_buffer_cancelled(log, blkno, len);
	if (!bcp) {
		/* Pass 2 should only put records pass 1 added. */
		ASSERT(0);
		return false;
	}

	if (--bcp->bc_refcount == 0) {
		list_del(&bcp->bc_list);
		kfree(bcp);
	}
	return true;
}

/* log buffer item recovery */

/*
 * Sort buffer items for log recovery. Most buffer items should end up on the
 * buffer list and are recovered first, with the following exceptions:
 *
 * 1.
 *    XFS_BLF_CANCEL buffers must be processed last because some log items
 *    might depend on the incore cancellation record, and replaying a cancelled
 *    buffer item can remove the incore record.
 *
 * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that
 *    we replay di_next_unlinked only after flushing the inode 'free' state
 *    to the inode buffer.
 *
 * See xlog_recover_reorder_trans for more details.
 */
STATIC enum xlog_recover_reorder
xlog_recover_buf_reorder(
	struct xlog_recover_item	*item)
{
	struct xfs_buf_log_format	*buf_f = item->ri_buf[0].i_addr;

	if (buf_f->blf_flags & XFS_BLF_CANCEL)
		return XLOG_REORDER_CANCEL_LIST;
	if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
		return XLOG_REORDER_INODE_BUFFER_LIST;
	return XLOG_REORDER_BUFFER_LIST;
}

/* Start readahead on the target buffer so pass 2 doesn't stall on I/O. */
STATIC void
xlog_recover_buf_ra_pass2(
	struct xlog			*log,
	struct xlog_recover_item	*item)
{
	struct xfs_buf_log_format	*buf_f = item->ri_buf[0].i_addr;

	xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL);
}

/*
 * Build up the table of buf cancel records so that we don't replay cancelled
 * data in the second pass.
 */
static int
xlog_recover_buf_commit_pass1(
	struct xlog			*log,
	struct xlog_recover_item	*item)
{
	struct xfs_buf_log_format	*bf = item->ri_buf[0].i_addr;

	/* Reject undersized buf log format vectors before dereferencing. */
	if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) {
		xfs_err(log->l_mp, "bad buffer log item size (%d)",
				item->ri_buf[0].i_len);
		return -EFSCORRUPTED;
	}

	if (!(bf->blf_flags & XFS_BLF_CANCEL))
		trace_xfs_log_recover_buf_not_cancel(log, bf);
	else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len))
		trace_xfs_log_recover_buf_cancel_add(log, bf);
	else
		trace_xfs_log_recover_buf_cancel_ref_inc(log, bf);
	return 0;
}

/*
 * Validate the recovered buffer is of the correct type and attach the
 * appropriate buffer operations to them for writeback. Magic numbers are in a
 * few places:
 *	the first 16 bits of the buffer (inode buffer, dquot buffer),
 *	the first 32 bits of the buffer (most blocks),
 *	inside a struct xfs_da_blkinfo at the start of the buffer.
 */
static void
xlog_recover_validate_buf_type(
	struct xfs_mount		*mp,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f,
	xfs_lsn_t			current_lsn)
{
	struct xfs_da_blkinfo		*info = bp->b_addr;
	uint32_t			magic32;
	uint16_t			magic16;
	uint16_t			magicda;
	char				*warnmsg = NULL;

	/*
	 * We can only do post recovery validation on items on CRC enabled
	 * filesystems as we need to know when the buffer was written to be able
	 * to determine if we should have replayed the item. If we replay old
	 * metadata over a newer buffer, then it will enter a temporarily
	 * inconsistent state resulting in verification failures. Hence for now
	 * just avoid the verification stage for non-crc filesystems.
	 */
	if (!xfs_has_crc(mp))
		return;

	magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
	magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
	magicda = be16_to_cpu(info->magic);
	switch (xfs_blft_from_flags(buf_f)) {
	case XFS_BLFT_BTREE_BUF:
		switch (magic32) {
		case XFS_ABTB_CRC_MAGIC:
		case XFS_ABTB_MAGIC:
			bp->b_ops = &xfs_bnobt_buf_ops;
			break;
		case XFS_ABTC_CRC_MAGIC:
		case XFS_ABTC_MAGIC:
			bp->b_ops = &xfs_cntbt_buf_ops;
			break;
		case XFS_IBT_CRC_MAGIC:
		case XFS_IBT_MAGIC:
			bp->b_ops = &xfs_inobt_buf_ops;
			break;
		case XFS_FIBT_CRC_MAGIC:
		case XFS_FIBT_MAGIC:
			bp->b_ops = &xfs_finobt_buf_ops;
			break;
		case XFS_BMAP_CRC_MAGIC:
		case XFS_BMAP_MAGIC:
			bp->b_ops = &xfs_bmbt_buf_ops;
			break;
		case XFS_RTRMAP_CRC_MAGIC:
			bp->b_ops = &xfs_rtrmapbt_buf_ops;
			break;
		case XFS_RMAP_CRC_MAGIC:
			bp->b_ops = &xfs_rmapbt_buf_ops;
			break;
		case XFS_REFC_CRC_MAGIC:
			bp->b_ops = &xfs_refcountbt_buf_ops;
			break;
		case XFS_RTREFC_CRC_MAGIC:
			bp->b_ops = &xfs_rtrefcountbt_buf_ops;
			break;
		default:
			warnmsg = "Bad btree block magic!";
			break;
		}
		break;
	case XFS_BLFT_AGF_BUF:
		if (magic32 != XFS_AGF_MAGIC) {
			warnmsg = "Bad AGF block magic!";
			break;
		}
		bp->b_ops = &xfs_agf_buf_ops;
		break;
	case XFS_BLFT_AGFL_BUF:
		if (magic32 != XFS_AGFL_MAGIC) {
			warnmsg = "Bad AGFL block magic!";
			break;
		}
		bp->b_ops = &xfs_agfl_buf_ops;
		break;
	case XFS_BLFT_AGI_BUF:
		if (magic32 != XFS_AGI_MAGIC) {
			warnmsg = "Bad AGI block magic!";
			break;
		}
		bp->b_ops = &xfs_agi_buf_ops;
		break;
	case XFS_BLFT_UDQUOT_BUF:
	case XFS_BLFT_PDQUOT_BUF:
	case XFS_BLFT_GDQUOT_BUF:
#ifdef CONFIG_XFS_QUOTA
		if (magic16 != XFS_DQUOT_MAGIC) {
			warnmsg = "Bad DQUOT block magic!";
			break;
		}
		bp->b_ops = &xfs_dquot_buf_ops;
#else
		xfs_alert(mp,
	"Trying to recover dquots without QUOTA support built in!");
		ASSERT(0);
#endif
		break;
	case XFS_BLFT_DINO_BUF:
		if (magic16 != XFS_DINODE_MAGIC) {
			warnmsg = "Bad INODE block magic!";
			break;
		}
		bp->b_ops = &xfs_inode_buf_ops;
		break;
	case XFS_BLFT_SYMLINK_BUF:
		if (magic32 != XFS_SYMLINK_MAGIC) {
			warnmsg = "Bad symlink block magic!";
			break;
		}
		bp->b_ops = &xfs_symlink_buf_ops;
		break;
	case XFS_BLFT_DIR_BLOCK_BUF:
		if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
		    magic32 != XFS_DIR3_BLOCK_MAGIC) {
			warnmsg = "Bad dir block magic!";
			break;
		}
		bp->b_ops = &xfs_dir3_block_buf_ops;
		break;
	case XFS_BLFT_DIR_DATA_BUF:
		if (magic32 != XFS_DIR2_DATA_MAGIC &&
		    magic32 != XFS_DIR3_DATA_MAGIC) {
			warnmsg = "Bad dir data magic!";
			break;
		}
		bp->b_ops = &xfs_dir3_data_buf_ops;
		break;
	case XFS_BLFT_DIR_FREE_BUF:
		if (magic32 != XFS_DIR2_FREE_MAGIC &&
		    magic32 != XFS_DIR3_FREE_MAGIC) {
			warnmsg = "Bad dir3 free magic!";
			break;
		}
		bp->b_ops = &xfs_dir3_free_buf_ops;
		break;
	case XFS_BLFT_DIR_LEAF1_BUF:
		if (magicda != XFS_DIR2_LEAF1_MAGIC &&
		    magicda != XFS_DIR3_LEAF1_MAGIC) {
			warnmsg = "Bad dir leaf1 magic!";
			break;
		}
		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
		break;
	case XFS_BLFT_DIR_LEAFN_BUF:
		if (magicda != XFS_DIR2_LEAFN_MAGIC &&
		    magicda != XFS_DIR3_LEAFN_MAGIC) {
			warnmsg = "Bad dir leafn magic!";
			break;
		}
		bp->b_ops = &xfs_dir3_leafn_buf_ops;
		break;
	case XFS_BLFT_DA_NODE_BUF:
		if (magicda != XFS_DA_NODE_MAGIC &&
		    magicda != XFS_DA3_NODE_MAGIC) {
			warnmsg = "Bad da node magic!";
			break;
		}
		bp->b_ops = &xfs_da3_node_buf_ops;
		break;
	case XFS_BLFT_ATTR_LEAF_BUF:
		if (magicda != XFS_ATTR_LEAF_MAGIC &&
		    magicda != XFS_ATTR3_LEAF_MAGIC) {
			warnmsg = "Bad attr leaf magic!";
			break;
		}
		bp->b_ops = &xfs_attr3_leaf_buf_ops;
		break;
	case XFS_BLFT_ATTR_RMT_BUF:
		if (magic32 != XFS_ATTR3_RMT_MAGIC) {
			warnmsg = "Bad attr remote magic!";
			break;
		}
		bp->b_ops = &xfs_attr3_rmt_buf_ops;
		break;
	case XFS_BLFT_SB_BUF:
		if (magic32 != XFS_SB_MAGIC) {
			warnmsg = "Bad SB block magic!";
			break;
		}
		bp->b_ops = &xfs_sb_buf_ops;
		break;
#ifdef CONFIG_XFS_RT
	case XFS_BLFT_RTBITMAP_BUF:
		/* Pre-rtgroups rt bitmap blocks carry no magic number. */
		if (xfs_has_rtgroups(mp) && magic32 != XFS_RTBITMAP_MAGIC) {
			warnmsg = "Bad rtbitmap magic!";
			break;
		}
		bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_BITMAP);
		break;
	case XFS_BLFT_RTSUMMARY_BUF:
		if (xfs_has_rtgroups(mp) && magic32 != XFS_RTSUMMARY_MAGIC) {
			warnmsg = "Bad rtsummary magic!";
			break;
		}
		bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_SUMMARY);
		break;
#endif /* CONFIG_XFS_RT */
	default:
		xfs_warn(mp, "Unknown buffer type %d!",
			 xfs_blft_from_flags(buf_f));
		break;
	}

	/*
	 * Nothing else to do in the case of a NULL current LSN as this means
	 * the buffer is more recent than the change in the log and will be
	 * skipped.
	 */
	if (current_lsn == NULLCOMMITLSN)
		return;

	if (warnmsg) {
		/*
		 * NOTE(review): warnmsg is always a literal here, but passing
		 * it as the format string trips -Wformat-security; consider
		 * xfs_warn(mp, "%s", warnmsg).
		 */
		xfs_warn(mp, warnmsg);
		ASSERT(0);
	}

	/*
	 * We must update the metadata LSN of the buffer as it is written out to
	 * ensure that older transactions never replay over this one and corrupt
	 * the buffer. This can occur if log recovery is interrupted at some
	 * point after the current transaction completes, at which point a
	 * subsequent mount starts recovery from the beginning.
	 *
	 * Write verifiers update the metadata LSN from log items attached to
	 * the buffer. Therefore, initialize a bli purely to carry the LSN to
	 * the verifier.
 */
	if (bp->b_ops) {
		struct xfs_buf_log_item	*bip;

		bp->b_flags |= _XBF_LOGRECOVERY;
		xfs_buf_item_init(bp, mp);
		bip = bp->b_log_item;
		bip->bli_item.li_lsn = current_lsn;
	}
}

/*
 * Perform a 'normal' buffer recovery.  Each logged region of the
 * buffer should be copied over the corresponding region in the
 * given buffer.  The bitmap in the buf log format structure indicates
 * where to place the logged data.
 */
STATIC void
xlog_recover_do_reg_buffer(
	struct xfs_mount		*mp,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f,
	xfs_lsn_t			current_lsn)
{
	int				i;
	int				bit;
	int				nbits;
	xfs_failaddr_t			fa;
	const size_t			size_disk_dquot =
						sizeof(struct xfs_disk_dquot);

	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);

	bit = 0;
	i = 1;  /* 0 is the buf format structure */
	while (1) {
		bit = xfs_next_bit(buf_f->blf_data_map,
				   buf_f->blf_map_size, bit);
		if (bit == -1)
			break;
		nbits = xfs_contig_bits(buf_f->blf_data_map,
					buf_f->blf_map_size, bit);
		ASSERT(nbits > 0);
		ASSERT(item->ri_buf[i].i_addr != NULL);
		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
		ASSERT(BBTOB(bp->b_length) >=
		       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));

		/*
		 * The dirty regions logged in the buffer, even though
		 * contiguous, may span multiple chunks. This is because the
		 * dirty region may span a physical page boundary in a buffer
		 * and hence be split into two separate vectors for writing into
		 * the log. Hence we need to trim nbits back to the length of
		 * the current region being copied out of the log.
		 */
		if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
			nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;

		/*
		 * Do a sanity check if this is a dquot buffer. Just checking
		 * the first dquot in the buffer should do. XXX: This is
		 * probably a good thing to do for other buf types also.
		 */
		fa = NULL;
		if (buf_f->blf_flags &
		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
			if (item->ri_buf[i].i_addr == NULL) {
				xfs_alert(mp,
					"XFS: NULL dquot in %s.", __func__);
				goto next;
			}
			if (item->ri_buf[i].i_len < size_disk_dquot) {
				xfs_alert(mp,
					"XFS: dquot too small (%d) in %s.",
					item->ri_buf[i].i_len, __func__);
				goto next;
			}
			fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
			if (fa) {
				xfs_alert(mp,
	"dquot corrupt at %pS trying to replay into block 0x%llx",
					fa, xfs_buf_daddr(bp));
				goto next;
			}
		}

		memcpy(xfs_buf_offset(bp,
			(uint)bit << XFS_BLF_SHIFT),	/* dest */
			item->ri_buf[i].i_addr,		/* source */
			nbits<<XFS_BLF_SHIFT);		/* length */
 next:
		i++;
		bit += nbits;
	}

	/* Shouldn't be any more regions */
	ASSERT(i == item->ri_total);

	xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
}

/*
 * Perform a dquot buffer recovery.
 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
 * Else, treat it as a regular buffer and do recovery.
 *
 * Return false if the buffer was tossed and true if we recovered the buffer to
 * indicate to the caller if the buffer needs writing.
 */
STATIC bool
xlog_recover_do_dquot_buffer(
	struct xfs_mount		*mp,
	struct xlog			*log,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f)
{
	uint				type;

	trace_xfs_log_recover_buf_dquot_buf(log, buf_f);

	/*
	 * Filesystems are required to send in quota flags at mount time.
	 */
	if (!mp->m_qflags)
		return false;

	type = 0;
	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
		type |= XFS_DQTYPE_USER;
	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
		type |= XFS_DQTYPE_PROJ;
	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
		type |= XFS_DQTYPE_GROUP;
	/*
	 * This type of quotas was turned off, so ignore this buffer
	 */
	if (log->l_quotaoffs_flag & type)
		return false;

	xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
	return true;
}

/*
 * Perform recovery for a buffer full of inodes.  In these buffers, the only
 * data which should be recovered is that which corresponds to the
 * di_next_unlinked pointers in the on disk inode structures.  The rest of the
 * data for the inodes is always logged through the inodes themselves rather
 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
 *
 * The only time when buffers full of inodes are fully recovered is when the
 * buffer is full of newly allocated inodes.  In this case the buffer will
 * not be marked as an inode buffer and so will be sent to
 * xlog_recover_do_reg_buffer() below during recovery.
 */
STATIC int
xlog_recover_do_inode_buffer(
	struct xfs_mount		*mp,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f)
{
	int				i;
	int				item_index = 0;
	int				bit = 0;
	int				nbits = 0;
	int				reg_buf_offset = 0;
	int				reg_buf_bytes = 0;
	int				next_unlinked_offset;
	int				inodes_per_buf;
	xfs_agino_t			*logged_nextp;
	xfs_agino_t			*buffer_nextp;

	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);

	/*
	 * Post recovery validation only works properly on CRC enabled
	 * filesystems.
	 */
	if (xfs_has_crc(mp))
		bp->b_ops = &xfs_inode_buf_ops;

	inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
	for (i = 0; i < inodes_per_buf; i++) {
		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
			offsetof(struct xfs_dinode, di_next_unlinked);

		while (next_unlinked_offset >=
		       (reg_buf_offset + reg_buf_bytes)) {
			/*
			 * The next di_next_unlinked field is beyond
			 * the current logged region.  Find the next
			 * logged region that contains or is beyond
			 * the current di_next_unlinked field.
			 */
			bit += nbits;
			bit = xfs_next_bit(buf_f->blf_data_map,
					   buf_f->blf_map_size, bit);

			/*
			 * If there are no more logged regions in the
			 * buffer, then we're done.
			 */
			if (bit == -1)
				return 0;

			nbits = xfs_contig_bits(buf_f->blf_data_map,
						buf_f->blf_map_size, bit);
			ASSERT(nbits > 0);
			reg_buf_offset = bit << XFS_BLF_SHIFT;
			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
			item_index++;
		}

		/*
		 * If the current logged region starts after the current
		 * di_next_unlinked field, then move on to the next
		 * di_next_unlinked field.
		 */
		if (next_unlinked_offset < reg_buf_offset)
			continue;

		ASSERT(item->ri_buf[item_index].i_addr != NULL);
		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
		ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));

		/*
		 * The current logged region contains a copy of the
		 * current di_next_unlinked field.  Extract its value
		 * and copy it to the buffer copy.
		 */
		logged_nextp = item->ri_buf[item_index].i_addr +
				next_unlinked_offset - reg_buf_offset;
		if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
			xfs_alert(mp,
		"Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
		"Trying to replay bad (0) inode di_next_unlinked field.",
				item, bp);
			return -EFSCORRUPTED;
		}

		buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
		*buffer_nextp = *logged_nextp;

		/*
		 * If necessary, recalculate the CRC in the on-disk inode. We
		 * have to leave the inode in a consistent state for whoever
		 * reads it next....
		 */
		xfs_dinode_calc_crc(mp,
				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));

	}

	return 0;
}

/*
 * Update the in-memory superblock and perag structures from the primary SB
 * buffer.
 *
 * This is required because transactions running after growfs may require the
 * updated values to be set in a previous fully commit transaction.
 */
static int
xlog_recover_do_primary_sb_buffer(
	struct xfs_mount		*mp,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f,
	xfs_lsn_t			current_lsn)
{
	struct xfs_dsb			*dsb = bp->b_addr;
	xfs_agnumber_t			orig_agcount = mp->m_sb.sb_agcount;
	xfs_rgnumber_t			orig_rgcount = mp->m_sb.sb_rgcount;
	int				error;

	/* Replay the logged regions into the superblock buffer first. */
	xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);

	if (orig_agcount == 0) {
		xfs_alert(mp, "Trying to grow file system without AGs");
		return -EFSCORRUPTED;
	}

	/*
	 * Update the in-core super block from the freshly recovered on-disk one.
	 */
	xfs_sb_from_disk(&mp->m_sb, dsb);

	if (mp->m_sb.sb_agcount < orig_agcount) {
		xfs_alert(mp, "Shrinking AG count in log recovery not supported");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rgcount < orig_rgcount) {
		xfs_warn(mp,
 "Shrinking rtgroup count in log recovery not supported");
		return -EFSCORRUPTED;
	}

	/*
	 * If the last AG was grown or shrunk, we also need to update the
	 * length in the in-core perag structure and values depending on it.
	 */
	error = xfs_update_last_ag_size(mp, orig_agcount);
	if (error)
		return error;

	/*
	 * If the last rtgroup was grown or shrunk, we also need to update the
	 * length in the in-core rtgroup structure and values depending on it.
	 * Ignore this on any filesystem with zero rtgroups.
	 */
	if (orig_rgcount > 0) {
		error = xfs_update_last_rtgroup_size(mp, orig_rgcount);
		if (error)
			return error;
	}

	/*
	 * Initialize the new perags, and also update various block and inode
	 * allocator setting based off the number of AGs or total blocks.
	 * Because of the latter this also needs to happen if the agcount did
	 * not change.
	 */
	error = xfs_initialize_perag(mp, orig_agcount, mp->m_sb.sb_agcount,
			mp->m_sb.sb_dblocks, &mp->m_maxagi);
	if (error) {
		xfs_warn(mp, "Failed recovery per-ag init: %d", error);
		return error;
	}
	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);

	error = xfs_initialize_rtgroups(mp, orig_rgcount, mp->m_sb.sb_rgcount,
			mp->m_sb.sb_rextents);
	if (error) {
		xfs_warn(mp, "Failed recovery rtgroup init: %d", error);
		return error;
	}
	return 0;
}

/*
 * V5 filesystems know the age of the buffer on disk being recovered. We can
 * have newer objects on disk than we are replaying, and so for these cases we
 * don't want to replay the current change as that will make the buffer contents
 * temporarily invalid on disk.
 *
 * The magic number might not match the buffer type we are going to recover
 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
 * extract the LSN of the existing object in the buffer based on it's current
 * magic number.  If we don't recognise the magic number in the buffer, then
 * return a LSN of -1 so that the caller knows it was an unrecognised block and
 * so can recover the buffer.
 *
 * Note: we cannot rely solely on magic number matches to determine that the
 * buffer has a valid LSN - we also need to verify that it belongs to this
 * filesystem, so we need to extract the object's LSN and compare it to that
 * which we read from the superblock. If the UUIDs don't match, then we've got a
 * stale metadata block from an old filesystem instance that we need to recover
 * over the top of.
 */
static xfs_lsn_t
xlog_recover_get_buf_lsn(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp,
	struct xfs_buf_log_format *buf_f)
{
	uint32_t		magic32;
	uint16_t		magic16;
	uint16_t		magicda;
	void			*blk = bp->b_addr;
	uuid_t			*uuid;
	xfs_lsn_t		lsn = -1;
	uint16_t		blft;

	/* v4 filesystems always recover immediately */
	if (!xfs_has_crc(mp))
		goto recover_immediately;

	/*
	 * realtime bitmap and summary file blocks do not have magic numbers or
	 * UUIDs, so we must recover them immediately.
	 */
	blft = xfs_blft_from_flags(buf_f);
	if (!xfs_has_rtgroups(mp) && (blft == XFS_BLFT_RTBITMAP_BUF ||
	    blft == XFS_BLFT_RTSUMMARY_BUF))
		goto recover_immediately;

	magic32 = be32_to_cpu(*(__be32 *)blk);
	switch (magic32) {
	case XFS_RTSUMMARY_MAGIC:
	case XFS_RTBITMAP_MAGIC: {
		struct xfs_rtbuf_blkinfo	*hdr = blk;

		lsn = be64_to_cpu(hdr->rt_lsn);
		uuid = &hdr->rt_uuid;
		break;
	}
	case XFS_ABTB_CRC_MAGIC:
	case XFS_ABTC_CRC_MAGIC:
	case XFS_ABTB_MAGIC:
	case XFS_ABTC_MAGIC:
	case XFS_RMAP_CRC_MAGIC:
	case XFS_REFC_CRC_MAGIC:
	case XFS_FIBT_CRC_MAGIC:
	case XFS_FIBT_MAGIC:
	case XFS_IBT_CRC_MAGIC:
	case XFS_IBT_MAGIC: {
		/* short-pointer btree blocks */
		struct xfs_btree_block	*btb = blk;

		lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
		uuid = &btb->bb_u.s.bb_uuid;
		break;
	}
	case XFS_RTRMAP_CRC_MAGIC:
	case XFS_RTREFC_CRC_MAGIC:
	case XFS_BMAP_CRC_MAGIC:
	case XFS_BMAP_MAGIC: {
		/* long-pointer btree blocks */
		struct xfs_btree_block	*btb = blk;

		lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
		uuid = &btb->bb_u.l.bb_uuid;
		break;
	}
	case XFS_AGF_MAGIC:
		lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
		uuid = &((struct xfs_agf *)blk)->agf_uuid;
		break;
	case XFS_AGFL_MAGIC:
		lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
		uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
		break;
	case XFS_AGI_MAGIC:
		lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
		uuid = &((struct xfs_agi *)blk)->agi_uuid;
		break;
	case XFS_SYMLINK_MAGIC:
		lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
		uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
		break;
	case XFS_DIR3_BLOCK_MAGIC:
	case XFS_DIR3_DATA_MAGIC:
	case XFS_DIR3_FREE_MAGIC:
		lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
		uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
		break;
	case XFS_ATTR3_RMT_MAGIC:
		/*
		 * Remote attr blocks are written synchronously, rather than
		 * being logged. That means they do not contain a valid LSN
		 * (i.e. transactionally ordered) in them, and hence any time we
		 * see a buffer to replay over the top of a remote attribute
		 * block we should simply do so.
		 */
		goto recover_immediately;
	case XFS_SB_MAGIC:
		/*
		 * superblock uuids are magic. We may or may not have a
		 * sb_meta_uuid on disk, but it will be set in the in-core
		 * superblock. We set the uuid pointer for verification
		 * according to the superblock feature mask to ensure we check
		 * the relevant UUID in the superblock.
		 */
		lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
		if (xfs_has_metauuid(mp))
			uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
		else
			uuid = &((struct xfs_dsb *)blk)->sb_uuid;
		break;
	default:
		break;
	}

	if (lsn != (xfs_lsn_t)-1) {
		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
			goto recover_immediately;
		return lsn;
	}

	magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
	switch (magicda) {
	case XFS_DIR3_LEAF1_MAGIC:
	case XFS_DIR3_LEAFN_MAGIC:
	case XFS_ATTR3_LEAF_MAGIC:
	case XFS_DA3_NODE_MAGIC:
		lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
		uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
		break;
	default:
		break;
	}

	if (lsn != (xfs_lsn_t)-1) {
		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
			goto recover_immediately;
		return lsn;
	}

	/*
	 * We do individual object checks on dquot and inode buffers as they
	 * have their own individual LSN records. Also, we could have a stale
	 * buffer here, so we have to at least recognise these buffer types.
	 *
	 * A noted complexity here is inode unlinked list processing - it logs
	 * the inode directly in the buffer, but we don't know which inodes have
	 * been modified, and there is no global buffer LSN. Hence we need to
	 * recover all inode buffer types immediately. This problem will be
	 * fixed by logical logging of the unlinked list modifications.
	 */
	magic16 = be16_to_cpu(*(__be16 *)blk);
	switch (magic16) {
	case XFS_DQUOT_MAGIC:
	case XFS_DINODE_MAGIC:
		goto recover_immediately;
	default:
		break;
	}

	/* unknown buffer contents, recover immediately */

recover_immediately:
	return (xfs_lsn_t)-1;

}

/*
 * This routine replays a modification made to a buffer at runtime.
 * There are actually two types of buffer, regular and inode, which
 * are handled differently.
 * Inode buffers are handled differently
 * in that we only recover a specific set of data from them, namely
 * the inode di_next_unlinked fields.  This is because all other inode
 * data is actually logged via inode records and any data we replay
 * here which overlaps that may be stale.
 *
 * When meta-data buffers are freed at run time we log a buffer item
 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
 * of the buffer in the log should not be replayed at recovery time.
 * This is so that if the blocks covered by the buffer are reused for
 * file data before we crash we don't end up replaying old, freed
 * meta-data into a user's file.
 *
 * To handle the cancellation of buffer log items, we make two passes
 * over the log during recovery.  During the first we build a table of
 * those buffers which have been cancelled, and during the second we
 * only replay those buffers which do not have corresponding cancel
 * records in the table.  See xlog_recover_buf_pass[1,2] above
 * for more details on the implementation of the table of cancel records.
 */
STATIC int
xlog_recover_buf_commit_pass2(
	struct xlog			*log,
	struct list_head		*buffer_list,
	struct xlog_recover_item	*item,
	xfs_lsn_t			current_lsn)
{
	struct xfs_buf_log_format	*buf_f = item->ri_buf[0].i_addr;
	struct xfs_mount		*mp = log->l_mp;
	struct xfs_buf			*bp;
	int				error;
	uint				buf_flags;
	xfs_lsn_t			lsn;

	/*
	 * In this pass we only want to recover all the buffers which have
	 * not been cancelled and are not cancellation buffers themselves.
	 */
	if (buf_f->blf_flags & XFS_BLF_CANCEL) {
		if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno,
				buf_f->blf_len))
			goto cancelled;
	} else {

		if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno,
				buf_f->blf_len))
			goto cancelled;
	}

	trace_xfs_log_recover_buf_recover(log, buf_f);

	buf_flags = 0;
	if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
		buf_flags |= XBF_UNMAPPED;

	error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
			  buf_flags, &bp, NULL);
	if (error)
		return error;

	/*
	 * Recover the buffer only if we get an LSN from it and it's less than
	 * the lsn of the transaction we are replaying.
	 *
	 * Note that we have to be extremely careful of readahead here.
	 * Readahead does not attach verifiers to the buffers so if we don't
	 * actually do any replay after readahead because the LSN we found
	 * in the buffer is more recent than the current transaction then we
	 * need to attach the verifier directly. Failure to do so can lead to
	 * future recovery actions (e.g. EFI and unlinked list recovery)
	 * operating on the buffers without the verifier attached. This
	 * can lead to blocks on disk having the correct content but a stale
	 * CRC.
	 *
	 * It is safe to assume these clean buffers are currently up to date.
	 * If the buffer is dirtied by a later transaction being replayed, then
	 * the verifier will be reset to match whatever recover turns that
	 * buffer into.
	 */
	lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
		trace_xfs_log_recover_buf_skip(log, buf_f);
		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);

		/*
		 * We're skipping replay of this buffer log item due to the log
		 * item LSN being behind the ondisk buffer.  Verify the buffer
		 * contents since we aren't going to run the write verifier.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_read(bp);
			error = bp->b_error;
		}
		goto out_release;
	}

	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
		if (error)
			goto out_release;
	} else if (buf_f->blf_flags &
		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
		bool	dirty;

		dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
		if (!dirty)
			goto out_release;
	} else if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) &&
		   xfs_buf_daddr(bp) == 0) {
		/*
		 * NOTE(review): blft values look like an enum, not a bitmask;
		 * this bitwise test relies on the daddr == 0 check to pick out
		 * the primary superblock — confirm intent.
		 */
		error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f,
				current_lsn);
		if (error)
			goto out_writebuf;

		/* Update the rt superblock if we have one. */
		if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) {
			struct xfs_buf	*rtsb_bp = mp->m_rtsb_bp;

			xfs_buf_lock(rtsb_bp);
			xfs_buf_hold(rtsb_bp);
			xfs_update_rtsb(rtsb_bp, bp);
			rtsb_bp->b_flags |= _XBF_LOGRECOVERY;
			xfs_buf_delwri_queue(rtsb_bp, buffer_list);
			xfs_buf_relse(rtsb_bp);
		}
	} else {
		xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
	}

	/*
	 * Buffer held by buf log item during 'normal' buffer recovery must
	 * be committed through buffer I/O submission path to ensure proper
	 * release. When error occurs during sb buffer recovery, log shutdown
	 * will be done before submitting buffer list so that buffers can be
	 * released correctly through ioend failure path.
	 */
out_writebuf:

	/*
	 * Perform delayed write on the buffer.  Asynchronous writes will be
	 * slower when taking into account all the buffers to be flushed.
	 *
	 * Also make sure that only inode buffers with good sizes stay in
	 * the buffer cache.
The kernel moves inodes in buffers of 1 block 1122 * or inode_cluster_size bytes, whichever is bigger. The inode 1123 * buffers in the log can be a different size if the log was generated 1124 * by an older kernel using unclustered inode buffers or a newer kernel 1125 * running with a different inode cluster size. Regardless, if 1126 * the inode buffer size isn't max(blocksize, inode_cluster_size) 1127 * for *our* value of inode_cluster_size, then we need to keep 1128 * the buffer out of the buffer cache so that the buffer won't 1129 * overlap with future reads of those inodes. 1130 */ 1131 if (XFS_DINODE_MAGIC == 1132 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 1133 (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { 1134 xfs_buf_stale(bp); 1135 error = xfs_bwrite(bp); 1136 } else { 1137 ASSERT(bp->b_mount == mp); 1138 bp->b_flags |= _XBF_LOGRECOVERY; 1139 xfs_buf_delwri_queue(bp, buffer_list); 1140 } 1141 1142 out_release: 1143 xfs_buf_relse(bp); 1144 return error; 1145 cancelled: 1146 trace_xfs_log_recover_buf_cancel(log, buf_f); 1147 return 0; 1148 } 1149 1150 const struct xlog_recover_item_ops xlog_buf_item_ops = { 1151 .item_type = XFS_LI_BUF, 1152 .reorder = xlog_recover_buf_reorder, 1153 .ra_pass2 = xlog_recover_buf_ra_pass2, 1154 .commit_pass1 = xlog_recover_buf_commit_pass1, 1155 .commit_pass2 = xlog_recover_buf_commit_pass2, 1156 }; 1157 1158 #ifdef DEBUG 1159 void 1160 xlog_check_buf_cancel_table( 1161 struct xlog *log) 1162 { 1163 int i; 1164 1165 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 1166 ASSERT(list_empty(&log->l_buf_cancel_table[i])); 1167 } 1168 #endif 1169 1170 int 1171 xlog_alloc_buf_cancel_table( 1172 struct xlog *log) 1173 { 1174 void *p; 1175 int i; 1176 1177 ASSERT(log->l_buf_cancel_table == NULL); 1178 1179 p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), 1180 GFP_KERNEL); 1181 if (!p) 1182 return -ENOMEM; 1183 1184 log->l_buf_cancel_table = p; 1185 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 1186 
INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 1187 1188 return 0; 1189 } 1190 1191 void 1192 xlog_free_buf_cancel_table( 1193 struct xlog *log) 1194 { 1195 int i; 1196 1197 if (!log->l_buf_cancel_table) 1198 return; 1199 1200 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { 1201 struct xfs_buf_cancel *bc; 1202 1203 while ((bc = list_first_entry_or_null( 1204 &log->l_buf_cancel_table[i], 1205 struct xfs_buf_cancel, bc_list))) { 1206 list_del(&bc->bc_list); 1207 kfree(bc); 1208 } 1209 } 1210 1211 kfree(log->l_buf_cancel_table); 1212 log->l_buf_cancel_table = NULL; 1213 } 1214