1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_trans.h" 15 #include "xfs_buf_item.h" 16 #include "xfs_trans_priv.h" 17 #include "xfs_trace.h" 18 #include "xfs_log.h" 19 #include "xfs_log_priv.h" 20 #include "xfs_log_recover.h" 21 #include "xfs_error.h" 22 #include "xfs_inode.h" 23 #include "xfs_dir2.h" 24 #include "xfs_quota.h" 25 #include "xfs_alloc.h" 26 #include "xfs_ag.h" 27 #include "xfs_sb.h" 28 #include "xfs_rtgroup.h" 29 #include "xfs_rtbitmap.h" 30 31 /* 32 * This is the number of entries in the l_buf_cancel_table used during 33 * recovery. 34 */ 35 #define XLOG_BC_TABLE_SIZE 64 36 37 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ 38 ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) 39 40 /* 41 * This structure is used during recovery to record the buf log items which 42 * have been canceled and should not be replayed. 43 */ 44 struct xfs_buf_cancel { 45 xfs_daddr_t bc_blkno; 46 uint bc_len; 47 int bc_refcount; 48 struct list_head bc_list; 49 }; 50 51 static struct xfs_buf_cancel * 52 xlog_find_buffer_cancelled( 53 struct xlog *log, 54 xfs_daddr_t blkno, 55 uint len) 56 { 57 struct list_head *bucket; 58 struct xfs_buf_cancel *bcp; 59 60 if (!log->l_buf_cancel_table) 61 return NULL; 62 63 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); 64 list_for_each_entry(bcp, bucket, bc_list) { 65 if (bcp->bc_blkno == blkno && bcp->bc_len == len) 66 return bcp; 67 } 68 69 return NULL; 70 } 71 72 static bool 73 xlog_add_buffer_cancelled( 74 struct xlog *log, 75 xfs_daddr_t blkno, 76 uint len) 77 { 78 struct xfs_buf_cancel *bcp; 79 80 /* 81 * If we find an existing cancel record, this indicates that the buffer 82 * was cancelled multiple times. 
To ensure that during pass 2 we keep 83 * the record in the table until we reach its last occurrence in the 84 * log, a reference count is kept to tell how many times we expect to 85 * see this record during the second pass. 86 */ 87 bcp = xlog_find_buffer_cancelled(log, blkno, len); 88 if (bcp) { 89 bcp->bc_refcount++; 90 return false; 91 } 92 93 bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL); 94 bcp->bc_blkno = blkno; 95 bcp->bc_len = len; 96 bcp->bc_refcount = 1; 97 list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); 98 return true; 99 } 100 101 /* 102 * Check if there is and entry for blkno, len in the buffer cancel record table. 103 */ 104 bool 105 xlog_is_buffer_cancelled( 106 struct xlog *log, 107 xfs_daddr_t blkno, 108 uint len) 109 { 110 return xlog_find_buffer_cancelled(log, blkno, len) != NULL; 111 } 112 113 /* 114 * Check if there is and entry for blkno, len in the buffer cancel record table, 115 * and decremented the reference count on it if there is one. 116 * 117 * Remove the cancel record once the refcount hits zero, so that if the same 118 * buffer is re-used again after its last cancellation we actually replay the 119 * changes made at that point. 120 */ 121 static bool 122 xlog_put_buffer_cancelled( 123 struct xlog *log, 124 xfs_daddr_t blkno, 125 uint len) 126 { 127 struct xfs_buf_cancel *bcp; 128 129 bcp = xlog_find_buffer_cancelled(log, blkno, len); 130 if (!bcp) { 131 ASSERT(0); 132 return false; 133 } 134 135 if (--bcp->bc_refcount == 0) { 136 list_del(&bcp->bc_list); 137 kfree(bcp); 138 } 139 return true; 140 } 141 142 /* log buffer item recovery */ 143 144 /* 145 * Sort buffer items for log recovery. Most buffer items should end up on the 146 * buffer list and are recovered first, with the following exceptions: 147 * 148 * 1. 
XFS_BLF_CANCEL buffers must be processed last because some log items 149 * might depend on the incor ecancellation record, and replaying a cancelled 150 * buffer item can remove the incore record. 151 * 152 * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that 153 * we replay di_next_unlinked only after flushing the inode 'free' state 154 * to the inode buffer. 155 * 156 * See xlog_recover_reorder_trans for more details. 157 */ 158 STATIC enum xlog_recover_reorder 159 xlog_recover_buf_reorder( 160 struct xlog_recover_item *item) 161 { 162 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 163 164 if (buf_f->blf_flags & XFS_BLF_CANCEL) 165 return XLOG_REORDER_CANCEL_LIST; 166 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 167 return XLOG_REORDER_INODE_BUFFER_LIST; 168 return XLOG_REORDER_BUFFER_LIST; 169 } 170 171 STATIC void 172 xlog_recover_buf_ra_pass2( 173 struct xlog *log, 174 struct xlog_recover_item *item) 175 { 176 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 177 178 xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); 179 } 180 181 /* 182 * Build up the table of buf cancel records so that we don't replay cancelled 183 * data in the second pass. 
184 */ 185 static int 186 xlog_recover_buf_commit_pass1( 187 struct xlog *log, 188 struct xlog_recover_item *item) 189 { 190 struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; 191 192 if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { 193 xfs_err(log->l_mp, "bad buffer log item size (%d)", 194 item->ri_buf[0].i_len); 195 return -EFSCORRUPTED; 196 } 197 198 if (!(bf->blf_flags & XFS_BLF_CANCEL)) 199 trace_xfs_log_recover_buf_not_cancel(log, bf); 200 else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len)) 201 trace_xfs_log_recover_buf_cancel_add(log, bf); 202 else 203 trace_xfs_log_recover_buf_cancel_ref_inc(log, bf); 204 return 0; 205 } 206 207 /* 208 * Validate the recovered buffer is of the correct type and attach the 209 * appropriate buffer operations to them for writeback. Magic numbers are in a 210 * few places: 211 * the first 16 bits of the buffer (inode buffer, dquot buffer), 212 * the first 32 bits of the buffer (most blocks), 213 * inside a struct xfs_da_blkinfo at the start of the buffer. 214 */ 215 static void 216 xlog_recover_validate_buf_type( 217 struct xfs_mount *mp, 218 struct xfs_buf *bp, 219 struct xfs_buf_log_format *buf_f, 220 xfs_lsn_t current_lsn) 221 { 222 struct xfs_da_blkinfo *info = bp->b_addr; 223 uint32_t magic32; 224 uint16_t magic16; 225 uint16_t magicda; 226 char *warnmsg = NULL; 227 228 /* 229 * We can only do post recovery validation on items on CRC enabled 230 * fielsystems as we need to know when the buffer was written to be able 231 * to determine if we should have replayed the item. If we replay old 232 * metadata over a newer buffer, then it will enter a temporarily 233 * inconsistent state resulting in verification failures. 
Hence for now 234 * just avoid the verification stage for non-crc filesystems 235 */ 236 if (!xfs_has_crc(mp)) 237 return; 238 239 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); 240 magic16 = be16_to_cpu(*(__be16*)bp->b_addr); 241 magicda = be16_to_cpu(info->magic); 242 switch (xfs_blft_from_flags(buf_f)) { 243 case XFS_BLFT_BTREE_BUF: 244 switch (magic32) { 245 case XFS_ABTB_CRC_MAGIC: 246 case XFS_ABTB_MAGIC: 247 bp->b_ops = &xfs_bnobt_buf_ops; 248 break; 249 case XFS_ABTC_CRC_MAGIC: 250 case XFS_ABTC_MAGIC: 251 bp->b_ops = &xfs_cntbt_buf_ops; 252 break; 253 case XFS_IBT_CRC_MAGIC: 254 case XFS_IBT_MAGIC: 255 bp->b_ops = &xfs_inobt_buf_ops; 256 break; 257 case XFS_FIBT_CRC_MAGIC: 258 case XFS_FIBT_MAGIC: 259 bp->b_ops = &xfs_finobt_buf_ops; 260 break; 261 case XFS_BMAP_CRC_MAGIC: 262 case XFS_BMAP_MAGIC: 263 bp->b_ops = &xfs_bmbt_buf_ops; 264 break; 265 case XFS_RMAP_CRC_MAGIC: 266 bp->b_ops = &xfs_rmapbt_buf_ops; 267 break; 268 case XFS_REFC_CRC_MAGIC: 269 bp->b_ops = &xfs_refcountbt_buf_ops; 270 break; 271 default: 272 warnmsg = "Bad btree block magic!"; 273 break; 274 } 275 break; 276 case XFS_BLFT_AGF_BUF: 277 if (magic32 != XFS_AGF_MAGIC) { 278 warnmsg = "Bad AGF block magic!"; 279 break; 280 } 281 bp->b_ops = &xfs_agf_buf_ops; 282 break; 283 case XFS_BLFT_AGFL_BUF: 284 if (magic32 != XFS_AGFL_MAGIC) { 285 warnmsg = "Bad AGFL block magic!"; 286 break; 287 } 288 bp->b_ops = &xfs_agfl_buf_ops; 289 break; 290 case XFS_BLFT_AGI_BUF: 291 if (magic32 != XFS_AGI_MAGIC) { 292 warnmsg = "Bad AGI block magic!"; 293 break; 294 } 295 bp->b_ops = &xfs_agi_buf_ops; 296 break; 297 case XFS_BLFT_UDQUOT_BUF: 298 case XFS_BLFT_PDQUOT_BUF: 299 case XFS_BLFT_GDQUOT_BUF: 300 #ifdef CONFIG_XFS_QUOTA 301 if (magic16 != XFS_DQUOT_MAGIC) { 302 warnmsg = "Bad DQUOT block magic!"; 303 break; 304 } 305 bp->b_ops = &xfs_dquot_buf_ops; 306 #else 307 xfs_alert(mp, 308 "Trying to recover dquots without QUOTA support built in!"); 309 ASSERT(0); 310 #endif 311 break; 312 case 
XFS_BLFT_DINO_BUF: 313 if (magic16 != XFS_DINODE_MAGIC) { 314 warnmsg = "Bad INODE block magic!"; 315 break; 316 } 317 bp->b_ops = &xfs_inode_buf_ops; 318 break; 319 case XFS_BLFT_SYMLINK_BUF: 320 if (magic32 != XFS_SYMLINK_MAGIC) { 321 warnmsg = "Bad symlink block magic!"; 322 break; 323 } 324 bp->b_ops = &xfs_symlink_buf_ops; 325 break; 326 case XFS_BLFT_DIR_BLOCK_BUF: 327 if (magic32 != XFS_DIR2_BLOCK_MAGIC && 328 magic32 != XFS_DIR3_BLOCK_MAGIC) { 329 warnmsg = "Bad dir block magic!"; 330 break; 331 } 332 bp->b_ops = &xfs_dir3_block_buf_ops; 333 break; 334 case XFS_BLFT_DIR_DATA_BUF: 335 if (magic32 != XFS_DIR2_DATA_MAGIC && 336 magic32 != XFS_DIR3_DATA_MAGIC) { 337 warnmsg = "Bad dir data magic!"; 338 break; 339 } 340 bp->b_ops = &xfs_dir3_data_buf_ops; 341 break; 342 case XFS_BLFT_DIR_FREE_BUF: 343 if (magic32 != XFS_DIR2_FREE_MAGIC && 344 magic32 != XFS_DIR3_FREE_MAGIC) { 345 warnmsg = "Bad dir3 free magic!"; 346 break; 347 } 348 bp->b_ops = &xfs_dir3_free_buf_ops; 349 break; 350 case XFS_BLFT_DIR_LEAF1_BUF: 351 if (magicda != XFS_DIR2_LEAF1_MAGIC && 352 magicda != XFS_DIR3_LEAF1_MAGIC) { 353 warnmsg = "Bad dir leaf1 magic!"; 354 break; 355 } 356 bp->b_ops = &xfs_dir3_leaf1_buf_ops; 357 break; 358 case XFS_BLFT_DIR_LEAFN_BUF: 359 if (magicda != XFS_DIR2_LEAFN_MAGIC && 360 magicda != XFS_DIR3_LEAFN_MAGIC) { 361 warnmsg = "Bad dir leafn magic!"; 362 break; 363 } 364 bp->b_ops = &xfs_dir3_leafn_buf_ops; 365 break; 366 case XFS_BLFT_DA_NODE_BUF: 367 if (magicda != XFS_DA_NODE_MAGIC && 368 magicda != XFS_DA3_NODE_MAGIC) { 369 warnmsg = "Bad da node magic!"; 370 break; 371 } 372 bp->b_ops = &xfs_da3_node_buf_ops; 373 break; 374 case XFS_BLFT_ATTR_LEAF_BUF: 375 if (magicda != XFS_ATTR_LEAF_MAGIC && 376 magicda != XFS_ATTR3_LEAF_MAGIC) { 377 warnmsg = "Bad attr leaf magic!"; 378 break; 379 } 380 bp->b_ops = &xfs_attr3_leaf_buf_ops; 381 break; 382 case XFS_BLFT_ATTR_RMT_BUF: 383 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 384 warnmsg = "Bad attr remote magic!"; 385 break; 
386 } 387 bp->b_ops = &xfs_attr3_rmt_buf_ops; 388 break; 389 case XFS_BLFT_SB_BUF: 390 if (magic32 != XFS_SB_MAGIC) { 391 warnmsg = "Bad SB block magic!"; 392 break; 393 } 394 bp->b_ops = &xfs_sb_buf_ops; 395 break; 396 #ifdef CONFIG_XFS_RT 397 case XFS_BLFT_RTBITMAP_BUF: 398 if (xfs_has_rtgroups(mp) && magic32 != XFS_RTBITMAP_MAGIC) { 399 warnmsg = "Bad rtbitmap magic!"; 400 break; 401 } 402 bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_BITMAP); 403 break; 404 case XFS_BLFT_RTSUMMARY_BUF: 405 if (xfs_has_rtgroups(mp) && magic32 != XFS_RTSUMMARY_MAGIC) { 406 warnmsg = "Bad rtsummary magic!"; 407 break; 408 } 409 bp->b_ops = xfs_rtblock_ops(mp, XFS_RTGI_SUMMARY); 410 break; 411 #endif /* CONFIG_XFS_RT */ 412 default: 413 xfs_warn(mp, "Unknown buffer type %d!", 414 xfs_blft_from_flags(buf_f)); 415 break; 416 } 417 418 /* 419 * Nothing else to do in the case of a NULL current LSN as this means 420 * the buffer is more recent than the change in the log and will be 421 * skipped. 422 */ 423 if (current_lsn == NULLCOMMITLSN) 424 return; 425 426 if (warnmsg) { 427 xfs_warn(mp, warnmsg); 428 ASSERT(0); 429 } 430 431 /* 432 * We must update the metadata LSN of the buffer as it is written out to 433 * ensure that older transactions never replay over this one and corrupt 434 * the buffer. This can occur if log recovery is interrupted at some 435 * point after the current transaction completes, at which point a 436 * subsequent mount starts recovery from the beginning. 437 * 438 * Write verifiers update the metadata LSN from log items attached to 439 * the buffer. Therefore, initialize a bli purely to carry the LSN to 440 * the verifier. 441 */ 442 if (bp->b_ops) { 443 struct xfs_buf_log_item *bip; 444 445 bp->b_flags |= _XBF_LOGRECOVERY; 446 xfs_buf_item_init(bp, mp); 447 bip = bp->b_log_item; 448 bip->bli_item.li_lsn = current_lsn; 449 } 450 } 451 452 /* 453 * Perform a 'normal' buffer recovery. 
Each logged region of the
 * buffer should be copied over the corresponding region in the
 * given buffer.  The bitmap in the buf log format structure indicates
 * where to place the logged data.
 *
 * Region 0 of the item is the buf log format structure itself, so the logged
 * data regions start at index 1.  Once all regions have been copied the buffer
 * type is validated and buffer ops are attached via
 * xlog_recover_validate_buf_type().
 */
STATIC void
xlog_recover_do_reg_buffer(
	struct xfs_mount		*mp,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f,
	xfs_lsn_t			current_lsn)
{
	int			i;
	int			bit;
	int			nbits;
	xfs_failaddr_t		fa;
	const size_t		size_disk_dquot = sizeof(struct xfs_disk_dquot);

	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);

	bit = 0;
	i = 1;  /* 0 is the buf format structure */
	while (1) {
		/* Find the next run of set bits in the dirty bitmap. */
		bit = xfs_next_bit(buf_f->blf_data_map,
				   buf_f->blf_map_size, bit);
		if (bit == -1)
			break;
		nbits = xfs_contig_bits(buf_f->blf_data_map,
					buf_f->blf_map_size, bit);
		ASSERT(nbits > 0);
		ASSERT(item->ri_buf[i].i_addr != NULL);
		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
		ASSERT(BBTOB(bp->b_length) >=
		       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));

		/*
		 * The dirty regions logged in the buffer, even though
		 * contiguous, may span multiple chunks. This is because the
		 * dirty region may span a physical page boundary in a buffer
		 * and hence be split into two separate vectors for writing into
		 * the log. Hence we need to trim nbits back to the length of
		 * the current region being copied out of the log.
		 */
		if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
			nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;

		/*
		 * Do a sanity check if this is a dquot buffer. Just checking
		 * the first dquot in the buffer should do. XXXThis is
		 * probably a good thing to do for other buf types also.
		 *
		 * A region that fails any of these checks is skipped (not
		 * copied into the buffer) rather than aborting recovery.
		 */
		fa = NULL;
		if (buf_f->blf_flags &
		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
			if (item->ri_buf[i].i_addr == NULL) {
				xfs_alert(mp,
					"XFS: NULL dquot in %s.", __func__);
				goto next;
			}
			if (item->ri_buf[i].i_len < size_disk_dquot) {
				xfs_alert(mp,
					"XFS: dquot too small (%d) in %s.",
					item->ri_buf[i].i_len, __func__);
				goto next;
			}
			fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
			if (fa) {
				xfs_alert(mp,
	"dquot corrupt at %pS trying to replay into block 0x%llx",
					fa, xfs_buf_daddr(bp));
				goto next;
			}
		}

		memcpy(xfs_buf_offset(bp,
			(uint)bit << XFS_BLF_SHIFT),	/* dest */
			item->ri_buf[i].i_addr,		/* source */
			nbits<<XFS_BLF_SHIFT);		/* length */
 next:
		/* Advance to the next logged region and past this bit run. */
		i++;
		bit += nbits;
	}

	/* Shouldn't be any more regions */
	ASSERT(i == item->ri_total);

	xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
}

/*
 * Perform a dquot buffer recovery.
 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
 * Else, treat it as a regular buffer and do recovery.
 *
 * Return false if the buffer was tossed and true if we recovered the buffer to
 * indicate to the caller if the buffer needs writing.
 */
STATIC bool
xlog_recover_do_dquot_buffer(
	struct xfs_mount		*mp,
	struct xlog			*log,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f)
{
	uint			type;

	trace_xfs_log_recover_buf_dquot_buf(log, buf_f);

	/*
	 * Filesystems are required to send in quota flags at mount time.
	 */
	if (!mp->m_qflags)
		return false;

	/* Translate the buffer's dquot flags into the dquot type mask. */
	type = 0;
	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
		type |= XFS_DQTYPE_USER;
	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
		type |= XFS_DQTYPE_PROJ;
	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
		type |= XFS_DQTYPE_GROUP;
	/*
	 * This type of quotas was turned off, so ignore this buffer
	 */
	if (log->l_quotaoffs_flag & type)
		return false;

	/*
	 * NULLCOMMITLSN is passed as the LSN, so the type validation returns
	 * before warning about bad magic or initialising a log item.
	 */
	xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
	return true;
}

/*
 * Perform recovery for a buffer full of inodes.  In these buffers, the only
 * data which should be recovered is that which corresponds to the
 * di_next_unlinked pointers in the on disk inode structures.  The rest of the
 * data for the inodes is always logged through the inodes themselves rather
 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
 *
 * The only time when buffers full of inodes are fully recovered is when the
 * buffer is full of newly allocated inodes.  In this case the buffer will
 * not be marked as an inode buffer and so will be sent to
 * xlog_recover_do_reg_buffer() below during recovery.
 *
 * Returns 0 on success or -EFSCORRUPTED if a logged di_next_unlinked value
 * of zero is found.
 */
STATIC int
xlog_recover_do_inode_buffer(
	struct xfs_mount		*mp,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f)
{
	int				i;
	int				item_index = 0;
	int				bit = 0;
	int				nbits = 0;
	int				reg_buf_offset = 0;
	int				reg_buf_bytes = 0;
	int				next_unlinked_offset;
	int				inodes_per_buf;
	xfs_agino_t			*logged_nextp;
	xfs_agino_t			*buffer_nextp;

	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);

	/*
	 * Post recovery validation only works properly on CRC enabled
	 * filesystems.
	 */
	if (xfs_has_crc(mp))
		bp->b_ops = &xfs_inode_buf_ops;

	inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
	for (i = 0; i < inodes_per_buf; i++) {
		/* Byte offset of inode i's di_next_unlinked in the buffer. */
		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
			offsetof(struct xfs_dinode, di_next_unlinked);

		while (next_unlinked_offset >=
		       (reg_buf_offset + reg_buf_bytes)) {
			/*
			 * The next di_next_unlinked field is beyond
			 * the current logged region.  Find the next
			 * logged region that contains or is beyond
			 * the current di_next_unlinked field.
			 */
			bit += nbits;
			bit = xfs_next_bit(buf_f->blf_data_map,
					   buf_f->blf_map_size, bit);

			/*
			 * If there are no more logged regions in the
			 * buffer, then we're done.
			 */
			if (bit == -1)
				return 0;

			nbits = xfs_contig_bits(buf_f->blf_data_map,
						buf_f->blf_map_size, bit);
			ASSERT(nbits > 0);
			reg_buf_offset = bit << XFS_BLF_SHIFT;
			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
			/* Each bit run corresponds to the next item region. */
			item_index++;
		}

		/*
		 * If the current logged region starts after the current
		 * di_next_unlinked field, then move on to the next
		 * di_next_unlinked field.
		 */
		if (next_unlinked_offset < reg_buf_offset)
			continue;

		ASSERT(item->ri_buf[item_index].i_addr != NULL);
		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
		ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));

		/*
		 * The current logged region contains a copy of the
		 * current di_next_unlinked field.  Extract its value
		 * and copy it to the buffer copy.
		 */
		logged_nextp = item->ri_buf[item_index].i_addr +
				next_unlinked_offset - reg_buf_offset;
		if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
			xfs_alert(mp,
		"Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
		"Trying to replay bad (0) inode di_next_unlinked field.",
				item, bp);
			return -EFSCORRUPTED;
		}

		buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
		*buffer_nextp = *logged_nextp;

		/*
		 * If necessary, recalculate the CRC in the on-disk inode. We
		 * have to leave the inode in a consistent state for whoever
		 * reads it next....
		 */
		xfs_dinode_calc_crc(mp,
				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));

	}

	return 0;
}

/*
 * Update the in-memory superblock and perag structures from the primary SB
 * buffer.
 *
 * This is required because transactions running after growfs may require the
 * updated values to be set in a previous fully committed transaction.
 *
 * Returns 0 on success, -EFSCORRUPTED if the AG or rtgroup count is invalid
 * or shrinks, or the error returned by one of the geometry update helpers.
 */
static int
xlog_recover_do_primary_sb_buffer(
	struct xfs_mount		*mp,
	struct xlog_recover_item	*item,
	struct xfs_buf			*bp,
	struct xfs_buf_log_format	*buf_f,
	xfs_lsn_t			current_lsn)
{
	struct xfs_dsb			*dsb = bp->b_addr;
	/* Snapshot the counts before the in-core superblock is overwritten. */
	xfs_agnumber_t			orig_agcount = mp->m_sb.sb_agcount;
	xfs_rgnumber_t			orig_rgcount = mp->m_sb.sb_rgcount;
	int				error;

	xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);

	if (orig_agcount == 0) {
		xfs_alert(mp, "Trying to grow file system without AGs");
		return -EFSCORRUPTED;
	}

	/*
	 * Update the in-core super block from the freshly recovered on-disk one.
	 */
	xfs_sb_from_disk(&mp->m_sb, dsb);

	if (mp->m_sb.sb_agcount < orig_agcount) {
		xfs_alert(mp, "Shrinking AG count in log recovery not supported");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rgcount < orig_rgcount) {
		xfs_warn(mp,
 "Shrinking rtgroup count in log recovery not supported");
		return -EFSCORRUPTED;
	}

	/*
	 * If the last AG was grown or shrunk, we also need to update the
	 * length in the in-core perag structure and values depending on it.
	 */
	error = xfs_update_last_ag_size(mp, orig_agcount);
	if (error)
		return error;

	/*
	 * If the last rtgroup was grown or shrunk, we also need to update the
	 * length in the in-core rtgroup structure and values depending on it.
	 * Ignore this on any filesystem with zero rtgroups.
	 */
	if (orig_rgcount > 0) {
		error = xfs_update_last_rtgroup_size(mp, orig_rgcount);
		if (error)
			return error;
	}

	/*
	 * Initialize the new perags, and also update various block and inode
	 * allocator setting based off the number of AGs or total blocks.
	 * Because of the latter this also needs to happen if the agcount did
	 * not change.
	 */
	error = xfs_initialize_perag(mp, orig_agcount, mp->m_sb.sb_agcount,
			mp->m_sb.sb_dblocks, &mp->m_maxagi);
	if (error) {
		xfs_warn(mp, "Failed recovery per-ag init: %d", error);
		return error;
	}
	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);

	error = xfs_initialize_rtgroups(mp, orig_rgcount, mp->m_sb.sb_rgcount,
			mp->m_sb.sb_rextents);
	if (error) {
		xfs_warn(mp, "Failed recovery rtgroup init: %d", error);
		return error;
	}
	return 0;
}

/*
 * V5 filesystems know the age of the buffer on disk being recovered. We can
 * have newer objects on disk than we are replaying, and so for these cases we
 * don't want to replay the current change as that will make the buffer contents
 * temporarily invalid on disk.
 *
 * The magic number might not match the buffer type we are going to recover
 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
 * extract the LSN of the existing object in the buffer based on its current
 * magic number.  If we don't recognise the magic number in the buffer, then
 * return a LSN of -1 so that the caller knows it was an unrecognised block and
 * so can recover the buffer.
*
 * Note: we cannot rely solely on magic number matches to determine that the
 * buffer has a valid LSN - we also need to verify that it belongs to this
 * filesystem, so we need to extract the object's LSN and compare it to that
 * which we read from the superblock. If the UUIDs don't match, then we've got a
 * stale metadata block from an old filesystem instance that we need to recover
 * over the top of.
 *
 * Returns the LSN found in the buffer, or (xfs_lsn_t)-1 when the buffer
 * should be recovered immediately.
 */
static xfs_lsn_t
xlog_recover_get_buf_lsn(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp,
	struct xfs_buf_log_format *buf_f)
{
	uint32_t		magic32;
	uint16_t		magic16;
	uint16_t		magicda;
	void			*blk = bp->b_addr;
	uuid_t			*uuid;
	/* -1 doubles as the "no LSN found yet" marker for the checks below. */
	xfs_lsn_t		lsn = -1;
	uint16_t		blft;

	/* v4 filesystems always recover immediately */
	if (!xfs_has_crc(mp))
		goto recover_immediately;

	/*
	 * realtime bitmap and summary file blocks do not have magic numbers or
	 * UUIDs, so we must recover them immediately.
	 */
	blft = xfs_blft_from_flags(buf_f);
	if (!xfs_has_rtgroups(mp) && (blft == XFS_BLFT_RTBITMAP_BUF ||
				      blft == XFS_BLFT_RTSUMMARY_BUF))
		goto recover_immediately;

	/* First try the block types identified by a 32-bit magic number. */
	magic32 = be32_to_cpu(*(__be32 *)blk);
	switch (magic32) {
	case XFS_RTSUMMARY_MAGIC:
	case XFS_RTBITMAP_MAGIC: {
		struct xfs_rtbuf_blkinfo	*hdr = blk;

		lsn = be64_to_cpu(hdr->rt_lsn);
		uuid = &hdr->rt_uuid;
		break;
	}
	case XFS_ABTB_CRC_MAGIC:
	case XFS_ABTC_CRC_MAGIC:
	case XFS_ABTB_MAGIC:
	case XFS_ABTC_MAGIC:
	case XFS_RMAP_CRC_MAGIC:
	case XFS_REFC_CRC_MAGIC:
	case XFS_FIBT_CRC_MAGIC:
	case XFS_FIBT_MAGIC:
	case XFS_IBT_CRC_MAGIC:
	case XFS_IBT_MAGIC: {
		struct xfs_btree_block *btb = blk;

		/* These btree blocks use the short-form header (bb_u.s). */
		lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
		uuid = &btb->bb_u.s.bb_uuid;
		break;
	}
	case XFS_BMAP_CRC_MAGIC:
	case XFS_BMAP_MAGIC: {
		struct xfs_btree_block *btb = blk;

		/* bmap btree blocks use the long-form header (bb_u.l). */
		lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
		uuid = &btb->bb_u.l.bb_uuid;
		break;
	}
	case XFS_AGF_MAGIC:
		lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
		uuid = &((struct xfs_agf *)blk)->agf_uuid;
		break;
	case XFS_AGFL_MAGIC:
		lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
		uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
		break;
	case XFS_AGI_MAGIC:
		lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
		uuid = &((struct xfs_agi *)blk)->agi_uuid;
		break;
	case XFS_SYMLINK_MAGIC:
		lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
		uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
		break;
	case XFS_DIR3_BLOCK_MAGIC:
	case XFS_DIR3_DATA_MAGIC:
	case XFS_DIR3_FREE_MAGIC:
		lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
		uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
		break;
	case XFS_ATTR3_RMT_MAGIC:
		/*
		 * Remote attr blocks are written synchronously, rather than
		 * being logged. That means they do not contain a valid LSN
		 * (i.e. transactionally ordered) in them, and hence any time we
		 * see a buffer to replay over the top of a remote attribute
		 * block we should simply do so.
		 */
		goto recover_immediately;
	case XFS_SB_MAGIC:
		/*
		 * superblock uuids are magic. We may or may not have a
		 * sb_meta_uuid on disk, but it will be set in the in-core
		 * superblock. We set the uuid pointer for verification
		 * according to the superblock feature mask to ensure we check
		 * the relevant UUID in the superblock.
		 */
		lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
		if (xfs_has_metauuid(mp))
			uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
		else
			uuid = &((struct xfs_dsb *)blk)->sb_uuid;
		break;
	default:
		break;
	}

	/*
	 * An LSN other than -1 means a case above matched and also set uuid;
	 * only trust the LSN if the block belongs to this filesystem.
	 */
	if (lsn != (xfs_lsn_t)-1) {
		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
			goto recover_immediately;
		return lsn;
	}

	/* Next try the 16-bit magic held in a struct xfs_da_blkinfo. */
	magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
	switch (magicda) {
	case XFS_DIR3_LEAF1_MAGIC:
	case XFS_DIR3_LEAFN_MAGIC:
	case XFS_ATTR3_LEAF_MAGIC:
	case XFS_DA3_NODE_MAGIC:
		lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
		uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
		break;
	default:
		break;
	}

	if (lsn != (xfs_lsn_t)-1) {
		if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
			goto recover_immediately;
		return lsn;
	}

	/*
	 * We do individual object checks on dquot and inode buffers as they
	 * have their own individual LSN records. Also, we could have a stale
	 * buffer here, so we have to at least recognise these buffer types.
	 *
	 * A noted complexity here is inode unlinked list processing - it logs
	 * the inode directly in the buffer, but we don't know which inodes have
	 * been modified, and there is no global buffer LSN. Hence we need to
	 * recover all inode buffer types immediately. This problem will be
	 * fixed by logical logging of the unlinked list modifications.
	 */
	magic16 = be16_to_cpu(*(__be16 *)blk);
	switch (magic16) {
	case XFS_DQUOT_MAGIC:
	case XFS_DINODE_MAGIC:
		goto recover_immediately;
	default:
		break;
	}

	/* unknown buffer contents, recover immediately */

recover_immediately:
	return (xfs_lsn_t)-1;

}

/*
 * This routine replays a modification made to a buffer at runtime.
 * There are actually two types of buffer, regular and inode, which
 * are handled differently.
Inode buffers are handled differently
 * in that we only recover a specific set of data from them, namely
 * the inode di_next_unlinked fields.  This is because all other inode
 * data is actually logged via inode records and any data we replay
 * here which overlaps that may be stale.
 *
 * When meta-data buffers are freed at run time we log a buffer item
 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
 * of the buffer in the log should not be replayed at recovery time.
 * This is so that if the blocks covered by the buffer are reused for
 * file data before we crash we don't end up replaying old, freed
 * meta-data into a user's file.
 *
 * To handle the cancellation of buffer log items, we make two passes
 * over the log during recovery.  During the first we build a table of
 * those buffers which have been cancelled, and during the second we
 * only replay those buffers which do not have corresponding cancel
 * records in the table.  See xlog_recover_buf_pass[1,2] above
 * for more details on the implementation of the table of cancel records.
 *
 * Returns 0 if the item was cancelled, skipped or queued for writeback, or
 * the error from reading, verifying, recovering or writing the buffer.
 */
STATIC int
xlog_recover_buf_commit_pass2(
	struct xlog			*log,
	struct list_head		*buffer_list,
	struct xlog_recover_item	*item,
	xfs_lsn_t			current_lsn)
{
	struct xfs_buf_log_format	*buf_f = item->ri_buf[0].i_addr;
	struct xfs_mount		*mp = log->l_mp;
	struct xfs_buf			*bp;
	int				error;
	uint				buf_flags;
	xfs_lsn_t			lsn;

	/*
	 * In this pass we only want to recover all the buffers which have
	 * not been cancelled and are not cancellation buffers themselves.
	 *
	 * A cancel item drops one reference on its cancel record; any other
	 * item only checks whether a matching record is still present.
	 */
	if (buf_f->blf_flags & XFS_BLF_CANCEL) {
		if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno,
				buf_f->blf_len))
			goto cancelled;
	} else {

		if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno,
				buf_f->blf_len))
			goto cancelled;
	}

	trace_xfs_log_recover_buf_recover(log, buf_f);

	buf_flags = 0;
	if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
		buf_flags |= XBF_UNMAPPED;

	error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
			  buf_flags, &bp, NULL);
	if (error)
		return error;

	/*
	 * Recover the buffer only if we get an LSN from it and it's less than
	 * the lsn of the transaction we are replaying.
	 *
	 * Note that we have to be extremely careful of readahead here.
	 * Readahead does not attach verifiers to the buffers so if we don't
	 * actually do any replay after readahead because of the LSN we found
	 * in the buffer is more recent than that current transaction then we
	 * need to attach the verifier directly. Failure to do so can lead to
	 * future recovery actions (e.g. EFI and unlinked list recovery) can
	 * operate on the buffers and they won't get the verifier attached. This
	 * can lead to blocks on disk having the correct content but a stale
	 * CRC.
	 *
	 * It is safe to assume these clean buffers are currently up to date.
	 * If the buffer is dirtied by a later transaction being replayed, then
	 * the verifier will be reset to match whatever recover turns that
	 * buffer into.
	 */
	lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
		trace_xfs_log_recover_buf_skip(log, buf_f);
		xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);

		/*
		 * We're skipping replay of this buffer log item due to the log
		 * item LSN being behind the ondisk buffer. Verify the buffer
		 * contents since we aren't going to run the write verifier.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_read(bp);
			error = bp->b_error;
		}
		goto out_release;
	}

	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
		if (error)
			goto out_release;
	} else if (buf_f->blf_flags &
		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
		bool	dirty;

		/* A tossed dquot buffer is released without being written. */
		dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
		if (!dirty)
			goto out_release;
	/*
	 * NOTE(review): everywhere else in this file the value returned by
	 * xfs_blft_from_flags() is compared by equality (switch/case, ==).
	 * The bitwise AND below looks like it would also match other buffer
	 * types; the daddr == 0 check narrows it, but confirm whether "=="
	 * was intended.
	 */
	} else if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) &&
			xfs_buf_daddr(bp) == 0) {
		error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f,
				current_lsn);
		if (error)
			goto out_release;

		/* Update the rt superblock if we have one. */
		if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) {
			struct xfs_buf	*rtsb_bp = mp->m_rtsb_bp;

			xfs_buf_lock(rtsb_bp);
			xfs_buf_hold(rtsb_bp);
			xfs_update_rtsb(rtsb_bp, bp);
			rtsb_bp->b_flags |= _XBF_LOGRECOVERY;
			xfs_buf_delwri_queue(rtsb_bp, buffer_list);
			xfs_buf_relse(rtsb_bp);
		}
	} else {
		xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
	}

	/*
	 * Perform delayed write on the buffer.  Asynchronous writes will be
	 * slower when taking into account all the buffers to be flushed.
	 *
	 * Also make sure that only inode buffers with good sizes stay in
	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
	 * or inode_cluster_size bytes, whichever is bigger.  The inode
	 * buffers in the log can be a different size if the log was generated
	 * by an older kernel using unclustered inode buffers or a newer kernel
	 * running with a different inode cluster size.  Regardless, if
	 * the inode buffer size isn't max(blocksize, inode_cluster_size)
	 * for *our* value of inode_cluster_size, then we need to keep
	 * the buffer out of the buffer cache so that the buffer won't
	 * overlap with future reads of those inodes.
	 */
	if (XFS_DINODE_MAGIC ==
	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
	    (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
		/* Odd-sized inode buffer: mark stale and write it out now. */
		xfs_buf_stale(bp);
		error = xfs_bwrite(bp);
	} else {
		ASSERT(bp->b_mount == mp);
		bp->b_flags |= _XBF_LOGRECOVERY;
		xfs_buf_delwri_queue(bp, buffer_list);
	}

out_release:
	xfs_buf_relse(bp);
	return error;
cancelled:
	trace_xfs_log_recover_buf_cancel(log, buf_f);
	return 0;
}

/* Recovery callbacks for XFS_LI_BUF log items. */
const struct xlog_recover_item_ops xlog_buf_item_ops = {
	.item_type		= XFS_LI_BUF,
	.reorder		= xlog_recover_buf_reorder,
	.ra_pass2		= xlog_recover_buf_ra_pass2,
	.commit_pass1		= xlog_recover_buf_commit_pass1,
	.commit_pass2		= xlog_recover_buf_commit_pass2,
};

#ifdef DEBUG
/* Assert that every bucket of the buffer cancel table is empty. */
void
xlog_check_buf_cancel_table(
	struct xlog	*log)
{
	int		i;

	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
		ASSERT(list_empty(&log->l_buf_cancel_table[i]));
}
#endif

/*
 * Allocate the buffer cancel table and initialise each of its
 * XLOG_BC_TABLE_SIZE buckets as an empty list.  The table must not already
 * be allocated.  Returns 0 on success or -ENOMEM if the allocation fails.
 */
int
xlog_alloc_buf_cancel_table(
	struct xlog	*log)
{
	void		*p;
	int		i;

	ASSERT(log->l_buf_cancel_table == NULL);

	p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head),
			  GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	log->l_buf_cancel_table = p;
	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);

	return 0;
}

void
xlog_free_buf_cancel_table(
	struct xlog	*log)
{
	int		i;

	if (!log->l_buf_cancel_table)
		return;

	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) {
		struct xfs_buf_cancel	*bc;

		while ((bc =
list_first_entry_or_null( 1187 &log->l_buf_cancel_table[i], 1188 struct xfs_buf_cancel, bc_list))) { 1189 list_del(&bc->bc_list); 1190 kfree(bc); 1191 } 1192 } 1193 1194 kfree(log->l_buf_cancel_table); 1195 log->l_buf_cancel_table = NULL; 1196 } 1197