1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_trans.h" 15 #include "xfs_buf_item.h" 16 #include "xfs_trans_priv.h" 17 #include "xfs_trace.h" 18 #include "xfs_log.h" 19 #include "xfs_log_priv.h" 20 #include "xfs_log_recover.h" 21 #include "xfs_error.h" 22 #include "xfs_inode.h" 23 #include "xfs_dir2.h" 24 #include "xfs_quota.h" 25 #include "xfs_alloc.h" 26 #include "xfs_ag.h" 27 #include "xfs_sb.h" 28 29 /* 30 * This is the number of entries in the l_buf_cancel_table used during 31 * recovery. 32 */ 33 #define XLOG_BC_TABLE_SIZE 64 34 35 #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ 36 ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) 37 38 /* 39 * This structure is used during recovery to record the buf log items which 40 * have been canceled and should not be replayed. 41 */ 42 struct xfs_buf_cancel { 43 xfs_daddr_t bc_blkno; 44 uint bc_len; 45 int bc_refcount; 46 struct list_head bc_list; 47 }; 48 49 static struct xfs_buf_cancel * 50 xlog_find_buffer_cancelled( 51 struct xlog *log, 52 xfs_daddr_t blkno, 53 uint len) 54 { 55 struct list_head *bucket; 56 struct xfs_buf_cancel *bcp; 57 58 if (!log->l_buf_cancel_table) 59 return NULL; 60 61 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); 62 list_for_each_entry(bcp, bucket, bc_list) { 63 if (bcp->bc_blkno == blkno && bcp->bc_len == len) 64 return bcp; 65 } 66 67 return NULL; 68 } 69 70 static bool 71 xlog_add_buffer_cancelled( 72 struct xlog *log, 73 xfs_daddr_t blkno, 74 uint len) 75 { 76 struct xfs_buf_cancel *bcp; 77 78 /* 79 * If we find an existing cancel record, this indicates that the buffer 80 * was cancelled multiple times. To ensure that during pass 2 we keep 81 * the record in the table until we reach its last occurrence in the 82 * log, a reference count is kept to tell how many times we expect to 83 * see this record during the second pass. 84 */ 85 bcp = xlog_find_buffer_cancelled(log, blkno, len); 86 if (bcp) { 87 bcp->bc_refcount++; 88 return false; 89 } 90 91 bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL); 92 bcp->bc_blkno = blkno; 93 bcp->bc_len = len; 94 bcp->bc_refcount = 1; 95 list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); 96 return true; 97 } 98 99 /* 100 * Check if there is and entry for blkno, len in the buffer cancel record table. 101 */ 102 bool 103 xlog_is_buffer_cancelled( 104 struct xlog *log, 105 xfs_daddr_t blkno, 106 uint len) 107 { 108 return xlog_find_buffer_cancelled(log, blkno, len) != NULL; 109 } 110 111 /* 112 * Check if there is and entry for blkno, len in the buffer cancel record table, 113 * and decremented the reference count on it if there is one. 114 * 115 * Remove the cancel record once the refcount hits zero, so that if the same 116 * buffer is re-used again after its last cancellation we actually replay the 117 * changes made at that point. 118 */ 119 static bool 120 xlog_put_buffer_cancelled( 121 struct xlog *log, 122 xfs_daddr_t blkno, 123 uint len) 124 { 125 struct xfs_buf_cancel *bcp; 126 127 bcp = xlog_find_buffer_cancelled(log, blkno, len); 128 if (!bcp) { 129 ASSERT(0); 130 return false; 131 } 132 133 if (--bcp->bc_refcount == 0) { 134 list_del(&bcp->bc_list); 135 kfree(bcp); 136 } 137 return true; 138 } 139 140 /* log buffer item recovery */ 141 142 /* 143 * Sort buffer items for log recovery. Most buffer items should end up on the 144 * buffer list and are recovered first, with the following exceptions: 145 * 146 * 1. XFS_BLF_CANCEL buffers must be processed last because some log items 147 * might depend on the incor ecancellation record, and replaying a cancelled 148 * buffer item can remove the incore record. 149 * 150 * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that 151 * we replay di_next_unlinked only after flushing the inode 'free' state 152 * to the inode buffer. 153 * 154 * See xlog_recover_reorder_trans for more details. 155 */ 156 STATIC enum xlog_recover_reorder 157 xlog_recover_buf_reorder( 158 struct xlog_recover_item *item) 159 { 160 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 161 162 if (buf_f->blf_flags & XFS_BLF_CANCEL) 163 return XLOG_REORDER_CANCEL_LIST; 164 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 165 return XLOG_REORDER_INODE_BUFFER_LIST; 166 return XLOG_REORDER_BUFFER_LIST; 167 } 168 169 STATIC void 170 xlog_recover_buf_ra_pass2( 171 struct xlog *log, 172 struct xlog_recover_item *item) 173 { 174 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 175 176 xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); 177 } 178 179 /* 180 * Build up the table of buf cancel records so that we don't replay cancelled 181 * data in the second pass. 182 */ 183 static int 184 xlog_recover_buf_commit_pass1( 185 struct xlog *log, 186 struct xlog_recover_item *item) 187 { 188 struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; 189 190 if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { 191 xfs_err(log->l_mp, "bad buffer log item size (%d)", 192 item->ri_buf[0].i_len); 193 return -EFSCORRUPTED; 194 } 195 196 if (!(bf->blf_flags & XFS_BLF_CANCEL)) 197 trace_xfs_log_recover_buf_not_cancel(log, bf); 198 else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len)) 199 trace_xfs_log_recover_buf_cancel_add(log, bf); 200 else 201 trace_xfs_log_recover_buf_cancel_ref_inc(log, bf); 202 return 0; 203 } 204 205 /* 206 * Validate the recovered buffer is of the correct type and attach the 207 * appropriate buffer operations to them for writeback. Magic numbers are in a 208 * few places: 209 * the first 16 bits of the buffer (inode buffer, dquot buffer), 210 * the first 32 bits of the buffer (most blocks), 211 * inside a struct xfs_da_blkinfo at the start of the buffer. 212 */ 213 static void 214 xlog_recover_validate_buf_type( 215 struct xfs_mount *mp, 216 struct xfs_buf *bp, 217 struct xfs_buf_log_format *buf_f, 218 xfs_lsn_t current_lsn) 219 { 220 struct xfs_da_blkinfo *info = bp->b_addr; 221 uint32_t magic32; 222 uint16_t magic16; 223 uint16_t magicda; 224 char *warnmsg = NULL; 225 226 /* 227 * We can only do post recovery validation on items on CRC enabled 228 * fielsystems as we need to know when the buffer was written to be able 229 * to determine if we should have replayed the item. If we replay old 230 * metadata over a newer buffer, then it will enter a temporarily 231 * inconsistent state resulting in verification failures. Hence for now 232 * just avoid the verification stage for non-crc filesystems 233 */ 234 if (!xfs_has_crc(mp)) 235 return; 236 237 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); 238 magic16 = be16_to_cpu(*(__be16*)bp->b_addr); 239 magicda = be16_to_cpu(info->magic); 240 switch (xfs_blft_from_flags(buf_f)) { 241 case XFS_BLFT_BTREE_BUF: 242 switch (magic32) { 243 case XFS_ABTB_CRC_MAGIC: 244 case XFS_ABTB_MAGIC: 245 bp->b_ops = &xfs_bnobt_buf_ops; 246 break; 247 case XFS_ABTC_CRC_MAGIC: 248 case XFS_ABTC_MAGIC: 249 bp->b_ops = &xfs_cntbt_buf_ops; 250 break; 251 case XFS_IBT_CRC_MAGIC: 252 case XFS_IBT_MAGIC: 253 bp->b_ops = &xfs_inobt_buf_ops; 254 break; 255 case XFS_FIBT_CRC_MAGIC: 256 case XFS_FIBT_MAGIC: 257 bp->b_ops = &xfs_finobt_buf_ops; 258 break; 259 case XFS_BMAP_CRC_MAGIC: 260 case XFS_BMAP_MAGIC: 261 bp->b_ops = &xfs_bmbt_buf_ops; 262 break; 263 case XFS_RMAP_CRC_MAGIC: 264 bp->b_ops = &xfs_rmapbt_buf_ops; 265 break; 266 case XFS_REFC_CRC_MAGIC: 267 bp->b_ops = &xfs_refcountbt_buf_ops; 268 break; 269 default: 270 warnmsg = "Bad btree block magic!"; 271 break; 272 } 273 break; 274 case XFS_BLFT_AGF_BUF: 275 if (magic32 != XFS_AGF_MAGIC) { 276 warnmsg = "Bad AGF block magic!"; 277 break; 278 } 279 bp->b_ops = &xfs_agf_buf_ops; 280 break; 281 case XFS_BLFT_AGFL_BUF: 282 if (magic32 != XFS_AGFL_MAGIC) { 283 warnmsg = "Bad AGFL block magic!"; 284 break; 285 } 286 bp->b_ops = &xfs_agfl_buf_ops; 287 break; 288 case XFS_BLFT_AGI_BUF: 289 if (magic32 != XFS_AGI_MAGIC) { 290 warnmsg = "Bad AGI block magic!"; 291 break; 292 } 293 bp->b_ops = &xfs_agi_buf_ops; 294 break; 295 case XFS_BLFT_UDQUOT_BUF: 296 case XFS_BLFT_PDQUOT_BUF: 297 case XFS_BLFT_GDQUOT_BUF: 298 #ifdef CONFIG_XFS_QUOTA 299 if (magic16 != XFS_DQUOT_MAGIC) { 300 warnmsg = "Bad DQUOT block magic!"; 301 break; 302 } 303 bp->b_ops = &xfs_dquot_buf_ops; 304 #else 305 xfs_alert(mp, 306 "Trying to recover dquots without QUOTA support built in!"); 307 ASSERT(0); 308 #endif 309 break; 310 case XFS_BLFT_DINO_BUF: 311 if (magic16 != XFS_DINODE_MAGIC) { 312 warnmsg = "Bad INODE block magic!"; 313 break; 314 } 315 bp->b_ops = &xfs_inode_buf_ops; 316 break; 317 case XFS_BLFT_SYMLINK_BUF: 318 if (magic32 != XFS_SYMLINK_MAGIC) { 319 warnmsg = "Bad symlink block magic!"; 320 break; 321 } 322 bp->b_ops = &xfs_symlink_buf_ops; 323 break; 324 case XFS_BLFT_DIR_BLOCK_BUF: 325 if (magic32 != XFS_DIR2_BLOCK_MAGIC && 326 magic32 != XFS_DIR3_BLOCK_MAGIC) { 327 warnmsg = "Bad dir block magic!"; 328 break; 329 } 330 bp->b_ops = &xfs_dir3_block_buf_ops; 331 break; 332 case XFS_BLFT_DIR_DATA_BUF: 333 if (magic32 != XFS_DIR2_DATA_MAGIC && 334 magic32 != XFS_DIR3_DATA_MAGIC) { 335 warnmsg = "Bad dir data magic!"; 336 break; 337 } 338 bp->b_ops = &xfs_dir3_data_buf_ops; 339 break; 340 case XFS_BLFT_DIR_FREE_BUF: 341 if (magic32 != XFS_DIR2_FREE_MAGIC && 342 magic32 != XFS_DIR3_FREE_MAGIC) { 343 warnmsg = "Bad dir3 free magic!"; 344 break; 345 } 346 bp->b_ops = &xfs_dir3_free_buf_ops; 347 break; 348 case XFS_BLFT_DIR_LEAF1_BUF: 349 if (magicda != XFS_DIR2_LEAF1_MAGIC && 350 magicda != XFS_DIR3_LEAF1_MAGIC) { 351 warnmsg = "Bad dir leaf1 magic!"; 352 break; 353 } 354 bp->b_ops = &xfs_dir3_leaf1_buf_ops; 355 break; 356 case XFS_BLFT_DIR_LEAFN_BUF: 357 if (magicda != XFS_DIR2_LEAFN_MAGIC && 358 magicda != XFS_DIR3_LEAFN_MAGIC) { 359 warnmsg = "Bad dir leafn magic!"; 360 break; 361 } 362 bp->b_ops = &xfs_dir3_leafn_buf_ops; 363 break; 364 case XFS_BLFT_DA_NODE_BUF: 365 if (magicda != XFS_DA_NODE_MAGIC && 366 magicda != XFS_DA3_NODE_MAGIC) { 367 warnmsg = "Bad da node magic!"; 368 break; 369 } 370 bp->b_ops = &xfs_da3_node_buf_ops; 371 break; 372 case XFS_BLFT_ATTR_LEAF_BUF: 373 if (magicda != XFS_ATTR_LEAF_MAGIC && 374 magicda != XFS_ATTR3_LEAF_MAGIC) { 375 warnmsg = "Bad attr leaf magic!"; 376 break; 377 } 378 bp->b_ops = &xfs_attr3_leaf_buf_ops; 379 break; 380 case XFS_BLFT_ATTR_RMT_BUF: 381 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 382 warnmsg = "Bad attr remote magic!"; 383 break; 384 } 385 bp->b_ops = &xfs_attr3_rmt_buf_ops; 386 break; 387 case XFS_BLFT_SB_BUF: 388 if (magic32 != XFS_SB_MAGIC) { 389 warnmsg = "Bad SB block magic!"; 390 break; 391 } 392 bp->b_ops = &xfs_sb_buf_ops; 393 break; 394 #ifdef CONFIG_XFS_RT 395 case XFS_BLFT_RTBITMAP_BUF: 396 case XFS_BLFT_RTSUMMARY_BUF: 397 /* no magic numbers for verification of RT buffers */ 398 bp->b_ops = &xfs_rtbuf_ops; 399 break; 400 #endif /* CONFIG_XFS_RT */ 401 default: 402 xfs_warn(mp, "Unknown buffer type %d!", 403 xfs_blft_from_flags(buf_f)); 404 break; 405 } 406 407 /* 408 * Nothing else to do in the case of a NULL current LSN as this means 409 * the buffer is more recent than the change in the log and will be 410 * skipped. 411 */ 412 if (current_lsn == NULLCOMMITLSN) 413 return; 414 415 if (warnmsg) { 416 xfs_warn(mp, warnmsg); 417 ASSERT(0); 418 } 419 420 /* 421 * We must update the metadata LSN of the buffer as it is written out to 422 * ensure that older transactions never replay over this one and corrupt 423 * the buffer. This can occur if log recovery is interrupted at some 424 * point after the current transaction completes, at which point a 425 * subsequent mount starts recovery from the beginning. 426 * 427 * Write verifiers update the metadata LSN from log items attached to 428 * the buffer. Therefore, initialize a bli purely to carry the LSN to 429 * the verifier. 430 */ 431 if (bp->b_ops) { 432 struct xfs_buf_log_item *bip; 433 434 bp->b_flags |= _XBF_LOGRECOVERY; 435 xfs_buf_item_init(bp, mp); 436 bip = bp->b_log_item; 437 bip->bli_item.li_lsn = current_lsn; 438 } 439 } 440 441 /* 442 * Perform a 'normal' buffer recovery. Each logged region of the 443 * buffer should be copied over the corresponding region in the 444 * given buffer. The bitmap in the buf log format structure indicates 445 * where to place the logged data. 446 */ 447 STATIC void 448 xlog_recover_do_reg_buffer( 449 struct xfs_mount *mp, 450 struct xlog_recover_item *item, 451 struct xfs_buf *bp, 452 struct xfs_buf_log_format *buf_f, 453 xfs_lsn_t current_lsn) 454 { 455 int i; 456 int bit; 457 int nbits; 458 xfs_failaddr_t fa; 459 const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); 460 461 trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); 462 463 bit = 0; 464 i = 1; /* 0 is the buf format structure */ 465 while (1) { 466 bit = xfs_next_bit(buf_f->blf_data_map, 467 buf_f->blf_map_size, bit); 468 if (bit == -1) 469 break; 470 nbits = xfs_contig_bits(buf_f->blf_data_map, 471 buf_f->blf_map_size, bit); 472 ASSERT(nbits > 0); 473 ASSERT(item->ri_buf[i].i_addr != NULL); 474 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); 475 ASSERT(BBTOB(bp->b_length) >= 476 ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); 477 478 /* 479 * The dirty regions logged in the buffer, even though 480 * contiguous, may span multiple chunks. This is because the 481 * dirty region may span a physical page boundary in a buffer 482 * and hence be split into two separate vectors for writing into 483 * the log. Hence we need to trim nbits back to the length of 484 * the current region being copied out of the log. 485 */ 486 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) 487 nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; 488 489 /* 490 * Do a sanity check if this is a dquot buffer. Just checking 491 * the first dquot in the buffer should do. XXXThis is 492 * probably a good thing to do for other buf types also. 493 */ 494 fa = NULL; 495 if (buf_f->blf_flags & 496 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 497 if (item->ri_buf[i].i_addr == NULL) { 498 xfs_alert(mp, 499 "XFS: NULL dquot in %s.", __func__); 500 goto next; 501 } 502 if (item->ri_buf[i].i_len < size_disk_dquot) { 503 xfs_alert(mp, 504 "XFS: dquot too small (%d) in %s.", 505 item->ri_buf[i].i_len, __func__); 506 goto next; 507 } 508 fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1); 509 if (fa) { 510 xfs_alert(mp, 511 "dquot corrupt at %pS trying to replay into block 0x%llx", 512 fa, xfs_buf_daddr(bp)); 513 goto next; 514 } 515 } 516 517 memcpy(xfs_buf_offset(bp, 518 (uint)bit << XFS_BLF_SHIFT), /* dest */ 519 item->ri_buf[i].i_addr, /* source */ 520 nbits<<XFS_BLF_SHIFT); /* length */ 521 next: 522 i++; 523 bit += nbits; 524 } 525 526 /* Shouldn't be any more regions */ 527 ASSERT(i == item->ri_total); 528 529 xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); 530 } 531 532 /* 533 * Perform a dquot buffer recovery. 534 * Simple algorithm: if we have found a QUOTAOFF log item of the same type 535 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 536 * Else, treat it as a regular buffer and do recovery. 537 * 538 * Return false if the buffer was tossed and true if we recovered the buffer to 539 * indicate to the caller if the buffer needs writing. 540 */ 541 STATIC bool 542 xlog_recover_do_dquot_buffer( 543 struct xfs_mount *mp, 544 struct xlog *log, 545 struct xlog_recover_item *item, 546 struct xfs_buf *bp, 547 struct xfs_buf_log_format *buf_f) 548 { 549 uint type; 550 551 trace_xfs_log_recover_buf_dquot_buf(log, buf_f); 552 553 /* 554 * Filesystems are required to send in quota flags at mount time. 555 */ 556 if (!mp->m_qflags) 557 return false; 558 559 type = 0; 560 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) 561 type |= XFS_DQTYPE_USER; 562 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) 563 type |= XFS_DQTYPE_PROJ; 564 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) 565 type |= XFS_DQTYPE_GROUP; 566 /* 567 * This type of quotas was turned off, so ignore this buffer 568 */ 569 if (log->l_quotaoffs_flag & type) 570 return false; 571 572 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); 573 return true; 574 } 575 576 /* 577 * Perform recovery for a buffer full of inodes. In these buffers, the only 578 * data which should be recovered is that which corresponds to the 579 * di_next_unlinked pointers in the on disk inode structures. The rest of the 580 * data for the inodes is always logged through the inodes themselves rather 581 * than the inode buffer and is recovered in xlog_recover_inode_pass2(). 582 * 583 * The only time when buffers full of inodes are fully recovered is when the 584 * buffer is full of newly allocated inodes. In this case the buffer will 585 * not be marked as an inode buffer and so will be sent to 586 * xlog_recover_do_reg_buffer() below during recovery. 587 */ 588 STATIC int 589 xlog_recover_do_inode_buffer( 590 struct xfs_mount *mp, 591 struct xlog_recover_item *item, 592 struct xfs_buf *bp, 593 struct xfs_buf_log_format *buf_f) 594 { 595 int i; 596 int item_index = 0; 597 int bit = 0; 598 int nbits = 0; 599 int reg_buf_offset = 0; 600 int reg_buf_bytes = 0; 601 int next_unlinked_offset; 602 int inodes_per_buf; 603 xfs_agino_t *logged_nextp; 604 xfs_agino_t *buffer_nextp; 605 606 trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); 607 608 /* 609 * Post recovery validation only works properly on CRC enabled 610 * filesystems. 611 */ 612 if (xfs_has_crc(mp)) 613 bp->b_ops = &xfs_inode_buf_ops; 614 615 inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; 616 for (i = 0; i < inodes_per_buf; i++) { 617 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 618 offsetof(struct xfs_dinode, di_next_unlinked); 619 620 while (next_unlinked_offset >= 621 (reg_buf_offset + reg_buf_bytes)) { 622 /* 623 * The next di_next_unlinked field is beyond 624 * the current logged region. Find the next 625 * logged region that contains or is beyond 626 * the current di_next_unlinked field. 627 */ 628 bit += nbits; 629 bit = xfs_next_bit(buf_f->blf_data_map, 630 buf_f->blf_map_size, bit); 631 632 /* 633 * If there are no more logged regions in the 634 * buffer, then we're done. 635 */ 636 if (bit == -1) 637 return 0; 638 639 nbits = xfs_contig_bits(buf_f->blf_data_map, 640 buf_f->blf_map_size, bit); 641 ASSERT(nbits > 0); 642 reg_buf_offset = bit << XFS_BLF_SHIFT; 643 reg_buf_bytes = nbits << XFS_BLF_SHIFT; 644 item_index++; 645 } 646 647 /* 648 * If the current logged region starts after the current 649 * di_next_unlinked field, then move on to the next 650 * di_next_unlinked field. 651 */ 652 if (next_unlinked_offset < reg_buf_offset) 653 continue; 654 655 ASSERT(item->ri_buf[item_index].i_addr != NULL); 656 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); 657 ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); 658 659 /* 660 * The current logged region contains a copy of the 661 * current di_next_unlinked field. Extract its value 662 * and copy it to the buffer copy. 663 */ 664 logged_nextp = item->ri_buf[item_index].i_addr + 665 next_unlinked_offset - reg_buf_offset; 666 if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { 667 xfs_alert(mp, 668 "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " 669 "Trying to replay bad (0) inode di_next_unlinked field.", 670 item, bp); 671 return -EFSCORRUPTED; 672 } 673 674 buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); 675 *buffer_nextp = *logged_nextp; 676 677 /* 678 * If necessary, recalculate the CRC in the on-disk inode. We 679 * have to leave the inode in a consistent state for whoever 680 * reads it next.... 681 */ 682 xfs_dinode_calc_crc(mp, 683 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); 684 685 } 686 687 return 0; 688 } 689 690 /* 691 * Update the in-memory superblock and perag structures from the primary SB 692 * buffer. 693 * 694 * This is required because transactions running after growfs may require the 695 * updated values to be set in a previous fully commit transaction. 696 */ 697 static int 698 xlog_recover_do_primary_sb_buffer( 699 struct xfs_mount *mp, 700 struct xlog_recover_item *item, 701 struct xfs_buf *bp, 702 struct xfs_buf_log_format *buf_f, 703 xfs_lsn_t current_lsn) 704 { 705 struct xfs_dsb *dsb = bp->b_addr; 706 xfs_agnumber_t orig_agcount = mp->m_sb.sb_agcount; 707 int error; 708 709 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); 710 711 if (orig_agcount == 0) { 712 xfs_alert(mp, "Trying to grow file system without AGs"); 713 return -EFSCORRUPTED; 714 } 715 716 /* 717 * Update the in-core super block from the freshly recovered on-disk one. 718 */ 719 xfs_sb_from_disk(&mp->m_sb, dsb); 720 721 if (mp->m_sb.sb_agcount < orig_agcount) { 722 xfs_alert(mp, "Shrinking AG count in log recovery not supported"); 723 return -EFSCORRUPTED; 724 } 725 726 /* 727 * Growfs can also grow the last existing AG. In this case we also need 728 * to update the length in the in-core perag structure and values 729 * depending on it. 730 */ 731 error = xfs_update_last_ag_size(mp, orig_agcount); 732 if (error) 733 return error; 734 735 /* 736 * Initialize the new perags, and also update various block and inode 737 * allocator setting based off the number of AGs or total blocks. 738 * Because of the latter this also needs to happen if the agcount did 739 * not change. 740 */ 741 error = xfs_initialize_perag(mp, orig_agcount, mp->m_sb.sb_agcount, 742 mp->m_sb.sb_dblocks, &mp->m_maxagi); 743 if (error) { 744 xfs_warn(mp, "Failed recovery per-ag init: %d", error); 745 return error; 746 } 747 mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); 748 return 0; 749 } 750 751 /* 752 * V5 filesystems know the age of the buffer on disk being recovered. We can 753 * have newer objects on disk than we are replaying, and so for these cases we 754 * don't want to replay the current change as that will make the buffer contents 755 * temporarily invalid on disk. 756 * 757 * The magic number might not match the buffer type we are going to recover 758 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence 759 * extract the LSN of the existing object in the buffer based on it's current 760 * magic number. If we don't recognise the magic number in the buffer, then 761 * return a LSN of -1 so that the caller knows it was an unrecognised block and 762 * so can recover the buffer. 763 * 764 * Note: we cannot rely solely on magic number matches to determine that the 765 * buffer has a valid LSN - we also need to verify that it belongs to this 766 * filesystem, so we need to extract the object's LSN and compare it to that 767 * which we read from the superblock. If the UUIDs don't match, then we've got a 768 * stale metadata block from an old filesystem instance that we need to recover 769 * over the top of. 770 */ 771 static xfs_lsn_t 772 xlog_recover_get_buf_lsn( 773 struct xfs_mount *mp, 774 struct xfs_buf *bp, 775 struct xfs_buf_log_format *buf_f) 776 { 777 uint32_t magic32; 778 uint16_t magic16; 779 uint16_t magicda; 780 void *blk = bp->b_addr; 781 uuid_t *uuid; 782 xfs_lsn_t lsn = -1; 783 uint16_t blft; 784 785 /* v4 filesystems always recover immediately */ 786 if (!xfs_has_crc(mp)) 787 goto recover_immediately; 788 789 /* 790 * realtime bitmap and summary file blocks do not have magic numbers or 791 * UUIDs, so we must recover them immediately. 792 */ 793 blft = xfs_blft_from_flags(buf_f); 794 if (blft == XFS_BLFT_RTBITMAP_BUF || blft == XFS_BLFT_RTSUMMARY_BUF) 795 goto recover_immediately; 796 797 magic32 = be32_to_cpu(*(__be32 *)blk); 798 switch (magic32) { 799 case XFS_ABTB_CRC_MAGIC: 800 case XFS_ABTC_CRC_MAGIC: 801 case XFS_ABTB_MAGIC: 802 case XFS_ABTC_MAGIC: 803 case XFS_RMAP_CRC_MAGIC: 804 case XFS_REFC_CRC_MAGIC: 805 case XFS_FIBT_CRC_MAGIC: 806 case XFS_FIBT_MAGIC: 807 case XFS_IBT_CRC_MAGIC: 808 case XFS_IBT_MAGIC: { 809 struct xfs_btree_block *btb = blk; 810 811 lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); 812 uuid = &btb->bb_u.s.bb_uuid; 813 break; 814 } 815 case XFS_BMAP_CRC_MAGIC: 816 case XFS_BMAP_MAGIC: { 817 struct xfs_btree_block *btb = blk; 818 819 lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); 820 uuid = &btb->bb_u.l.bb_uuid; 821 break; 822 } 823 case XFS_AGF_MAGIC: 824 lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); 825 uuid = &((struct xfs_agf *)blk)->agf_uuid; 826 break; 827 case XFS_AGFL_MAGIC: 828 lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); 829 uuid = &((struct xfs_agfl *)blk)->agfl_uuid; 830 break; 831 case XFS_AGI_MAGIC: 832 lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); 833 uuid = &((struct xfs_agi *)blk)->agi_uuid; 834 break; 835 case XFS_SYMLINK_MAGIC: 836 lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); 837 uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; 838 break; 839 case XFS_DIR3_BLOCK_MAGIC: 840 case XFS_DIR3_DATA_MAGIC: 841 case XFS_DIR3_FREE_MAGIC: 842 lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); 843 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; 844 break; 845 case XFS_ATTR3_RMT_MAGIC: 846 /* 847 * Remote attr blocks are written synchronously, rather than 848 * being logged. That means they do not contain a valid LSN 849 * (i.e. transactionally ordered) in them, and hence any time we 850 * see a buffer to replay over the top of a remote attribute 851 * block we should simply do so. 852 */ 853 goto recover_immediately; 854 case XFS_SB_MAGIC: 855 /* 856 * superblock uuids are magic. We may or may not have a 857 * sb_meta_uuid on disk, but it will be set in the in-core 858 * superblock. We set the uuid pointer for verification 859 * according to the superblock feature mask to ensure we check 860 * the relevant UUID in the superblock. 861 */ 862 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); 863 if (xfs_has_metauuid(mp)) 864 uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; 865 else 866 uuid = &((struct xfs_dsb *)blk)->sb_uuid; 867 break; 868 default: 869 break; 870 } 871 872 if (lsn != (xfs_lsn_t)-1) { 873 if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) 874 goto recover_immediately; 875 return lsn; 876 } 877 878 magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); 879 switch (magicda) { 880 case XFS_DIR3_LEAF1_MAGIC: 881 case XFS_DIR3_LEAFN_MAGIC: 882 case XFS_ATTR3_LEAF_MAGIC: 883 case XFS_DA3_NODE_MAGIC: 884 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); 885 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; 886 break; 887 default: 888 break; 889 } 890 891 if (lsn != (xfs_lsn_t)-1) { 892 if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) 893 goto recover_immediately; 894 return lsn; 895 } 896 897 /* 898 * We do individual object checks on dquot and inode buffers as they 899 * have their own individual LSN records. Also, we could have a stale 900 * buffer here, so we have to at least recognise these buffer types. 901 * 902 * A notd complexity here is inode unlinked list processing - it logs 903 * the inode directly in the buffer, but we don't know which inodes have 904 * been modified, and there is no global buffer LSN. Hence we need to 905 * recover all inode buffer types immediately. This problem will be 906 * fixed by logical logging of the unlinked list modifications. 907 */ 908 magic16 = be16_to_cpu(*(__be16 *)blk); 909 switch (magic16) { 910 case XFS_DQUOT_MAGIC: 911 case XFS_DINODE_MAGIC: 912 goto recover_immediately; 913 default: 914 break; 915 } 916 917 /* unknown buffer contents, recover immediately */ 918 919 recover_immediately: 920 return (xfs_lsn_t)-1; 921 922 } 923 924 /* 925 * This routine replays a modification made to a buffer at runtime. 926 * There are actually two types of buffer, regular and inode, which 927 * are handled differently. Inode buffers are handled differently 928 * in that we only recover a specific set of data from them, namely 929 * the inode di_next_unlinked fields. This is because all other inode 930 * data is actually logged via inode records and any data we replay 931 * here which overlaps that may be stale. 932 * 933 * When meta-data buffers are freed at run time we log a buffer item 934 * with the XFS_BLF_CANCEL bit set to indicate that previous copies 935 * of the buffer in the log should not be replayed at recovery time. 936 * This is so that if the blocks covered by the buffer are reused for 937 * file data before we crash we don't end up replaying old, freed 938 * meta-data into a user's file. 939 * 940 * To handle the cancellation of buffer log items, we make two passes 941 * over the log during recovery. During the first we build a table of 942 * those buffers which have been cancelled, and during the second we 943 * only replay those buffers which do not have corresponding cancel 944 * records in the table. See xlog_recover_buf_pass[1,2] above 945 * for more details on the implementation of the table of cancel records. 946 */ 947 STATIC int 948 xlog_recover_buf_commit_pass2( 949 struct xlog *log, 950 struct list_head *buffer_list, 951 struct xlog_recover_item *item, 952 xfs_lsn_t current_lsn) 953 { 954 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; 955 struct xfs_mount *mp = log->l_mp; 956 struct xfs_buf *bp; 957 int error; 958 uint buf_flags; 959 xfs_lsn_t lsn; 960 961 /* 962 * In this pass we only want to recover all the buffers which have 963 * not been cancelled and are not cancellation buffers themselves. 964 */ 965 if (buf_f->blf_flags & XFS_BLF_CANCEL) { 966 if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno, 967 buf_f->blf_len)) 968 goto cancelled; 969 } else { 970 971 if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno, 972 buf_f->blf_len)) 973 goto cancelled; 974 } 975 976 trace_xfs_log_recover_buf_recover(log, buf_f); 977 978 buf_flags = 0; 979 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) 980 buf_flags |= XBF_UNMAPPED; 981 982 error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 983 buf_flags, &bp, NULL); 984 if (error) 985 return error; 986 987 /* 988 * Recover the buffer only if we get an LSN from it and it's less than 989 * the lsn of the transaction we are replaying. 990 * 991 * Note that we have to be extremely careful of readahead here. 992 * Readahead does not attach verfiers to the buffers so if we don't 993 * actually do any replay after readahead because of the LSN we found 994 * in the buffer if more recent than that current transaction then we 995 * need to attach the verifier directly. Failure to do so can lead to 996 * future recovery actions (e.g. EFI and unlinked list recovery) can 997 * operate on the buffers and they won't get the verifier attached. This 998 * can lead to blocks on disk having the correct content but a stale 999 * CRC. 1000 * 1001 * It is safe to assume these clean buffers are currently up to date. 1002 * If the buffer is dirtied by a later transaction being replayed, then 1003 * the verifier will be reset to match whatever recover turns that 1004 * buffer into. 1005 */ 1006 lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f); 1007 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { 1008 trace_xfs_log_recover_buf_skip(log, buf_f); 1009 xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); 1010 1011 /* 1012 * We're skipping replay of this buffer log item due to the log 1013 * item LSN being behind the ondisk buffer. Verify the buffer 1014 * contents since we aren't going to run the write verifier. 1015 */ 1016 if (bp->b_ops) { 1017 bp->b_ops->verify_read(bp); 1018 error = bp->b_error; 1019 } 1020 goto out_release; 1021 } 1022 1023 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 1024 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 1025 if (error) 1026 goto out_release; 1027 } else if (buf_f->blf_flags & 1028 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 1029 bool dirty; 1030 1031 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 1032 if (!dirty) 1033 goto out_release; 1034 } else if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) && 1035 xfs_buf_daddr(bp) == 0) { 1036 error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f, 1037 current_lsn); 1038 if (error) 1039 goto out_release; 1040 } else { 1041 xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); 1042 } 1043 1044 /* 1045 * Perform delayed write on the buffer. Asynchronous writes will be 1046 * slower when taking into account all the buffers to be flushed. 1047 * 1048 * Also make sure that only inode buffers with good sizes stay in 1049 * the buffer cache. The kernel moves inodes in buffers of 1 block 1050 * or inode_cluster_size bytes, whichever is bigger. The inode 1051 * buffers in the log can be a different size if the log was generated 1052 * by an older kernel using unclustered inode buffers or a newer kernel 1053 * running with a different inode cluster size. Regardless, if 1054 * the inode buffer size isn't max(blocksize, inode_cluster_size) 1055 * for *our* value of inode_cluster_size, then we need to keep 1056 * the buffer out of the buffer cache so that the buffer won't 1057 * overlap with future reads of those inodes. 1058 */ 1059 if (XFS_DINODE_MAGIC == 1060 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 1061 (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { 1062 xfs_buf_stale(bp); 1063 error = xfs_bwrite(bp); 1064 } else { 1065 ASSERT(bp->b_mount == mp); 1066 bp->b_flags |= _XBF_LOGRECOVERY; 1067 xfs_buf_delwri_queue(bp, buffer_list); 1068 } 1069 1070 out_release: 1071 xfs_buf_relse(bp); 1072 return error; 1073 cancelled: 1074 trace_xfs_log_recover_buf_cancel(log, buf_f); 1075 return 0; 1076 } 1077 1078 const struct xlog_recover_item_ops xlog_buf_item_ops = { 1079 .item_type = XFS_LI_BUF, 1080 .reorder = xlog_recover_buf_reorder, 1081 .ra_pass2 = xlog_recover_buf_ra_pass2, 1082 .commit_pass1 = xlog_recover_buf_commit_pass1, 1083 .commit_pass2 = xlog_recover_buf_commit_pass2, 1084 }; 1085 1086 #ifdef DEBUG 1087 void 1088 xlog_check_buf_cancel_table( 1089 struct xlog *log) 1090 { 1091 int i; 1092 1093 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 1094 ASSERT(list_empty(&log->l_buf_cancel_table[i])); 1095 } 1096 #endif 1097 1098 int 1099 xlog_alloc_buf_cancel_table( 1100 struct xlog *log) 1101 { 1102 void *p; 1103 int i; 1104 1105 ASSERT(log->l_buf_cancel_table == NULL); 1106 1107 p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), 1108 GFP_KERNEL); 1109 if (!p) 1110 return -ENOMEM; 1111 1112 log->l_buf_cancel_table = p; 1113 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 1114 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); 1115 1116 return 0; 1117 } 1118 1119 void 1120 xlog_free_buf_cancel_table( 1121 struct xlog *log) 1122 { 1123 int i; 1124 1125 if (!log->l_buf_cancel_table) 1126 return; 1127 1128 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { 1129 struct xfs_buf_cancel *bc; 1130 1131 while ((bc = list_first_entry_or_null( 1132 &log->l_buf_cancel_table[i], 1133 struct xfs_buf_cancel, bc_list))) { 1134 list_del(&bc->bc_list); 1135 kfree(bc); 1136 } 1137 } 1138 1139 kfree(log->l_buf_cancel_table); 1140 log->l_buf_cancel_table = NULL; 1141 } 1142