1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_inode_item.h" 16 #include "xfs_trace.h" 17 #include "xfs_trans_priv.h" 18 #include "xfs_buf_item.h" 19 #include "xfs_log.h" 20 #include "xfs_error.h" 21 #include "xfs_log_priv.h" 22 #include "xfs_log_recover.h" 23 #include "xfs_icache.h" 24 #include "xfs_bmap_btree.h" 25 26 STATIC void 27 xlog_recover_inode_ra_pass2( 28 struct xlog *log, 29 struct xlog_recover_item *item) 30 { 31 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 32 struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; 33 34 xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, 35 &xfs_inode_buf_ra_ops); 36 } else { 37 struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; 38 39 xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, 40 &xfs_inode_buf_ra_ops); 41 } 42 } 43 44 /* 45 * Inode fork owner changes 46 * 47 * If we have been told that we have to reparent the inode fork, it's because an 48 * extent swap operation on a CRC enabled filesystem has been done and we are 49 * replaying it. We need to walk the BMBT of the appropriate fork and change the 50 * owners of it. 51 * 52 * The complexity here is that we don't have an inode context to work with, so 53 * after we've replayed the inode we need to instantiate one. This is where the 54 * fun begins. 55 * 56 * We are in the middle of log recovery, so we can't run transactions. That 57 * means we cannot use cache coherent inode instantiation via xfs_iget(), as 58 * that will result in the corresponding iput() running the inode through 59 * xfs_inactive(). If we've just replayed an inode core that changes the link 60 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run 61 * transactions (bad!). 62 * 63 * So, to avoid this, we instantiate an inode directly from the inode core we've 64 * just recovered. We have the buffer still locked, and all we really need to 65 * instantiate is the inode core and the forks being modified. We can do this 66 * manually, then run the inode btree owner change, and then tear down the 67 * xfs_inode without having to run any transactions at all. 68 * 69 * Also, because we don't have a transaction context available here but need to 70 * gather all the buffers we modify for writeback so we pass the buffer_list 71 * instead for the operation to use. 72 */ 73 74 STATIC int 75 xfs_recover_inode_owner_change( 76 struct xfs_mount *mp, 77 struct xfs_dinode *dip, 78 struct xfs_inode_log_format *in_f, 79 struct list_head *buffer_list) 80 { 81 struct xfs_inode *ip; 82 int error; 83 84 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); 85 86 ip = xfs_inode_alloc(mp, in_f->ilf_ino); 87 if (!ip) 88 return -ENOMEM; 89 90 /* instantiate the inode */ 91 ASSERT(dip->di_version >= 3); 92 93 error = xfs_inode_from_disk(ip, dip); 94 if (error) 95 goto out_free_ip; 96 97 if (in_f->ilf_fields & XFS_ILOG_DOWNER) { 98 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); 99 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, 100 ip->i_ino, buffer_list); 101 if (error) 102 goto out_free_ip; 103 } 104 105 if (in_f->ilf_fields & XFS_ILOG_AOWNER) { 106 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); 107 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, 108 ip->i_ino, buffer_list); 109 if (error) 110 goto out_free_ip; 111 } 112 113 out_free_ip: 114 xfs_inode_free(ip); 115 return error; 116 } 117 118 static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) 119 { 120 return ld->di_version >= 3 && 121 (ld->di_flags2 & XFS_DIFLAG2_BIGTIME); 122 } 123 124 /* Convert a log timestamp to an ondisk timestamp. */ 125 static inline xfs_timestamp_t 126 xfs_log_dinode_to_disk_ts( 127 struct xfs_log_dinode *from, 128 const xfs_log_timestamp_t its) 129 { 130 struct xfs_legacy_timestamp *lts; 131 struct xfs_log_legacy_timestamp *lits; 132 xfs_timestamp_t ts; 133 134 if (xfs_log_dinode_has_bigtime(from)) 135 return cpu_to_be64(its); 136 137 lts = (struct xfs_legacy_timestamp *)&ts; 138 lits = (struct xfs_log_legacy_timestamp *)&its; 139 lts->t_sec = cpu_to_be32(lits->t_sec); 140 lts->t_nsec = cpu_to_be32(lits->t_nsec); 141 142 return ts; 143 } 144 145 static inline bool xfs_log_dinode_has_large_extent_counts( 146 const struct xfs_log_dinode *ld) 147 { 148 return ld->di_version >= 3 && 149 (ld->di_flags2 & XFS_DIFLAG2_NREXT64); 150 } 151 152 static inline void 153 xfs_log_dinode_to_disk_iext_counters( 154 struct xfs_log_dinode *from, 155 struct xfs_dinode *to) 156 { 157 if (xfs_log_dinode_has_large_extent_counts(from)) { 158 to->di_big_nextents = cpu_to_be64(from->di_big_nextents); 159 to->di_big_anextents = cpu_to_be32(from->di_big_anextents); 160 to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad); 161 } else { 162 to->di_nextents = cpu_to_be32(from->di_nextents); 163 to->di_anextents = cpu_to_be16(from->di_anextents); 164 } 165 166 } 167 168 STATIC void 169 xfs_log_dinode_to_disk( 170 struct xfs_log_dinode *from, 171 struct xfs_dinode *to, 172 xfs_lsn_t lsn) 173 { 174 to->di_magic = cpu_to_be16(from->di_magic); 175 to->di_mode = cpu_to_be16(from->di_mode); 176 to->di_version = from->di_version; 177 to->di_format = from->di_format; 178 to->di_metatype = cpu_to_be16(from->di_metatype); 179 to->di_uid = cpu_to_be32(from->di_uid); 180 to->di_gid = cpu_to_be32(from->di_gid); 181 to->di_nlink = cpu_to_be32(from->di_nlink); 182 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 183 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 184 185 to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); 186 to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); 187 to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime); 188 189 to->di_size = cpu_to_be64(from->di_size); 190 to->di_nblocks = cpu_to_be64(from->di_nblocks); 191 to->di_extsize = cpu_to_be32(from->di_extsize); 192 to->di_forkoff = from->di_forkoff; 193 to->di_aformat = from->di_aformat; 194 to->di_dmevmask = cpu_to_be32(from->di_dmevmask); 195 to->di_dmstate = cpu_to_be16(from->di_dmstate); 196 to->di_flags = cpu_to_be16(from->di_flags); 197 to->di_gen = cpu_to_be32(from->di_gen); 198 199 if (from->di_version == 3) { 200 to->di_changecount = cpu_to_be64(from->di_changecount); 201 to->di_crtime = xfs_log_dinode_to_disk_ts(from, 202 from->di_crtime); 203 to->di_flags2 = cpu_to_be64(from->di_flags2); 204 to->di_cowextsize = cpu_to_be32(from->di_cowextsize); 205 to->di_ino = cpu_to_be64(from->di_ino); 206 to->di_lsn = cpu_to_be64(lsn); 207 memset(to->di_pad2, 0, sizeof(to->di_pad2)); 208 uuid_copy(&to->di_uuid, &from->di_uuid); 209 to->di_v3_pad = 0; 210 } else { 211 to->di_flushiter = cpu_to_be16(from->di_flushiter); 212 memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); 213 } 214 215 xfs_log_dinode_to_disk_iext_counters(from, to); 216 } 217 218 STATIC int 219 xlog_dinode_verify_extent_counts( 220 struct xfs_mount *mp, 221 struct xfs_log_dinode *ldip) 222 { 223 xfs_extnum_t nextents; 224 xfs_aextnum_t anextents; 225 226 if (xfs_log_dinode_has_large_extent_counts(ldip)) { 227 if (!xfs_has_large_extent_counts(mp) || 228 (ldip->di_nrext64_pad != 0)) { 229 XFS_CORRUPTION_ERROR( 230 "Bad log dinode large extent count format", 231 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 232 xfs_alert(mp, 233 "Bad inode 0x%llx, large extent counts %d, padding 0x%x", 234 ldip->di_ino, xfs_has_large_extent_counts(mp), 235 ldip->di_nrext64_pad); 236 return -EFSCORRUPTED; 237 } 238 239 nextents = ldip->di_big_nextents; 240 anextents = ldip->di_big_anextents; 241 } else { 242 if (ldip->di_version == 3 && ldip->di_v3_pad != 0) { 243 XFS_CORRUPTION_ERROR( 244 "Bad log dinode di_v3_pad", 245 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 246 xfs_alert(mp, 247 "Bad inode 0x%llx, di_v3_pad 0x%llx", 248 ldip->di_ino, ldip->di_v3_pad); 249 return -EFSCORRUPTED; 250 } 251 252 nextents = ldip->di_nextents; 253 anextents = ldip->di_anextents; 254 } 255 256 if (unlikely(nextents + anextents > ldip->di_nblocks)) { 257 XFS_CORRUPTION_ERROR("Bad log dinode extent counts", 258 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 259 xfs_alert(mp, 260 "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx", 261 ldip->di_ino, xfs_has_large_extent_counts(mp), nextents, 262 anextents, ldip->di_nblocks); 263 return -EFSCORRUPTED; 264 } 265 266 return 0; 267 } 268 269 STATIC int 270 xlog_recover_inode_commit_pass2( 271 struct xlog *log, 272 struct list_head *buffer_list, 273 struct xlog_recover_item *item, 274 xfs_lsn_t current_lsn) 275 { 276 struct xfs_inode_log_format *in_f; 277 struct xfs_mount *mp = log->l_mp; 278 struct xfs_buf *bp; 279 struct xfs_dinode *dip; 280 int len; 281 char *src; 282 char *dest; 283 int error; 284 int attr_index; 285 uint fields; 286 struct xfs_log_dinode *ldip; 287 uint isize; 288 int need_free = 0; 289 xfs_failaddr_t fa; 290 291 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 292 in_f = item->ri_buf[0].i_addr; 293 } else { 294 in_f = kmalloc(sizeof(struct xfs_inode_log_format), 295 GFP_KERNEL | __GFP_NOFAIL); 296 need_free = 1; 297 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 298 if (error) 299 goto error; 300 } 301 302 /* 303 * Inode buffers can be freed, look out for it, 304 * and do not replay the inode. 305 */ 306 if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) { 307 error = 0; 308 trace_xfs_log_recover_inode_cancel(log, in_f); 309 goto error; 310 } 311 trace_xfs_log_recover_inode_recover(log, in_f); 312 313 error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 314 0, &bp, &xfs_inode_buf_ops); 315 if (error) 316 goto error; 317 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 318 dip = xfs_buf_offset(bp, in_f->ilf_boffset); 319 320 /* 321 * Make sure the place we're flushing out to really looks 322 * like an inode! 323 */ 324 if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { 325 xfs_alert(mp, 326 "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld", 327 __func__, dip, bp, in_f->ilf_ino); 328 error = -EFSCORRUPTED; 329 goto out_release; 330 } 331 ldip = item->ri_buf[1].i_addr; 332 if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { 333 xfs_alert(mp, 334 "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", 335 __func__, item, in_f->ilf_ino); 336 error = -EFSCORRUPTED; 337 goto out_release; 338 } 339 340 /* 341 * If the inode has an LSN in it, recover the inode only if the on-disk 342 * inode's LSN is older than the lsn of the transaction we are 343 * replaying. We can have multiple checkpoints with the same start LSN, 344 * so the current LSN being equal to the on-disk LSN doesn't necessarily 345 * mean that the on-disk inode is more recent than the change being 346 * replayed. 347 * 348 * We must check the current_lsn against the on-disk inode 349 * here because the we can't trust the log dinode to contain a valid LSN 350 * (see comment below before replaying the log dinode for details). 351 * 352 * Note: we still need to replay an owner change even though the inode 353 * is more recent than the transaction as there is no guarantee that all 354 * the btree blocks are more recent than this transaction, too. 355 */ 356 if (dip->di_version >= 3) { 357 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); 358 359 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) { 360 trace_xfs_log_recover_inode_skip(log, in_f); 361 error = 0; 362 goto out_owner_change; 363 } 364 } 365 366 /* 367 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes 368 * are transactional and if ordering is necessary we can determine that 369 * more accurately by the LSN field in the V3 inode core. Don't trust 370 * the inode versions we might be changing them here - use the 371 * superblock flag to determine whether we need to look at di_flushiter 372 * to skip replay when the on disk inode is newer than the log one 373 */ 374 if (!xfs_has_v3inodes(mp)) { 375 if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 376 /* 377 * Deal with the wrap case, DI_MAX_FLUSH is less 378 * than smaller numbers 379 */ 380 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && 381 ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { 382 /* do nothing */ 383 } else { 384 trace_xfs_log_recover_inode_skip(log, in_f); 385 error = 0; 386 goto out_release; 387 } 388 } 389 390 /* Take the opportunity to reset the flush iteration count */ 391 ldip->di_flushiter = 0; 392 } 393 394 395 if (unlikely(S_ISREG(ldip->di_mode))) { 396 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && 397 (ldip->di_format != XFS_DINODE_FMT_BTREE)) { 398 XFS_CORRUPTION_ERROR( 399 "Bad log dinode data fork format for regular file", 400 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 401 xfs_alert(mp, 402 "Bad inode 0x%llx, data fork format 0x%x", 403 in_f->ilf_ino, ldip->di_format); 404 error = -EFSCORRUPTED; 405 goto out_release; 406 } 407 } else if (unlikely(S_ISDIR(ldip->di_mode))) { 408 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && 409 (ldip->di_format != XFS_DINODE_FMT_BTREE) && 410 (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { 411 XFS_CORRUPTION_ERROR( 412 "Bad log dinode data fork format for directory", 413 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 414 xfs_alert(mp, 415 "Bad inode 0x%llx, data fork format 0x%x", 416 in_f->ilf_ino, ldip->di_format); 417 error = -EFSCORRUPTED; 418 goto out_release; 419 } 420 } 421 422 error = xlog_dinode_verify_extent_counts(mp, ldip); 423 if (error) 424 goto out_release; 425 426 if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { 427 XFS_CORRUPTION_ERROR("Bad log dinode fork offset", 428 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 429 xfs_alert(mp, 430 "Bad inode 0x%llx, di_forkoff 0x%x", 431 in_f->ilf_ino, ldip->di_forkoff); 432 error = -EFSCORRUPTED; 433 goto out_release; 434 } 435 isize = xfs_log_dinode_size(mp); 436 if (unlikely(item->ri_buf[1].i_len > isize)) { 437 XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW, 438 mp, ldip, sizeof(*ldip)); 439 xfs_alert(mp, 440 "Bad inode 0x%llx log dinode size 0x%x", 441 in_f->ilf_ino, item->ri_buf[1].i_len); 442 error = -EFSCORRUPTED; 443 goto out_release; 444 } 445 446 /* 447 * Recover the log dinode inode into the on disk inode. 448 * 449 * The LSN in the log dinode is garbage - it can be zero or reflect 450 * stale in-memory runtime state that isn't coherent with the changes 451 * logged in this transaction or the changes written to the on-disk 452 * inode. Hence we write the current lSN into the inode because that 453 * matches what xfs_iflush() would write inode the inode when flushing 454 * the changes in this transaction. 455 */ 456 xfs_log_dinode_to_disk(ldip, dip, current_lsn); 457 458 fields = in_f->ilf_fields; 459 if (fields & XFS_ILOG_DEV) 460 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); 461 462 if (in_f->ilf_size == 2) 463 goto out_owner_change; 464 len = item->ri_buf[2].i_len; 465 src = item->ri_buf[2].i_addr; 466 ASSERT(in_f->ilf_size <= 4); 467 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); 468 ASSERT(!(fields & XFS_ILOG_DFORK) || 469 (len == xlog_calc_iovec_len(in_f->ilf_dsize))); 470 471 switch (fields & XFS_ILOG_DFORK) { 472 case XFS_ILOG_DDATA: 473 case XFS_ILOG_DEXT: 474 memcpy(XFS_DFORK_DPTR(dip), src, len); 475 break; 476 477 case XFS_ILOG_DBROOT: 478 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, 479 (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), 480 XFS_DFORK_DSIZE(dip, mp)); 481 break; 482 483 default: 484 /* 485 * There are no data fork flags set. 486 */ 487 ASSERT((fields & XFS_ILOG_DFORK) == 0); 488 break; 489 } 490 491 /* 492 * If we logged any attribute data, recover it. There may or 493 * may not have been any other non-core data logged in this 494 * transaction. 495 */ 496 if (in_f->ilf_fields & XFS_ILOG_AFORK) { 497 if (in_f->ilf_fields & XFS_ILOG_DFORK) { 498 attr_index = 3; 499 } else { 500 attr_index = 2; 501 } 502 len = item->ri_buf[attr_index].i_len; 503 src = item->ri_buf[attr_index].i_addr; 504 ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize)); 505 506 switch (in_f->ilf_fields & XFS_ILOG_AFORK) { 507 case XFS_ILOG_ADATA: 508 case XFS_ILOG_AEXT: 509 dest = XFS_DFORK_APTR(dip); 510 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); 511 memcpy(dest, src, len); 512 break; 513 514 case XFS_ILOG_ABROOT: 515 dest = XFS_DFORK_APTR(dip); 516 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, 517 len, (struct xfs_bmdr_block *)dest, 518 XFS_DFORK_ASIZE(dip, mp)); 519 break; 520 521 default: 522 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 523 ASSERT(0); 524 error = -EFSCORRUPTED; 525 goto out_release; 526 } 527 } 528 529 out_owner_change: 530 /* Recover the swapext owner change unless inode has been deleted */ 531 if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && 532 (dip->di_mode != 0)) 533 error = xfs_recover_inode_owner_change(mp, dip, in_f, 534 buffer_list); 535 /* re-generate the checksum and validate the recovered inode. */ 536 xfs_dinode_calc_crc(log->l_mp, dip); 537 fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip); 538 if (fa) { 539 XFS_CORRUPTION_ERROR( 540 "Bad dinode after recovery", 541 XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip)); 542 xfs_alert(mp, 543 "Metadata corruption detected at %pS, inode 0x%llx", 544 fa, in_f->ilf_ino); 545 error = -EFSCORRUPTED; 546 goto out_release; 547 } 548 549 ASSERT(bp->b_mount == mp); 550 bp->b_flags |= _XBF_LOGRECOVERY; 551 xfs_buf_delwri_queue(bp, buffer_list); 552 553 out_release: 554 xfs_buf_relse(bp); 555 error: 556 if (need_free) 557 kfree(in_f); 558 return error; 559 } 560 561 const struct xlog_recover_item_ops xlog_inode_item_ops = { 562 .item_type = XFS_LI_INODE, 563 .ra_pass2 = xlog_recover_inode_ra_pass2, 564 .commit_pass2 = xlog_recover_inode_commit_pass2, 565 }; 566