1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_inode_item.h" 16 #include "xfs_trace.h" 17 #include "xfs_trans_priv.h" 18 #include "xfs_buf_item.h" 19 #include "xfs_log.h" 20 #include "xfs_error.h" 21 #include "xfs_log_priv.h" 22 #include "xfs_log_recover.h" 23 #include "xfs_icache.h" 24 #include "xfs_bmap_btree.h" 25 26 STATIC void 27 xlog_recover_inode_ra_pass2( 28 struct xlog *log, 29 struct xlog_recover_item *item) 30 { 31 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 32 struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; 33 34 xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, 35 &xfs_inode_buf_ra_ops); 36 } else { 37 struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; 38 39 xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, 40 &xfs_inode_buf_ra_ops); 41 } 42 } 43 44 /* 45 * Inode fork owner changes 46 * 47 * If we have been told that we have to reparent the inode fork, it's because an 48 * extent swap operation on a CRC enabled filesystem has been done and we are 49 * replaying it. We need to walk the BMBT of the appropriate fork and change the 50 * owners of it. 51 * 52 * The complexity here is that we don't have an inode context to work with, so 53 * after we've replayed the inode we need to instantiate one. This is where the 54 * fun begins. 55 * 56 * We are in the middle of log recovery, so we can't run transactions. That 57 * means we cannot use cache coherent inode instantiation via xfs_iget(), as 58 * that will result in the corresponding iput() running the inode through 59 * xfs_inactive(). If we've just replayed an inode core that changes the link 60 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run 61 * transactions (bad!). 62 * 63 * So, to avoid this, we instantiate an inode directly from the inode core we've 64 * just recovered. We have the buffer still locked, and all we really need to 65 * instantiate is the inode core and the forks being modified. We can do this 66 * manually, then run the inode btree owner change, and then tear down the 67 * xfs_inode without having to run any transactions at all. 68 * 69 * Also, because we don't have a transaction context available here but need to 70 * gather all the buffers we modify for writeback so we pass the buffer_list 71 * instead for the operation to use. 72 */ 73 74 STATIC int 75 xfs_recover_inode_owner_change( 76 struct xfs_mount *mp, 77 struct xfs_dinode *dip, 78 struct xfs_inode_log_format *in_f, 79 struct list_head *buffer_list) 80 { 81 struct xfs_inode *ip; 82 int error; 83 84 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); 85 86 ip = xfs_inode_alloc(mp, in_f->ilf_ino); 87 if (!ip) 88 return -ENOMEM; 89 90 /* instantiate the inode */ 91 ASSERT(dip->di_version >= 3); 92 93 error = xfs_inode_from_disk(ip, dip); 94 if (error) 95 goto out_free_ip; 96 97 if (in_f->ilf_fields & XFS_ILOG_DOWNER) { 98 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); 99 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, 100 ip->i_ino, buffer_list); 101 if (error) 102 goto out_free_ip; 103 } 104 105 if (in_f->ilf_fields & XFS_ILOG_AOWNER) { 106 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); 107 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, 108 ip->i_ino, buffer_list); 109 if (error) 110 goto out_free_ip; 111 } 112 113 out_free_ip: 114 xfs_inode_free(ip); 115 return error; 116 } 117 118 static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) 119 { 120 return ld->di_version >= 3 && 121 (ld->di_flags2 & XFS_DIFLAG2_BIGTIME); 122 } 123 124 /* Convert a log timestamp to an ondisk timestamp. */ 125 static inline xfs_timestamp_t 126 xfs_log_dinode_to_disk_ts( 127 struct xfs_log_dinode *from, 128 const xfs_log_timestamp_t its) 129 { 130 struct xfs_legacy_timestamp *lts; 131 struct xfs_log_legacy_timestamp *lits; 132 xfs_timestamp_t ts; 133 134 if (xfs_log_dinode_has_bigtime(from)) 135 return cpu_to_be64(its); 136 137 lts = (struct xfs_legacy_timestamp *)&ts; 138 lits = (struct xfs_log_legacy_timestamp *)&its; 139 lts->t_sec = cpu_to_be32(lits->t_sec); 140 lts->t_nsec = cpu_to_be32(lits->t_nsec); 141 142 return ts; 143 } 144 145 static inline bool xfs_log_dinode_has_large_extent_counts( 146 const struct xfs_log_dinode *ld) 147 { 148 return ld->di_version >= 3 && 149 (ld->di_flags2 & XFS_DIFLAG2_NREXT64); 150 } 151 152 static inline void 153 xfs_log_dinode_to_disk_iext_counters( 154 struct xfs_log_dinode *from, 155 struct xfs_dinode *to) 156 { 157 if (xfs_log_dinode_has_large_extent_counts(from)) { 158 to->di_big_nextents = cpu_to_be64(from->di_big_nextents); 159 to->di_big_anextents = cpu_to_be32(from->di_big_anextents); 160 to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad); 161 } else { 162 to->di_nextents = cpu_to_be32(from->di_nextents); 163 to->di_anextents = cpu_to_be16(from->di_anextents); 164 } 165 166 } 167 168 STATIC void 169 xfs_log_dinode_to_disk( 170 struct xfs_log_dinode *from, 171 struct xfs_dinode *to, 172 xfs_lsn_t lsn) 173 { 174 to->di_magic = cpu_to_be16(from->di_magic); 175 to->di_mode = cpu_to_be16(from->di_mode); 176 to->di_version = from->di_version; 177 to->di_format = from->di_format; 178 to->di_onlink = 0; 179 to->di_uid = cpu_to_be32(from->di_uid); 180 to->di_gid = cpu_to_be32(from->di_gid); 181 to->di_nlink = cpu_to_be32(from->di_nlink); 182 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 183 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 184 185 to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); 186 to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); 187 to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime); 188 189 to->di_size = cpu_to_be64(from->di_size); 190 to->di_nblocks = cpu_to_be64(from->di_nblocks); 191 to->di_extsize = cpu_to_be32(from->di_extsize); 192 to->di_forkoff = from->di_forkoff; 193 to->di_aformat = from->di_aformat; 194 to->di_dmevmask = cpu_to_be32(from->di_dmevmask); 195 to->di_dmstate = cpu_to_be16(from->di_dmstate); 196 to->di_flags = cpu_to_be16(from->di_flags); 197 to->di_gen = cpu_to_be32(from->di_gen); 198 199 if (from->di_version == 3) { 200 to->di_changecount = cpu_to_be64(from->di_changecount); 201 to->di_crtime = xfs_log_dinode_to_disk_ts(from, 202 from->di_crtime); 203 to->di_flags2 = cpu_to_be64(from->di_flags2); 204 to->di_cowextsize = cpu_to_be32(from->di_cowextsize); 205 to->di_ino = cpu_to_be64(from->di_ino); 206 to->di_lsn = cpu_to_be64(lsn); 207 memset(to->di_pad2, 0, sizeof(to->di_pad2)); 208 uuid_copy(&to->di_uuid, &from->di_uuid); 209 to->di_v3_pad = 0; 210 } else { 211 to->di_flushiter = cpu_to_be16(from->di_flushiter); 212 memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); 213 } 214 215 xfs_log_dinode_to_disk_iext_counters(from, to); 216 } 217 218 STATIC int 219 xlog_dinode_verify_extent_counts( 220 struct xfs_mount *mp, 221 struct xfs_log_dinode *ldip) 222 { 223 xfs_extnum_t nextents; 224 xfs_aextnum_t anextents; 225 226 if (xfs_log_dinode_has_large_extent_counts(ldip)) { 227 if (!xfs_has_large_extent_counts(mp) || 228 (ldip->di_nrext64_pad != 0)) { 229 XFS_CORRUPTION_ERROR( 230 "Bad log dinode large extent count format", 231 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 232 xfs_alert(mp, 233 "Bad inode 0x%llx, large extent counts %d, padding 0x%x", 234 ldip->di_ino, xfs_has_large_extent_counts(mp), 235 ldip->di_nrext64_pad); 236 return -EFSCORRUPTED; 237 } 238 239 nextents = ldip->di_big_nextents; 240 anextents = ldip->di_big_anextents; 241 } else { 242 if (ldip->di_version == 3 && ldip->di_v3_pad != 0) { 243 XFS_CORRUPTION_ERROR( 244 "Bad log dinode di_v3_pad", 245 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 246 xfs_alert(mp, 247 "Bad inode 0x%llx, di_v3_pad 0x%llx", 248 ldip->di_ino, ldip->di_v3_pad); 249 return -EFSCORRUPTED; 250 } 251 252 nextents = ldip->di_nextents; 253 anextents = ldip->di_anextents; 254 } 255 256 if (unlikely(nextents + anextents > ldip->di_nblocks)) { 257 XFS_CORRUPTION_ERROR("Bad log dinode extent counts", 258 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 259 xfs_alert(mp, 260 "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx", 261 ldip->di_ino, xfs_has_large_extent_counts(mp), nextents, 262 anextents, ldip->di_nblocks); 263 return -EFSCORRUPTED; 264 } 265 266 return 0; 267 } 268 269 STATIC int 270 xlog_recover_inode_commit_pass2( 271 struct xlog *log, 272 struct list_head *buffer_list, 273 struct xlog_recover_item *item, 274 xfs_lsn_t current_lsn) 275 { 276 struct xfs_inode_log_format *in_f; 277 struct xfs_mount *mp = log->l_mp; 278 struct xfs_buf *bp; 279 struct xfs_dinode *dip; 280 int len; 281 char *src; 282 char *dest; 283 int error; 284 int attr_index; 285 uint fields; 286 struct xfs_log_dinode *ldip; 287 uint isize; 288 int need_free = 0; 289 xfs_failaddr_t fa; 290 291 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 292 in_f = item->ri_buf[0].i_addr; 293 } else { 294 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); 295 need_free = 1; 296 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 297 if (error) 298 goto error; 299 } 300 301 /* 302 * Inode buffers can be freed, look out for it, 303 * and do not replay the inode. 304 */ 305 if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) { 306 error = 0; 307 trace_xfs_log_recover_inode_cancel(log, in_f); 308 goto error; 309 } 310 trace_xfs_log_recover_inode_recover(log, in_f); 311 312 error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 313 0, &bp, &xfs_inode_buf_ops); 314 if (error) 315 goto error; 316 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 317 dip = xfs_buf_offset(bp, in_f->ilf_boffset); 318 319 /* 320 * Make sure the place we're flushing out to really looks 321 * like an inode! 322 */ 323 if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { 324 xfs_alert(mp, 325 "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld", 326 __func__, dip, bp, in_f->ilf_ino); 327 error = -EFSCORRUPTED; 328 goto out_release; 329 } 330 ldip = item->ri_buf[1].i_addr; 331 if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { 332 xfs_alert(mp, 333 "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", 334 __func__, item, in_f->ilf_ino); 335 error = -EFSCORRUPTED; 336 goto out_release; 337 } 338 339 /* 340 * If the inode has an LSN in it, recover the inode only if the on-disk 341 * inode's LSN is older than the lsn of the transaction we are 342 * replaying. We can have multiple checkpoints with the same start LSN, 343 * so the current LSN being equal to the on-disk LSN doesn't necessarily 344 * mean that the on-disk inode is more recent than the change being 345 * replayed. 346 * 347 * We must check the current_lsn against the on-disk inode 348 * here because the we can't trust the log dinode to contain a valid LSN 349 * (see comment below before replaying the log dinode for details). 350 * 351 * Note: we still need to replay an owner change even though the inode 352 * is more recent than the transaction as there is no guarantee that all 353 * the btree blocks are more recent than this transaction, too. 354 */ 355 if (dip->di_version >= 3) { 356 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); 357 358 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) { 359 trace_xfs_log_recover_inode_skip(log, in_f); 360 error = 0; 361 goto out_owner_change; 362 } 363 } 364 365 /* 366 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes 367 * are transactional and if ordering is necessary we can determine that 368 * more accurately by the LSN field in the V3 inode core. Don't trust 369 * the inode versions we might be changing them here - use the 370 * superblock flag to determine whether we need to look at di_flushiter 371 * to skip replay when the on disk inode is newer than the log one 372 */ 373 if (!xfs_has_v3inodes(mp)) { 374 if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 375 /* 376 * Deal with the wrap case, DI_MAX_FLUSH is less 377 * than smaller numbers 378 */ 379 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && 380 ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { 381 /* do nothing */ 382 } else { 383 trace_xfs_log_recover_inode_skip(log, in_f); 384 error = 0; 385 goto out_release; 386 } 387 } 388 389 /* Take the opportunity to reset the flush iteration count */ 390 ldip->di_flushiter = 0; 391 } 392 393 394 if (unlikely(S_ISREG(ldip->di_mode))) { 395 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && 396 (ldip->di_format != XFS_DINODE_FMT_BTREE)) { 397 XFS_CORRUPTION_ERROR( 398 "Bad log dinode data fork format for regular file", 399 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 400 xfs_alert(mp, 401 "Bad inode 0x%llx, data fork format 0x%x", 402 in_f->ilf_ino, ldip->di_format); 403 error = -EFSCORRUPTED; 404 goto out_release; 405 } 406 } else if (unlikely(S_ISDIR(ldip->di_mode))) { 407 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && 408 (ldip->di_format != XFS_DINODE_FMT_BTREE) && 409 (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { 410 XFS_CORRUPTION_ERROR( 411 "Bad log dinode data fork format for directory", 412 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 413 xfs_alert(mp, 414 "Bad inode 0x%llx, data fork format 0x%x", 415 in_f->ilf_ino, ldip->di_format); 416 error = -EFSCORRUPTED; 417 goto out_release; 418 } 419 } 420 421 error = xlog_dinode_verify_extent_counts(mp, ldip); 422 if (error) 423 goto out_release; 424 425 if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { 426 XFS_CORRUPTION_ERROR("Bad log dinode fork offset", 427 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 428 xfs_alert(mp, 429 "Bad inode 0x%llx, di_forkoff 0x%x", 430 in_f->ilf_ino, ldip->di_forkoff); 431 error = -EFSCORRUPTED; 432 goto out_release; 433 } 434 isize = xfs_log_dinode_size(mp); 435 if (unlikely(item->ri_buf[1].i_len > isize)) { 436 XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW, 437 mp, ldip, sizeof(*ldip)); 438 xfs_alert(mp, 439 "Bad inode 0x%llx log dinode size 0x%x", 440 in_f->ilf_ino, item->ri_buf[1].i_len); 441 error = -EFSCORRUPTED; 442 goto out_release; 443 } 444 445 /* 446 * Recover the log dinode inode into the on disk inode. 447 * 448 * The LSN in the log dinode is garbage - it can be zero or reflect 449 * stale in-memory runtime state that isn't coherent with the changes 450 * logged in this transaction or the changes written to the on-disk 451 * inode. Hence we write the current lSN into the inode because that 452 * matches what xfs_iflush() would write inode the inode when flushing 453 * the changes in this transaction. 454 */ 455 xfs_log_dinode_to_disk(ldip, dip, current_lsn); 456 457 fields = in_f->ilf_fields; 458 if (fields & XFS_ILOG_DEV) 459 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); 460 461 if (in_f->ilf_size == 2) 462 goto out_owner_change; 463 len = item->ri_buf[2].i_len; 464 src = item->ri_buf[2].i_addr; 465 ASSERT(in_f->ilf_size <= 4); 466 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); 467 ASSERT(!(fields & XFS_ILOG_DFORK) || 468 (len == xlog_calc_iovec_len(in_f->ilf_dsize))); 469 470 switch (fields & XFS_ILOG_DFORK) { 471 case XFS_ILOG_DDATA: 472 case XFS_ILOG_DEXT: 473 memcpy(XFS_DFORK_DPTR(dip), src, len); 474 break; 475 476 case XFS_ILOG_DBROOT: 477 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, 478 (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), 479 XFS_DFORK_DSIZE(dip, mp)); 480 break; 481 482 default: 483 /* 484 * There are no data fork flags set. 485 */ 486 ASSERT((fields & XFS_ILOG_DFORK) == 0); 487 break; 488 } 489 490 /* 491 * If we logged any attribute data, recover it. There may or 492 * may not have been any other non-core data logged in this 493 * transaction. 494 */ 495 if (in_f->ilf_fields & XFS_ILOG_AFORK) { 496 if (in_f->ilf_fields & XFS_ILOG_DFORK) { 497 attr_index = 3; 498 } else { 499 attr_index = 2; 500 } 501 len = item->ri_buf[attr_index].i_len; 502 src = item->ri_buf[attr_index].i_addr; 503 ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize)); 504 505 switch (in_f->ilf_fields & XFS_ILOG_AFORK) { 506 case XFS_ILOG_ADATA: 507 case XFS_ILOG_AEXT: 508 dest = XFS_DFORK_APTR(dip); 509 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); 510 memcpy(dest, src, len); 511 break; 512 513 case XFS_ILOG_ABROOT: 514 dest = XFS_DFORK_APTR(dip); 515 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, 516 len, (struct xfs_bmdr_block *)dest, 517 XFS_DFORK_ASIZE(dip, mp)); 518 break; 519 520 default: 521 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 522 ASSERT(0); 523 error = -EFSCORRUPTED; 524 goto out_release; 525 } 526 } 527 528 out_owner_change: 529 /* Recover the swapext owner change unless inode has been deleted */ 530 if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && 531 (dip->di_mode != 0)) 532 error = xfs_recover_inode_owner_change(mp, dip, in_f, 533 buffer_list); 534 /* re-generate the checksum and validate the recovered inode. */ 535 xfs_dinode_calc_crc(log->l_mp, dip); 536 fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip); 537 if (fa) { 538 XFS_CORRUPTION_ERROR( 539 "Bad dinode after recovery", 540 XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip)); 541 xfs_alert(mp, 542 "Metadata corruption detected at %pS, inode 0x%llx", 543 fa, in_f->ilf_ino); 544 error = -EFSCORRUPTED; 545 goto out_release; 546 } 547 548 ASSERT(bp->b_mount == mp); 549 bp->b_flags |= _XBF_LOGRECOVERY; 550 xfs_buf_delwri_queue(bp, buffer_list); 551 552 out_release: 553 xfs_buf_relse(bp); 554 error: 555 if (need_free) 556 kmem_free(in_f); 557 return error; 558 } 559 560 const struct xlog_recover_item_ops xlog_inode_item_ops = { 561 .item_type = XFS_LI_INODE, 562 .ra_pass2 = xlog_recover_inode_ra_pass2, 563 .commit_pass2 = xlog_recover_inode_commit_pass2, 564 }; 565