1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs_platform.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_ag.h" 14 #include "xfs_inode.h" 15 #include "xfs_errortag.h" 16 #include "xfs_error.h" 17 #include "xfs_icache.h" 18 #include "xfs_trans.h" 19 #include "xfs_ialloc.h" 20 #include "xfs_dir2.h" 21 #include "xfs_health.h" 22 #include "xfs_metafile.h" 23 24 #include <linux/iversion.h> 25 26 /* 27 * If we are doing readahead on an inode buffer, we might be in log recovery 28 * reading an inode allocation buffer that hasn't yet been replayed, and hence 29 * has not had the inode cores stamped into it. Hence for readahead, the buffer 30 * may be potentially invalid. 31 * 32 * If the readahead buffer is invalid, we need to mark it with an error and 33 * clear the DONE status of the buffer so that a followup read will re-read it 34 * from disk. We don't report the error otherwise to avoid warnings during log 35 * recovery and we don't get unnecessary panics on debug kernels. We use EIO here 36 * because all we want to do is say readahead failed; there is no-one to report 37 * the error to, so this will distinguish it from a non-ra verifier failure. 38 * Changes to this readahead error behaviour also need to be reflected in 39 * xfs_dquot_buf_readahead_verify(). 40 */ 41 static void 42 xfs_inode_buf_verify( 43 struct xfs_buf *bp, 44 bool readahead) 45 { 46 struct xfs_mount *mp = bp->b_mount; 47 int i; 48 int ni; 49 50 /* 51 * Validate the magic number and version of every inode in the buffer 52 */ 53 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; 54 for (i = 0; i < ni; i++) { 55 struct xfs_dinode *dip; 56 xfs_agino_t unlinked_ino; 57 int di_ok; 58 59 dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); 60 unlinked_ino = be32_to_cpu(dip->di_next_unlinked); 61 di_ok = xfs_verify_magic16(bp, dip->di_magic) && 62 xfs_dinode_good_version(mp, dip->di_version) && 63 xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); 64 if (unlikely(!di_ok || 65 XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) { 66 if (readahead) { 67 bp->b_flags &= ~XBF_DONE; 68 xfs_buf_ioerror(bp, -EIO); 69 return; 70 } 71 72 #ifdef DEBUG 73 xfs_alert(mp, 74 "bad inode magic/vsn daddr %lld #%d (magic=%x)", 75 (unsigned long long)xfs_buf_daddr(bp), i, 76 be16_to_cpu(dip->di_magic)); 77 #endif 78 xfs_buf_verifier_error(bp, -EFSCORRUPTED, 79 __func__, dip, sizeof(*dip), 80 NULL); 81 return; 82 } 83 } 84 } 85 86 87 static void 88 xfs_inode_buf_read_verify( 89 struct xfs_buf *bp) 90 { 91 xfs_inode_buf_verify(bp, false); 92 } 93 94 static void 95 xfs_inode_buf_readahead_verify( 96 struct xfs_buf *bp) 97 { 98 xfs_inode_buf_verify(bp, true); 99 } 100 101 static void 102 xfs_inode_buf_write_verify( 103 struct xfs_buf *bp) 104 { 105 xfs_inode_buf_verify(bp, false); 106 } 107 108 const struct xfs_buf_ops xfs_inode_buf_ops = { 109 .name = "xfs_inode", 110 .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), 111 cpu_to_be16(XFS_DINODE_MAGIC) }, 112 .verify_read = xfs_inode_buf_read_verify, 113 .verify_write = xfs_inode_buf_write_verify, 114 }; 115 116 const struct xfs_buf_ops xfs_inode_buf_ra_ops = { 117 .name = "xfs_inode_ra", 118 .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), 119 cpu_to_be16(XFS_DINODE_MAGIC) }, 120 .verify_read = xfs_inode_buf_readahead_verify, 121 .verify_write = xfs_inode_buf_write_verify, 122 }; 123 124 125 /* 126 * This routine is called to map an inode to the buffer containing the on-disk 127 * version of the inode. It returns a pointer to the buffer containing the 128 * on-disk inode in the bpp parameter. 129 */ 130 int 131 xfs_imap_to_bp( 132 struct xfs_mount *mp, 133 struct xfs_trans *tp, 134 struct xfs_imap *imap, 135 struct xfs_buf **bpp) 136 { 137 int error; 138 139 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 140 imap->im_len, 0, bpp, &xfs_inode_buf_ops); 141 if (xfs_metadata_is_sick(error)) 142 xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), 143 XFS_SICK_AG_INODES); 144 return error; 145 } 146 147 static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) 148 { 149 struct timespec64 tv; 150 uint32_t n; 151 152 tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n)); 153 tv.tv_nsec = n; 154 155 return tv; 156 } 157 158 /* Convert an ondisk timestamp to an incore timestamp. */ 159 struct timespec64 160 xfs_inode_from_disk_ts( 161 struct xfs_dinode *dip, 162 const xfs_timestamp_t ts) 163 { 164 struct timespec64 tv; 165 struct xfs_legacy_timestamp *lts; 166 167 if (xfs_dinode_has_bigtime(dip)) 168 return xfs_inode_decode_bigtime(be64_to_cpu(ts)); 169 170 lts = (struct xfs_legacy_timestamp *)&ts; 171 tv.tv_sec = (int)be32_to_cpu(lts->t_sec); 172 tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec); 173 174 return tv; 175 } 176 177 int 178 xfs_inode_from_disk( 179 struct xfs_inode *ip, 180 struct xfs_dinode *from) 181 { 182 struct inode *inode = VFS_I(ip); 183 int error; 184 xfs_failaddr_t fa; 185 186 ASSERT(ip->i_cowfp == NULL); 187 188 fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); 189 if (fa) { 190 xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from, 191 sizeof(*from), fa); 192 return -EFSCORRUPTED; 193 } 194 195 /* 196 * First get the permanent information that is needed to allocate an 197 * inode. If the inode is unused, mode is zero and we shouldn't mess 198 * with the uninitialized part of it. 199 */ 200 if (!xfs_has_v3inodes(ip->i_mount)) 201 ip->i_flushiter = be16_to_cpu(from->di_flushiter); 202 inode->i_generation = be32_to_cpu(from->di_gen); 203 inode->i_mode = be16_to_cpu(from->di_mode); 204 if (!inode->i_mode) 205 return 0; 206 207 /* 208 * Convert v1 inodes immediately to v2 inode format as this is the 209 * minimum inode version format we support in the rest of the code. 210 * They will also be unconditionally written back to disk as v2 inodes. 211 */ 212 if (unlikely(from->di_version == 1)) { 213 /* di_metatype used to be di_onlink */ 214 set_nlink(inode, be16_to_cpu(from->di_metatype)); 215 ip->i_projid = 0; 216 } else { 217 set_nlink(inode, be32_to_cpu(from->di_nlink)); 218 ip->i_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | 219 be16_to_cpu(from->di_projid_lo); 220 if (xfs_dinode_is_metadir(from)) 221 ip->i_metatype = be16_to_cpu(from->di_metatype); 222 } 223 224 i_uid_write(inode, be32_to_cpu(from->di_uid)); 225 i_gid_write(inode, be32_to_cpu(from->di_gid)); 226 227 /* 228 * Time is signed, so need to convert to signed 32 bit before 229 * storing in inode timestamp which may be 64 bit. Otherwise 230 * a time before epoch is converted to a time long after epoch 231 * on 64 bit systems. 232 */ 233 inode_set_atime_to_ts(inode, 234 xfs_inode_from_disk_ts(from, from->di_atime)); 235 inode_set_mtime_to_ts(inode, 236 xfs_inode_from_disk_ts(from, from->di_mtime)); 237 inode_set_ctime_to_ts(inode, 238 xfs_inode_from_disk_ts(from, from->di_ctime)); 239 240 ip->i_disk_size = be64_to_cpu(from->di_size); 241 ip->i_nblocks = be64_to_cpu(from->di_nblocks); 242 ip->i_extsize = be32_to_cpu(from->di_extsize); 243 ip->i_forkoff = from->di_forkoff; 244 ip->i_diflags = be16_to_cpu(from->di_flags); 245 ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked); 246 247 if (from->di_dmevmask || from->di_dmstate) 248 xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS); 249 250 if (xfs_has_v3inodes(ip->i_mount)) { 251 inode_set_iversion_queried(inode, 252 be64_to_cpu(from->di_changecount)); 253 ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); 254 ip->i_diflags2 = be64_to_cpu(from->di_flags2); 255 /* also covers the di_used_blocks union arm: */ 256 ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); 257 BUILD_BUG_ON(sizeof(from->di_cowextsize) != 258 sizeof(from->di_used_blocks)); 259 } 260 261 error = xfs_iformat_data_fork(ip, from); 262 if (error) 263 return error; 264 if (from->di_forkoff) { 265 error = xfs_iformat_attr_fork(ip, from); 266 if (error) 267 goto out_destroy_data_fork; 268 } 269 if (xfs_is_reflink_inode(ip)) 270 xfs_ifork_init_cow(ip); 271 if (xfs_is_metadir_inode(ip)) { 272 XFS_STATS_DEC(ip->i_mount, xs_inodes_active); 273 XFS_STATS_INC(ip->i_mount, xs_inodes_meta); 274 } 275 return 0; 276 277 out_destroy_data_fork: 278 xfs_idestroy_fork(&ip->i_df); 279 return error; 280 } 281 282 /* Convert an incore timestamp to an ondisk timestamp. */ 283 static inline xfs_timestamp_t 284 xfs_inode_to_disk_ts( 285 struct xfs_inode *ip, 286 const struct timespec64 tv) 287 { 288 struct xfs_legacy_timestamp *lts; 289 xfs_timestamp_t ts; 290 291 if (xfs_inode_has_bigtime(ip)) 292 return cpu_to_be64(xfs_inode_encode_bigtime(tv)); 293 294 lts = (struct xfs_legacy_timestamp *)&ts; 295 lts->t_sec = cpu_to_be32(tv.tv_sec); 296 lts->t_nsec = cpu_to_be32(tv.tv_nsec); 297 298 return ts; 299 } 300 301 static inline void 302 xfs_inode_to_disk_iext_counters( 303 struct xfs_inode *ip, 304 struct xfs_dinode *to) 305 { 306 if (xfs_inode_has_large_extent_counts(ip)) { 307 to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); 308 to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af)); 309 /* 310 * We might be upgrading the inode to use larger extent counters 311 * than was previously used. Hence zero the unused field. 312 */ 313 to->di_nrext64_pad = cpu_to_be16(0); 314 } else { 315 to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); 316 to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af)); 317 } 318 } 319 320 void 321 xfs_inode_to_disk( 322 struct xfs_inode *ip, 323 struct xfs_dinode *to, 324 xfs_lsn_t lsn) 325 { 326 struct inode *inode = VFS_I(ip); 327 328 to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 329 if (xfs_is_metadir_inode(ip)) 330 to->di_metatype = cpu_to_be16(ip->i_metatype); 331 else 332 to->di_metatype = 0; 333 334 to->di_format = xfs_ifork_format(&ip->i_df); 335 to->di_uid = cpu_to_be32(i_uid_read(inode)); 336 to->di_gid = cpu_to_be32(i_gid_read(inode)); 337 to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); 338 to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); 339 340 to->di_atime = xfs_inode_to_disk_ts(ip, inode_get_atime(inode)); 341 to->di_mtime = xfs_inode_to_disk_ts(ip, inode_get_mtime(inode)); 342 to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode)); 343 to->di_nlink = cpu_to_be32(inode->i_nlink); 344 to->di_gen = cpu_to_be32(inode->i_generation); 345 to->di_mode = cpu_to_be16(inode->i_mode); 346 347 to->di_size = cpu_to_be64(ip->i_disk_size); 348 to->di_nblocks = cpu_to_be64(ip->i_nblocks); 349 to->di_extsize = cpu_to_be32(ip->i_extsize); 350 to->di_forkoff = ip->i_forkoff; 351 to->di_aformat = xfs_ifork_format(&ip->i_af); 352 to->di_flags = cpu_to_be16(ip->i_diflags); 353 354 if (xfs_has_v3inodes(ip->i_mount)) { 355 to->di_version = 3; 356 to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); 357 to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); 358 to->di_flags2 = cpu_to_be64(ip->i_diflags2); 359 /* also covers the di_used_blocks union arm: */ 360 to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); 361 to->di_ino = cpu_to_be64(ip->i_ino); 362 to->di_lsn = cpu_to_be64(lsn); 363 memset(to->di_pad2, 0, sizeof(to->di_pad2)); 364 uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); 365 to->di_v3_pad = 0; 366 } else { 367 to->di_version = 2; 368 to->di_flushiter = cpu_to_be16(ip->i_flushiter); 369 memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); 370 } 371 372 xfs_inode_to_disk_iext_counters(ip, to); 373 } 374 375 static xfs_failaddr_t 376 xfs_dinode_verify_fork( 377 struct xfs_dinode *dip, 378 struct xfs_mount *mp, 379 int whichfork) 380 { 381 xfs_extnum_t di_nextents; 382 xfs_extnum_t max_extents; 383 mode_t mode = be16_to_cpu(dip->di_mode); 384 uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork); 385 uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork); 386 387 di_nextents = xfs_dfork_nextents(dip, whichfork); 388 389 /* 390 * For fork types that can contain local data, check that the fork 391 * format matches the size of local data contained within the fork. 392 */ 393 if (whichfork == XFS_DATA_FORK) { 394 /* 395 * A directory small enough to fit in the inode must be stored 396 * in local format. The directory sf <-> extents conversion 397 * code updates the directory size accordingly. Directories 398 * being truncated have zero size and are not subject to this 399 * check. 400 */ 401 if (S_ISDIR(mode)) { 402 if (dip->di_size && 403 be64_to_cpu(dip->di_size) <= fork_size && 404 fork_format != XFS_DINODE_FMT_LOCAL) 405 return __this_address; 406 } 407 408 /* 409 * A symlink with a target small enough to fit in the inode can 410 * be stored in extents format if xattrs were added (thus 411 * converting the data fork from shortform to remote format) 412 * and then removed. 413 */ 414 if (S_ISLNK(mode)) { 415 if (be64_to_cpu(dip->di_size) <= fork_size && 416 fork_format != XFS_DINODE_FMT_EXTENTS && 417 fork_format != XFS_DINODE_FMT_LOCAL) 418 return __this_address; 419 } 420 421 /* 422 * For all types, check that when the size says the fork should 423 * be in extent or btree format, the inode isn't claiming to be 424 * in local format. 425 */ 426 if (be64_to_cpu(dip->di_size) > fork_size && 427 fork_format == XFS_DINODE_FMT_LOCAL) 428 return __this_address; 429 } 430 431 switch (fork_format) { 432 case XFS_DINODE_FMT_LOCAL: 433 /* 434 * No local regular files yet. 435 */ 436 if (S_ISREG(mode) && whichfork == XFS_DATA_FORK) 437 return __this_address; 438 if (di_nextents) 439 return __this_address; 440 break; 441 case XFS_DINODE_FMT_EXTENTS: 442 if (di_nextents > XFS_DFORK_MAXEXT(dip, mp, whichfork)) 443 return __this_address; 444 break; 445 case XFS_DINODE_FMT_BTREE: 446 max_extents = xfs_iext_max_nextents( 447 xfs_dinode_has_large_extent_counts(dip), 448 whichfork); 449 if (di_nextents > max_extents) 450 return __this_address; 451 break; 452 case XFS_DINODE_FMT_META_BTREE: 453 if (!xfs_has_metadir(mp)) 454 return __this_address; 455 if (!(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_METADATA))) 456 return __this_address; 457 switch (be16_to_cpu(dip->di_metatype)) { 458 case XFS_METAFILE_RTRMAP: 459 /* 460 * growfs must create the rtrmap inodes before adding a 461 * realtime volume to the filesystem, so we cannot use 462 * the rtrmapbt predicate here. 463 */ 464 if (!xfs_has_rmapbt(mp)) 465 return __this_address; 466 break; 467 case XFS_METAFILE_RTREFCOUNT: 468 /* same comment about growfs and rmap inodes applies */ 469 if (!xfs_has_reflink(mp)) 470 return __this_address; 471 break; 472 default: 473 return __this_address; 474 } 475 break; 476 default: 477 return __this_address; 478 } 479 return NULL; 480 } 481 482 static xfs_failaddr_t 483 xfs_dinode_verify_forkoff( 484 struct xfs_dinode *dip, 485 struct xfs_mount *mp) 486 { 487 if (!dip->di_forkoff) 488 return NULL; 489 490 switch (dip->di_format) { 491 case XFS_DINODE_FMT_DEV: 492 if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) 493 return __this_address; 494 break; 495 case XFS_DINODE_FMT_META_BTREE: 496 if (!xfs_has_metadir(mp) || !xfs_has_parent(mp)) 497 return __this_address; 498 fallthrough; 499 case XFS_DINODE_FMT_LOCAL: /* fall through ... */ 500 case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ 501 case XFS_DINODE_FMT_BTREE: 502 if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3)) 503 return __this_address; 504 break; 505 default: 506 return __this_address; 507 } 508 return NULL; 509 } 510 511 static xfs_failaddr_t 512 xfs_dinode_verify_nrext64( 513 struct xfs_mount *mp, 514 struct xfs_dinode *dip) 515 { 516 if (xfs_dinode_has_large_extent_counts(dip)) { 517 if (!xfs_has_large_extent_counts(mp)) 518 return __this_address; 519 if (dip->di_nrext64_pad != 0) 520 return __this_address; 521 } else if (dip->di_version >= 3) { 522 if (dip->di_v3_pad != 0) 523 return __this_address; 524 } 525 526 return NULL; 527 } 528 529 /* 530 * Validate all the picky requirements we have for a file that claims to be 531 * filesystem metadata. 532 */ 533 xfs_failaddr_t 534 xfs_dinode_verify_metadir( 535 struct xfs_mount *mp, 536 struct xfs_dinode *dip, 537 uint16_t mode, 538 uint16_t flags, 539 uint64_t flags2) 540 { 541 if (!xfs_has_metadir(mp)) 542 return __this_address; 543 544 /* V5 filesystem only */ 545 if (dip->di_version < 3) 546 return __this_address; 547 548 if (be16_to_cpu(dip->di_metatype) >= XFS_METAFILE_MAX) 549 return __this_address; 550 551 /* V3 inode fields that are always zero */ 552 if ((flags2 & XFS_DIFLAG2_NREXT64) && dip->di_nrext64_pad) 553 return __this_address; 554 if (!(flags2 & XFS_DIFLAG2_NREXT64) && dip->di_flushiter) 555 return __this_address; 556 557 /* Metadata files can only be directories or regular files */ 558 if (!S_ISDIR(mode) && !S_ISREG(mode)) 559 return __this_address; 560 561 /* They must have zero access permissions */ 562 if (mode & 0777) 563 return __this_address; 564 565 /* DMAPI event and state masks are zero */ 566 if (dip->di_dmevmask || dip->di_dmstate) 567 return __this_address; 568 569 /* 570 * User and group IDs must be zero. The project ID is used for 571 * grouping inodes. Metadata inodes are never accounted to quotas. 572 */ 573 if (dip->di_uid || dip->di_gid) 574 return __this_address; 575 576 /* Mandatory inode flags must be set */ 577 if (S_ISDIR(mode)) { 578 if ((flags & XFS_METADIR_DIFLAGS) != XFS_METADIR_DIFLAGS) 579 return __this_address; 580 } else { 581 if ((flags & XFS_METAFILE_DIFLAGS) != XFS_METAFILE_DIFLAGS) 582 return __this_address; 583 } 584 585 /* dax flags2 must not be set */ 586 if (flags2 & XFS_DIFLAG2_DAX) 587 return __this_address; 588 589 return NULL; 590 } 591 592 xfs_failaddr_t 593 xfs_dinode_verify( 594 struct xfs_mount *mp, 595 xfs_ino_t ino, 596 struct xfs_dinode *dip) 597 { 598 xfs_failaddr_t fa; 599 uint16_t mode; 600 uint16_t flags; 601 uint64_t flags2; 602 uint64_t di_size; 603 xfs_extnum_t nextents; 604 xfs_extnum_t naextents; 605 xfs_filblks_t nblocks; 606 607 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) 608 return __this_address; 609 610 /* Verify v3 integrity information first */ 611 if (dip->di_version >= 3) { 612 if (!xfs_has_v3inodes(mp)) 613 return __this_address; 614 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 615 XFS_DINODE_CRC_OFF)) 616 return __this_address; 617 if (be64_to_cpu(dip->di_ino) != ino) 618 return __this_address; 619 if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) 620 return __this_address; 621 } 622 623 /* 624 * Historical note: xfsprogs in the 3.2 era set up its incore inodes to 625 * have di_nlink track the link count, even if the actual filesystem 626 * only supported V1 inodes (i.e. di_onlink). When writing out the 627 * ondisk inode, it would set both the ondisk di_nlink and di_onlink to 628 * the the incore di_nlink value, which is why we cannot check for 629 * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with 630 * di_onlink==0, so we can check that. 631 */ 632 if (dip->di_version == 2) { 633 if (dip->di_metatype) 634 return __this_address; 635 } else if (dip->di_version >= 3) { 636 if (!xfs_dinode_is_metadir(dip) && dip->di_metatype) 637 return __this_address; 638 } 639 640 /* don't allow invalid i_size */ 641 di_size = be64_to_cpu(dip->di_size); 642 if (di_size & (1ULL << 63)) 643 return __this_address; 644 645 mode = be16_to_cpu(dip->di_mode); 646 if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) 647 return __this_address; 648 649 /* 650 * No zero-length symlinks/dirs unless they're unlinked and hence being 651 * inactivated. 652 */ 653 if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) { 654 if (dip->di_version > 1) { 655 if (dip->di_nlink) 656 return __this_address; 657 } else { 658 /* di_metatype used to be di_onlink */ 659 if (dip->di_metatype) 660 return __this_address; 661 } 662 } 663 664 fa = xfs_dinode_verify_nrext64(mp, dip); 665 if (fa) 666 return fa; 667 668 nextents = xfs_dfork_data_extents(dip); 669 naextents = xfs_dfork_attr_extents(dip); 670 nblocks = be64_to_cpu(dip->di_nblocks); 671 672 /* Fork checks carried over from xfs_iformat_fork */ 673 if (mode && nextents + naextents > nblocks) 674 return __this_address; 675 676 if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) 677 return __this_address; 678 679 if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize) 680 return __this_address; 681 682 flags = be16_to_cpu(dip->di_flags); 683 684 if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) 685 return __this_address; 686 687 /* check for illegal values of forkoff */ 688 fa = xfs_dinode_verify_forkoff(dip, mp); 689 if (fa) 690 return fa; 691 692 /* Do we have appropriate data fork formats for the mode? */ 693 switch (mode & S_IFMT) { 694 case S_IFIFO: 695 case S_IFCHR: 696 case S_IFBLK: 697 case S_IFSOCK: 698 if (dip->di_format != XFS_DINODE_FMT_DEV) 699 return __this_address; 700 break; 701 case S_IFREG: 702 case S_IFLNK: 703 case S_IFDIR: 704 fa = xfs_dinode_verify_fork(dip, mp, XFS_DATA_FORK); 705 if (fa) 706 return fa; 707 break; 708 case 0: 709 /* Uninitialized inode ok. */ 710 break; 711 default: 712 return __this_address; 713 } 714 715 if (dip->di_forkoff) { 716 fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); 717 if (fa) 718 return fa; 719 } else { 720 /* 721 * If there is no fork offset, this may be a freshly-made inode 722 * in a new disk cluster, in which case di_aformat is zeroed. 723 * Otherwise, such an inode must be in EXTENTS format; this goes 724 * for freed inodes as well. 725 */ 726 switch (dip->di_aformat) { 727 case 0: 728 case XFS_DINODE_FMT_EXTENTS: 729 break; 730 default: 731 return __this_address; 732 } 733 if (naextents) 734 return __this_address; 735 } 736 737 /* extent size hint validation */ 738 fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), 739 mode, flags); 740 if (fa) 741 return fa; 742 743 /* only version 3 or greater inodes are extensively verified here */ 744 if (dip->di_version < 3) 745 return NULL; 746 747 flags2 = be64_to_cpu(dip->di_flags2); 748 749 /* don't allow reflink/cowextsize if we don't have reflink */ 750 if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && 751 !xfs_has_reflink(mp)) 752 return __this_address; 753 754 /* only regular files get reflink */ 755 if ((flags2 & XFS_DIFLAG2_REFLINK) && (mode & S_IFMT) != S_IFREG) 756 return __this_address; 757 758 /* don't let reflink and realtime mix */ 759 if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME) && 760 !xfs_has_rtreflink(mp)) 761 return __this_address; 762 763 if (xfs_has_zoned(mp) && 764 dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { 765 if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) 766 return __this_address; 767 } else { 768 /* COW extent size hint validation */ 769 fa = xfs_inode_validate_cowextsize(mp, 770 be32_to_cpu(dip->di_cowextsize), 771 mode, flags, flags2); 772 if (fa) 773 return fa; 774 } 775 776 /* bigtime iflag can only happen on bigtime filesystems */ 777 if (xfs_dinode_has_bigtime(dip) && 778 !xfs_has_bigtime(mp)) 779 return __this_address; 780 781 if (flags2 & XFS_DIFLAG2_METADATA) { 782 fa = xfs_dinode_verify_metadir(mp, dip, mode, flags, flags2); 783 if (fa) 784 return fa; 785 } 786 787 /* metadata inodes containing btrees always have zero extent count */ 788 if (XFS_DFORK_FORMAT(dip, XFS_DATA_FORK) != XFS_DINODE_FMT_META_BTREE) { 789 if (nextents + naextents == 0 && nblocks != 0) 790 return __this_address; 791 } 792 793 return NULL; 794 } 795 796 void 797 xfs_dinode_calc_crc( 798 struct xfs_mount *mp, 799 struct xfs_dinode *dip) 800 { 801 uint32_t crc; 802 803 if (dip->di_version < 3) 804 return; 805 806 ASSERT(xfs_has_crc(mp)); 807 crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, 808 XFS_DINODE_CRC_OFF); 809 dip->di_crc = xfs_end_cksum(crc); 810 } 811 812 /* 813 * Validate di_extsize hint. 814 * 815 * 1. Extent size hint is only valid for directories and regular files. 816 * 2. FS_XFLAG_EXTSIZE is only valid for regular files. 817 * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. 818 * 4. Hint cannot be larger than MAXTEXTLEN. 819 * 5. Can be changed on directories at any time. 820 * 6. Hint value of 0 turns off hints, clears inode flags. 821 * 7. Extent size must be a multiple of the appropriate block size. 822 * For realtime files, this is the rt extent size. 823 * 8. For non-realtime files, the extent size hint must be limited 824 * to half the AG size to avoid alignment extending the extent beyond the 825 * limits of the AG. 826 */ 827 xfs_failaddr_t 828 xfs_inode_validate_extsize( 829 struct xfs_mount *mp, 830 uint32_t extsize, 831 uint16_t mode, 832 uint16_t flags) 833 { 834 bool rt_flag; 835 bool hint_flag; 836 bool inherit_flag; 837 uint32_t extsize_bytes; 838 uint32_t blocksize_bytes; 839 840 rt_flag = (flags & XFS_DIFLAG_REALTIME); 841 hint_flag = (flags & XFS_DIFLAG_EXTSIZE); 842 inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); 843 extsize_bytes = XFS_FSB_TO_B(mp, extsize); 844 845 /* 846 * This comment describes a historic gap in this verifier function. 847 * 848 * For a directory with both RTINHERIT and EXTSZINHERIT flags set, this 849 * function has never checked that the extent size hint is an integer 850 * multiple of the realtime extent size. Since we allow users to set 851 * this combination on non-rt filesystems /and/ to change the rt 852 * extent size when adding a rt device to a filesystem, the net effect 853 * is that users can configure a filesystem anticipating one rt 854 * geometry and change their minds later. Directories do not use the 855 * extent size hint, so this is harmless for them. 856 * 857 * If a directory with a misaligned extent size hint is allowed to 858 * propagate that hint into a new regular realtime file, the result 859 * is that the inode cluster buffer verifier will trigger a corruption 860 * shutdown the next time it is run, because the verifier has always 861 * enforced the alignment rule for regular files. 862 * 863 * Because we allow administrators to set a new rt extent size when 864 * adding a rt section, we cannot add a check to this verifier because 865 * that will result a new source of directory corruption errors when 866 * reading an existing filesystem. Instead, we rely on callers to 867 * decide when alignment checks are appropriate, and fix things up as 868 * needed. 869 */ 870 871 if (rt_flag) 872 blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); 873 else 874 blocksize_bytes = mp->m_sb.sb_blocksize; 875 876 if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) 877 return __this_address; 878 879 if (hint_flag && !S_ISREG(mode)) 880 return __this_address; 881 882 if (inherit_flag && !S_ISDIR(mode)) 883 return __this_address; 884 885 if ((hint_flag || inherit_flag) && extsize == 0) 886 return __this_address; 887 888 /* free inodes get flags set to zero but extsize remains */ 889 if (mode && !(hint_flag || inherit_flag) && extsize != 0) 890 return __this_address; 891 892 if (extsize_bytes % blocksize_bytes) 893 return __this_address; 894 895 if (extsize > XFS_MAX_BMBT_EXTLEN) 896 return __this_address; 897 898 if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) 899 return __this_address; 900 901 return NULL; 902 } 903 904 /* 905 * Validate di_cowextsize hint. 906 * 907 * 1. CoW extent size hint can only be set if reflink is enabled on the fs. 908 * The inode does not have to have any shared blocks, but it must be a v3. 909 * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; 910 * for a directory, the hint is propagated to new files. 911 * 3. Can be changed on files & directories at any time. 912 * 4. Hint value of 0 turns off hints, clears inode flags. 913 * 5. Extent size must be a multiple of the appropriate block size. 914 * 6. The extent size hint must be limited to half the AG size to avoid 915 * alignment extending the extent beyond the limits of the AG. 916 */ 917 xfs_failaddr_t 918 xfs_inode_validate_cowextsize( 919 struct xfs_mount *mp, 920 uint32_t cowextsize, 921 uint16_t mode, 922 uint16_t flags, 923 uint64_t flags2) 924 { 925 bool rt_flag; 926 bool hint_flag; 927 uint32_t cowextsize_bytes; 928 uint32_t blocksize_bytes; 929 930 rt_flag = (flags & XFS_DIFLAG_REALTIME); 931 hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); 932 cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); 933 934 /* 935 * Similar to extent size hints, a directory can be configured to 936 * propagate realtime status and a CoW extent size hint to newly 937 * created files even if there is no realtime device, and the hints on 938 * disk can become misaligned if the sysadmin changes the rt extent 939 * size while adding the realtime device. 940 * 941 * Therefore, we can only enforce the rextsize alignment check against 942 * regular realtime files, and rely on callers to decide when alignment 943 * checks are appropriate, and fix things up as needed. 944 */ 945 946 if (rt_flag) 947 blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); 948 else 949 blocksize_bytes = mp->m_sb.sb_blocksize; 950 951 if (hint_flag && !xfs_has_reflink(mp)) 952 return __this_address; 953 954 if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) 955 return __this_address; 956 957 if (hint_flag && cowextsize == 0) 958 return __this_address; 959 960 /* free inodes get flags set to zero but cowextsize remains */ 961 if (mode && !hint_flag && cowextsize != 0) 962 return __this_address; 963 964 if (cowextsize_bytes % blocksize_bytes) 965 return __this_address; 966 967 if (cowextsize > XFS_MAX_BMBT_EXTLEN) 968 return __this_address; 969 970 if (!rt_flag && cowextsize > mp->m_sb.sb_agblocks / 2) 971 return __this_address; 972 973 return NULL; 974 } 975