1 /* 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_fs.h" 20 #include "xfs_shared.h" 21 #include "xfs_format.h" 22 #include "xfs_log_format.h" 23 #include "xfs_trans_resv.h" 24 #include "xfs_mount.h" 25 #include "xfs_defer.h" 26 #include "xfs_inode.h" 27 #include "xfs_error.h" 28 #include "xfs_cksum.h" 29 #include "xfs_icache.h" 30 #include "xfs_trans.h" 31 #include "xfs_ialloc.h" 32 33 /* 34 * Check that none of the inode's in the buffer have a next 35 * unlinked field of 0. 36 */ 37 #if defined(DEBUG) 38 void 39 xfs_inobp_check( 40 xfs_mount_t *mp, 41 xfs_buf_t *bp) 42 { 43 int i; 44 int j; 45 xfs_dinode_t *dip; 46 47 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; 48 49 for (i = 0; i < j; i++) { 50 dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); 51 if (!dip->di_next_unlinked) { 52 xfs_alert(mp, 53 "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", 54 i, (long long)bp->b_bn); 55 } 56 } 57 } 58 #endif 59 60 /* 61 * If we are doing readahead on an inode buffer, we might be in log recovery 62 * reading an inode allocation buffer that hasn't yet been replayed, and hence 63 * has not had the inode cores stamped into it. Hence for readahead, the buffer 64 * may be potentially invalid. 65 * 66 * If the readahead buffer is invalid, we need to mark it with an error and 67 * clear the DONE status of the buffer so that a followup read will re-read it 68 * from disk. We don't report the error otherwise to avoid warnings during log 69 * recovery and we don't get unnecssary panics on debug kernels. We use EIO here 70 * because all we want to do is say readahead failed; there is no-one to report 71 * the error to, so this will distinguish it from a non-ra verifier failure. 72 * Changes to this readahead error behavour also need to be reflected in 73 * xfs_dquot_buf_readahead_verify(). 74 */ 75 static void 76 xfs_inode_buf_verify( 77 struct xfs_buf *bp, 78 bool readahead) 79 { 80 struct xfs_mount *mp = bp->b_target->bt_mount; 81 int i; 82 int ni; 83 84 /* 85 * Validate the magic number and version of every inode in the buffer 86 */ 87 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; 88 for (i = 0; i < ni; i++) { 89 int di_ok; 90 xfs_dinode_t *dip; 91 92 dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); 93 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 94 XFS_DINODE_GOOD_VERSION(dip->di_version); 95 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 96 XFS_ERRTAG_ITOBP_INOTOBP, 97 XFS_RANDOM_ITOBP_INOTOBP))) { 98 if (readahead) { 99 bp->b_flags &= ~XBF_DONE; 100 xfs_buf_ioerror(bp, -EIO); 101 return; 102 } 103 104 xfs_buf_ioerror(bp, -EFSCORRUPTED); 105 xfs_verifier_error(bp); 106 #ifdef DEBUG 107 xfs_alert(mp, 108 "bad inode magic/vsn daddr %lld #%d (magic=%x)", 109 (unsigned long long)bp->b_bn, i, 110 be16_to_cpu(dip->di_magic)); 111 #endif 112 } 113 } 114 xfs_inobp_check(mp, bp); 115 } 116 117 118 static void 119 xfs_inode_buf_read_verify( 120 struct xfs_buf *bp) 121 { 122 xfs_inode_buf_verify(bp, false); 123 } 124 125 static void 126 xfs_inode_buf_readahead_verify( 127 struct xfs_buf *bp) 128 { 129 xfs_inode_buf_verify(bp, true); 130 } 131 132 static void 133 xfs_inode_buf_write_verify( 134 struct xfs_buf *bp) 135 { 136 xfs_inode_buf_verify(bp, false); 137 } 138 139 const struct xfs_buf_ops xfs_inode_buf_ops = { 140 .name = "xfs_inode", 141 .verify_read = xfs_inode_buf_read_verify, 142 .verify_write = xfs_inode_buf_write_verify, 143 }; 144 145 const struct xfs_buf_ops xfs_inode_buf_ra_ops = { 146 .name = "xxfs_inode_ra", 147 .verify_read = xfs_inode_buf_readahead_verify, 148 .verify_write = xfs_inode_buf_write_verify, 149 }; 150 151 152 /* 153 * This routine is called to map an inode to the buffer containing the on-disk 154 * version of the inode. It returns a pointer to the buffer containing the 155 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a 156 * pointer to the on-disk inode within that buffer. 157 * 158 * If a non-zero error is returned, then the contents of bpp and dipp are 159 * undefined. 160 */ 161 int 162 xfs_imap_to_bp( 163 struct xfs_mount *mp, 164 struct xfs_trans *tp, 165 struct xfs_imap *imap, 166 struct xfs_dinode **dipp, 167 struct xfs_buf **bpp, 168 uint buf_flags, 169 uint iget_flags) 170 { 171 struct xfs_buf *bp; 172 int error; 173 174 buf_flags |= XBF_UNMAPPED; 175 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 176 (int)imap->im_len, buf_flags, &bp, 177 &xfs_inode_buf_ops); 178 if (error) { 179 if (error == -EAGAIN) { 180 ASSERT(buf_flags & XBF_TRYLOCK); 181 return error; 182 } 183 184 if (error == -EFSCORRUPTED && 185 (iget_flags & XFS_IGET_UNTRUSTED)) 186 return -EINVAL; 187 188 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", 189 __func__, error); 190 return error; 191 } 192 193 *bpp = bp; 194 *dipp = xfs_buf_offset(bp, imap->im_boffset); 195 return 0; 196 } 197 198 void 199 xfs_inode_from_disk( 200 struct xfs_inode *ip, 201 struct xfs_dinode *from) 202 { 203 struct xfs_icdinode *to = &ip->i_d; 204 struct inode *inode = VFS_I(ip); 205 206 207 /* 208 * Convert v1 inodes immediately to v2 inode format as this is the 209 * minimum inode version format we support in the rest of the code. 210 */ 211 to->di_version = from->di_version; 212 if (to->di_version == 1) { 213 set_nlink(inode, be16_to_cpu(from->di_onlink)); 214 to->di_projid_lo = 0; 215 to->di_projid_hi = 0; 216 to->di_version = 2; 217 } else { 218 set_nlink(inode, be32_to_cpu(from->di_nlink)); 219 to->di_projid_lo = be16_to_cpu(from->di_projid_lo); 220 to->di_projid_hi = be16_to_cpu(from->di_projid_hi); 221 } 222 223 to->di_format = from->di_format; 224 to->di_uid = be32_to_cpu(from->di_uid); 225 to->di_gid = be32_to_cpu(from->di_gid); 226 to->di_flushiter = be16_to_cpu(from->di_flushiter); 227 228 /* 229 * Time is signed, so need to convert to signed 32 bit before 230 * storing in inode timestamp which may be 64 bit. Otherwise 231 * a time before epoch is converted to a time long after epoch 232 * on 64 bit systems. 233 */ 234 inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec); 235 inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec); 236 inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec); 237 inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); 238 inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); 239 inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); 240 inode->i_generation = be32_to_cpu(from->di_gen); 241 inode->i_mode = be16_to_cpu(from->di_mode); 242 243 to->di_size = be64_to_cpu(from->di_size); 244 to->di_nblocks = be64_to_cpu(from->di_nblocks); 245 to->di_extsize = be32_to_cpu(from->di_extsize); 246 to->di_nextents = be32_to_cpu(from->di_nextents); 247 to->di_anextents = be16_to_cpu(from->di_anextents); 248 to->di_forkoff = from->di_forkoff; 249 to->di_aformat = from->di_aformat; 250 to->di_dmevmask = be32_to_cpu(from->di_dmevmask); 251 to->di_dmstate = be16_to_cpu(from->di_dmstate); 252 to->di_flags = be16_to_cpu(from->di_flags); 253 254 if (to->di_version == 3) { 255 inode->i_version = be64_to_cpu(from->di_changecount); 256 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); 257 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); 258 to->di_flags2 = be64_to_cpu(from->di_flags2); 259 to->di_cowextsize = be32_to_cpu(from->di_cowextsize); 260 } 261 } 262 263 void 264 xfs_inode_to_disk( 265 struct xfs_inode *ip, 266 struct xfs_dinode *to, 267 xfs_lsn_t lsn) 268 { 269 struct xfs_icdinode *from = &ip->i_d; 270 struct inode *inode = VFS_I(ip); 271 272 to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 273 to->di_onlink = 0; 274 275 to->di_version = from->di_version; 276 to->di_format = from->di_format; 277 to->di_uid = cpu_to_be32(from->di_uid); 278 to->di_gid = cpu_to_be32(from->di_gid); 279 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 280 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 281 282 memset(to->di_pad, 0, sizeof(to->di_pad)); 283 to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); 284 to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); 285 to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); 286 to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); 287 to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); 288 to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); 289 to->di_nlink = cpu_to_be32(inode->i_nlink); 290 to->di_gen = cpu_to_be32(inode->i_generation); 291 to->di_mode = cpu_to_be16(inode->i_mode); 292 293 to->di_size = cpu_to_be64(from->di_size); 294 to->di_nblocks = cpu_to_be64(from->di_nblocks); 295 to->di_extsize = cpu_to_be32(from->di_extsize); 296 to->di_nextents = cpu_to_be32(from->di_nextents); 297 to->di_anextents = cpu_to_be16(from->di_anextents); 298 to->di_forkoff = from->di_forkoff; 299 to->di_aformat = from->di_aformat; 300 to->di_dmevmask = cpu_to_be32(from->di_dmevmask); 301 to->di_dmstate = cpu_to_be16(from->di_dmstate); 302 to->di_flags = cpu_to_be16(from->di_flags); 303 304 if (from->di_version == 3) { 305 to->di_changecount = cpu_to_be64(inode->i_version); 306 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); 307 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); 308 to->di_flags2 = cpu_to_be64(from->di_flags2); 309 to->di_cowextsize = cpu_to_be32(from->di_cowextsize); 310 to->di_ino = cpu_to_be64(ip->i_ino); 311 to->di_lsn = cpu_to_be64(lsn); 312 memset(to->di_pad2, 0, sizeof(to->di_pad2)); 313 uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); 314 to->di_flushiter = 0; 315 } else { 316 to->di_flushiter = cpu_to_be16(from->di_flushiter); 317 } 318 } 319 320 void 321 xfs_log_dinode_to_disk( 322 struct xfs_log_dinode *from, 323 struct xfs_dinode *to) 324 { 325 to->di_magic = cpu_to_be16(from->di_magic); 326 to->di_mode = cpu_to_be16(from->di_mode); 327 to->di_version = from->di_version; 328 to->di_format = from->di_format; 329 to->di_onlink = 0; 330 to->di_uid = cpu_to_be32(from->di_uid); 331 to->di_gid = cpu_to_be32(from->di_gid); 332 to->di_nlink = cpu_to_be32(from->di_nlink); 333 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 334 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 335 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 336 337 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 338 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); 339 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); 340 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); 341 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); 342 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); 343 344 to->di_size = cpu_to_be64(from->di_size); 345 to->di_nblocks = cpu_to_be64(from->di_nblocks); 346 to->di_extsize = cpu_to_be32(from->di_extsize); 347 to->di_nextents = cpu_to_be32(from->di_nextents); 348 to->di_anextents = cpu_to_be16(from->di_anextents); 349 to->di_forkoff = from->di_forkoff; 350 to->di_aformat = from->di_aformat; 351 to->di_dmevmask = cpu_to_be32(from->di_dmevmask); 352 to->di_dmstate = cpu_to_be16(from->di_dmstate); 353 to->di_flags = cpu_to_be16(from->di_flags); 354 to->di_gen = cpu_to_be32(from->di_gen); 355 356 if (from->di_version == 3) { 357 to->di_changecount = cpu_to_be64(from->di_changecount); 358 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); 359 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); 360 to->di_flags2 = cpu_to_be64(from->di_flags2); 361 to->di_cowextsize = cpu_to_be32(from->di_cowextsize); 362 to->di_ino = cpu_to_be64(from->di_ino); 363 to->di_lsn = cpu_to_be64(from->di_lsn); 364 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); 365 uuid_copy(&to->di_uuid, &from->di_uuid); 366 to->di_flushiter = 0; 367 } else { 368 to->di_flushiter = cpu_to_be16(from->di_flushiter); 369 } 370 } 371 372 static bool 373 xfs_dinode_verify( 374 struct xfs_mount *mp, 375 struct xfs_inode *ip, 376 struct xfs_dinode *dip) 377 { 378 uint16_t flags; 379 uint64_t flags2; 380 381 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) 382 return false; 383 384 /* only version 3 or greater inodes are extensively verified here */ 385 if (dip->di_version < 3) 386 return true; 387 388 if (!xfs_sb_version_hascrc(&mp->m_sb)) 389 return false; 390 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 391 XFS_DINODE_CRC_OFF)) 392 return false; 393 if (be64_to_cpu(dip->di_ino) != ip->i_ino) 394 return false; 395 if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) 396 return false; 397 398 flags = be16_to_cpu(dip->di_flags); 399 flags2 = be64_to_cpu(dip->di_flags2); 400 401 /* don't allow reflink/cowextsize if we don't have reflink */ 402 if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && 403 !xfs_sb_version_hasreflink(&mp->m_sb)) 404 return false; 405 406 /* don't let reflink and realtime mix */ 407 if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) 408 return false; 409 410 /* don't let reflink and dax mix */ 411 if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) 412 return false; 413 414 return true; 415 } 416 417 void 418 xfs_dinode_calc_crc( 419 struct xfs_mount *mp, 420 struct xfs_dinode *dip) 421 { 422 __uint32_t crc; 423 424 if (dip->di_version < 3) 425 return; 426 427 ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); 428 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, 429 XFS_DINODE_CRC_OFF); 430 dip->di_crc = xfs_end_cksum(crc); 431 } 432 433 /* 434 * Read the disk inode attributes into the in-core inode structure. 435 * 436 * For version 5 superblocks, if we are initialising a new inode and we are not 437 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new 438 * inode core with a random generation number. If we are keeping inodes around, 439 * we need to read the inode cluster to get the existing generation number off 440 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode 441 * format) then log recovery is dependent on the di_flushiter field being 442 * initialised from the current on-disk value and hence we must also read the 443 * inode off disk. 444 */ 445 int 446 xfs_iread( 447 xfs_mount_t *mp, 448 xfs_trans_t *tp, 449 xfs_inode_t *ip, 450 uint iget_flags) 451 { 452 xfs_buf_t *bp; 453 xfs_dinode_t *dip; 454 int error; 455 456 /* 457 * Fill in the location information in the in-core inode. 458 */ 459 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); 460 if (error) 461 return error; 462 463 /* shortcut IO on inode allocation if possible */ 464 if ((iget_flags & XFS_IGET_CREATE) && 465 xfs_sb_version_hascrc(&mp->m_sb) && 466 !(mp->m_flags & XFS_MOUNT_IKEEP)) { 467 /* initialise the on-disk inode core */ 468 memset(&ip->i_d, 0, sizeof(ip->i_d)); 469 VFS_I(ip)->i_generation = prandom_u32(); 470 if (xfs_sb_version_hascrc(&mp->m_sb)) 471 ip->i_d.di_version = 3; 472 else 473 ip->i_d.di_version = 2; 474 return 0; 475 } 476 477 /* 478 * Get pointers to the on-disk inode and the buffer containing it. 479 */ 480 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); 481 if (error) 482 return error; 483 484 /* even unallocated inodes are verified */ 485 if (!xfs_dinode_verify(mp, ip, dip)) { 486 xfs_alert(mp, "%s: validation failed for inode %lld failed", 487 __func__, ip->i_ino); 488 489 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); 490 error = -EFSCORRUPTED; 491 goto out_brelse; 492 } 493 494 /* 495 * If the on-disk inode is already linked to a directory 496 * entry, copy all of the inode into the in-core inode. 497 * xfs_iformat_fork() handles copying in the inode format 498 * specific information. 499 * Otherwise, just get the truly permanent information. 500 */ 501 if (dip->di_mode) { 502 xfs_inode_from_disk(ip, dip); 503 error = xfs_iformat_fork(ip, dip); 504 if (error) { 505 #ifdef DEBUG 506 xfs_alert(mp, "%s: xfs_iformat() returned error %d", 507 __func__, error); 508 #endif /* DEBUG */ 509 goto out_brelse; 510 } 511 } else { 512 /* 513 * Partial initialisation of the in-core inode. Just the bits 514 * that xfs_ialloc won't overwrite or relies on being correct. 515 */ 516 ip->i_d.di_version = dip->di_version; 517 VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); 518 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); 519 520 /* 521 * Make sure to pull in the mode here as well in 522 * case the inode is released without being used. 523 * This ensures that xfs_inactive() will see that 524 * the inode is already free and not try to mess 525 * with the uninitialized part of it. 526 */ 527 VFS_I(ip)->i_mode = 0; 528 } 529 530 ASSERT(ip->i_d.di_version >= 2); 531 ip->i_delayed_blks = 0; 532 533 /* 534 * Mark the buffer containing the inode as something to keep 535 * around for a while. This helps to keep recently accessed 536 * meta-data in-core longer. 537 */ 538 xfs_buf_set_ref(bp, XFS_INO_REF); 539 540 /* 541 * Use xfs_trans_brelse() to release the buffer containing the on-disk 542 * inode, because it was acquired with xfs_trans_read_buf() in 543 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal 544 * brelse(). If we're within a transaction, then xfs_trans_brelse() 545 * will only release the buffer if it is not dirty within the 546 * transaction. It will be OK to release the buffer in this case, 547 * because inodes on disk are never destroyed and we will be locking the 548 * new in-core inode before putting it in the cache where other 549 * processes can find it. Thus we don't have to worry about the inode 550 * being changed just because we released the buffer. 551 */ 552 out_brelse: 553 xfs_trans_brelse(tp, bp); 554 return error; 555 } 556