1 /* 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_fs.h" 20 #include "xfs_shared.h" 21 #include "xfs_format.h" 22 #include "xfs_log_format.h" 23 #include "xfs_trans_resv.h" 24 #include "xfs_mount.h" 25 #include "xfs_defer.h" 26 #include "xfs_inode.h" 27 #include "xfs_error.h" 28 #include "xfs_cksum.h" 29 #include "xfs_icache.h" 30 #include "xfs_trans.h" 31 #include "xfs_ialloc.h" 32 33 /* 34 * Check that none of the inode's in the buffer have a next 35 * unlinked field of 0. 36 */ 37 #if defined(DEBUG) 38 void 39 xfs_inobp_check( 40 xfs_mount_t *mp, 41 xfs_buf_t *bp) 42 { 43 int i; 44 int j; 45 xfs_dinode_t *dip; 46 47 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; 48 49 for (i = 0; i < j; i++) { 50 dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); 51 if (!dip->di_next_unlinked) { 52 xfs_alert(mp, 53 "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", 54 i, (long long)bp->b_bn); 55 } 56 } 57 } 58 #endif 59 60 /* 61 * If we are doing readahead on an inode buffer, we might be in log recovery 62 * reading an inode allocation buffer that hasn't yet been replayed, and hence 63 * has not had the inode cores stamped into it. Hence for readahead, the buffer 64 * may be potentially invalid. 65 * 66 * If the readahead buffer is invalid, we need to mark it with an error and 67 * clear the DONE status of the buffer so that a followup read will re-read it 68 * from disk. We don't report the error otherwise to avoid warnings during log 69 * recovery and we don't get unnecssary panics on debug kernels. We use EIO here 70 * because all we want to do is say readahead failed; there is no-one to report 71 * the error to, so this will distinguish it from a non-ra verifier failure. 72 * Changes to this readahead error behavour also need to be reflected in 73 * xfs_dquot_buf_readahead_verify(). 74 */ 75 static void 76 xfs_inode_buf_verify( 77 struct xfs_buf *bp, 78 bool readahead) 79 { 80 struct xfs_mount *mp = bp->b_target->bt_mount; 81 int i; 82 int ni; 83 84 /* 85 * Validate the magic number and version of every inode in the buffer 86 */ 87 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; 88 for (i = 0; i < ni; i++) { 89 int di_ok; 90 xfs_dinode_t *dip; 91 92 dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); 93 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 94 XFS_DINODE_GOOD_VERSION(dip->di_version); 95 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 96 XFS_ERRTAG_ITOBP_INOTOBP, 97 XFS_RANDOM_ITOBP_INOTOBP))) { 98 if (readahead) { 99 bp->b_flags &= ~XBF_DONE; 100 xfs_buf_ioerror(bp, -EIO); 101 return; 102 } 103 104 xfs_buf_ioerror(bp, -EFSCORRUPTED); 105 xfs_verifier_error(bp); 106 #ifdef DEBUG 107 xfs_alert(mp, 108 "bad inode magic/vsn daddr %lld #%d (magic=%x)", 109 (unsigned long long)bp->b_bn, i, 110 be16_to_cpu(dip->di_magic)); 111 #endif 112 } 113 } 114 xfs_inobp_check(mp, bp); 115 } 116 117 118 static void 119 xfs_inode_buf_read_verify( 120 struct xfs_buf *bp) 121 { 122 xfs_inode_buf_verify(bp, false); 123 } 124 125 static void 126 xfs_inode_buf_readahead_verify( 127 struct xfs_buf *bp) 128 { 129 xfs_inode_buf_verify(bp, true); 130 } 131 132 static void 133 xfs_inode_buf_write_verify( 134 struct xfs_buf *bp) 135 { 136 xfs_inode_buf_verify(bp, false); 137 } 138 139 const struct xfs_buf_ops xfs_inode_buf_ops = { 140 .name = "xfs_inode", 141 .verify_read = xfs_inode_buf_read_verify, 142 .verify_write = xfs_inode_buf_write_verify, 143 }; 144 145 const struct xfs_buf_ops xfs_inode_buf_ra_ops = { 146 .name = "xxfs_inode_ra", 147 .verify_read = xfs_inode_buf_readahead_verify, 148 .verify_write = xfs_inode_buf_write_verify, 149 }; 150 151 152 /* 153 * This routine is called to map an inode to the buffer containing the on-disk 154 * version of the inode. It returns a pointer to the buffer containing the 155 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a 156 * pointer to the on-disk inode within that buffer. 157 * 158 * If a non-zero error is returned, then the contents of bpp and dipp are 159 * undefined. 160 */ 161 int 162 xfs_imap_to_bp( 163 struct xfs_mount *mp, 164 struct xfs_trans *tp, 165 struct xfs_imap *imap, 166 struct xfs_dinode **dipp, 167 struct xfs_buf **bpp, 168 uint buf_flags, 169 uint iget_flags) 170 { 171 struct xfs_buf *bp; 172 int error; 173 174 buf_flags |= XBF_UNMAPPED; 175 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 176 (int)imap->im_len, buf_flags, &bp, 177 &xfs_inode_buf_ops); 178 if (error) { 179 if (error == -EAGAIN) { 180 ASSERT(buf_flags & XBF_TRYLOCK); 181 return error; 182 } 183 184 if (error == -EFSCORRUPTED && 185 (iget_flags & XFS_IGET_UNTRUSTED)) 186 return -EINVAL; 187 188 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", 189 __func__, error); 190 return error; 191 } 192 193 *bpp = bp; 194 *dipp = xfs_buf_offset(bp, imap->im_boffset); 195 return 0; 196 } 197 198 void 199 xfs_inode_from_disk( 200 struct xfs_inode *ip, 201 struct xfs_dinode *from) 202 { 203 struct xfs_icdinode *to = &ip->i_d; 204 struct inode *inode = VFS_I(ip); 205 206 207 /* 208 * Convert v1 inodes immediately to v2 inode format as this is the 209 * minimum inode version format we support in the rest of the code. 210 */ 211 to->di_version = from->di_version; 212 if (to->di_version == 1) { 213 set_nlink(inode, be16_to_cpu(from->di_onlink)); 214 to->di_projid_lo = 0; 215 to->di_projid_hi = 0; 216 to->di_version = 2; 217 } else { 218 set_nlink(inode, be32_to_cpu(from->di_nlink)); 219 to->di_projid_lo = be16_to_cpu(from->di_projid_lo); 220 to->di_projid_hi = be16_to_cpu(from->di_projid_hi); 221 } 222 223 to->di_format = from->di_format; 224 to->di_uid = be32_to_cpu(from->di_uid); 225 to->di_gid = be32_to_cpu(from->di_gid); 226 to->di_flushiter = be16_to_cpu(from->di_flushiter); 227 228 /* 229 * Time is signed, so need to convert to signed 32 bit before 230 * storing in inode timestamp which may be 64 bit. Otherwise 231 * a time before epoch is converted to a time long after epoch 232 * on 64 bit systems. 233 */ 234 inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec); 235 inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec); 236 inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec); 237 inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); 238 inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); 239 inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); 240 inode->i_generation = be32_to_cpu(from->di_gen); 241 inode->i_mode = be16_to_cpu(from->di_mode); 242 243 to->di_size = be64_to_cpu(from->di_size); 244 to->di_nblocks = be64_to_cpu(from->di_nblocks); 245 to->di_extsize = be32_to_cpu(from->di_extsize); 246 to->di_nextents = be32_to_cpu(from->di_nextents); 247 to->di_anextents = be16_to_cpu(from->di_anextents); 248 to->di_forkoff = from->di_forkoff; 249 to->di_aformat = from->di_aformat; 250 to->di_dmevmask = be32_to_cpu(from->di_dmevmask); 251 to->di_dmstate = be16_to_cpu(from->di_dmstate); 252 to->di_flags = be16_to_cpu(from->di_flags); 253 254 if (to->di_version == 3) { 255 inode->i_version = be64_to_cpu(from->di_changecount); 256 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); 257 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); 258 to->di_flags2 = be64_to_cpu(from->di_flags2); 259 } 260 } 261 262 void 263 xfs_inode_to_disk( 264 struct xfs_inode *ip, 265 struct xfs_dinode *to, 266 xfs_lsn_t lsn) 267 { 268 struct xfs_icdinode *from = &ip->i_d; 269 struct inode *inode = VFS_I(ip); 270 271 to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 272 to->di_onlink = 0; 273 274 to->di_version = from->di_version; 275 to->di_format = from->di_format; 276 to->di_uid = cpu_to_be32(from->di_uid); 277 to->di_gid = cpu_to_be32(from->di_gid); 278 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 279 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 280 281 memset(to->di_pad, 0, sizeof(to->di_pad)); 282 to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec); 283 to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec); 284 to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec); 285 to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); 286 to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec); 287 to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); 288 to->di_nlink = cpu_to_be32(inode->i_nlink); 289 to->di_gen = cpu_to_be32(inode->i_generation); 290 to->di_mode = cpu_to_be16(inode->i_mode); 291 292 to->di_size = cpu_to_be64(from->di_size); 293 to->di_nblocks = cpu_to_be64(from->di_nblocks); 294 to->di_extsize = cpu_to_be32(from->di_extsize); 295 to->di_nextents = cpu_to_be32(from->di_nextents); 296 to->di_anextents = cpu_to_be16(from->di_anextents); 297 to->di_forkoff = from->di_forkoff; 298 to->di_aformat = from->di_aformat; 299 to->di_dmevmask = cpu_to_be32(from->di_dmevmask); 300 to->di_dmstate = cpu_to_be16(from->di_dmstate); 301 to->di_flags = cpu_to_be16(from->di_flags); 302 303 if (from->di_version == 3) { 304 to->di_changecount = cpu_to_be64(inode->i_version); 305 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); 306 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); 307 to->di_flags2 = cpu_to_be64(from->di_flags2); 308 309 to->di_ino = cpu_to_be64(ip->i_ino); 310 to->di_lsn = cpu_to_be64(lsn); 311 memset(to->di_pad2, 0, sizeof(to->di_pad2)); 312 uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); 313 to->di_flushiter = 0; 314 } else { 315 to->di_flushiter = cpu_to_be16(from->di_flushiter); 316 } 317 } 318 319 void 320 xfs_log_dinode_to_disk( 321 struct xfs_log_dinode *from, 322 struct xfs_dinode *to) 323 { 324 to->di_magic = cpu_to_be16(from->di_magic); 325 to->di_mode = cpu_to_be16(from->di_mode); 326 to->di_version = from->di_version; 327 to->di_format = from->di_format; 328 to->di_onlink = 0; 329 to->di_uid = cpu_to_be32(from->di_uid); 330 to->di_gid = cpu_to_be32(from->di_gid); 331 to->di_nlink = cpu_to_be32(from->di_nlink); 332 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 333 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 334 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 335 336 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 337 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); 338 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); 339 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); 340 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); 341 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); 342 343 to->di_size = cpu_to_be64(from->di_size); 344 to->di_nblocks = cpu_to_be64(from->di_nblocks); 345 to->di_extsize = cpu_to_be32(from->di_extsize); 346 to->di_nextents = cpu_to_be32(from->di_nextents); 347 to->di_anextents = cpu_to_be16(from->di_anextents); 348 to->di_forkoff = from->di_forkoff; 349 to->di_aformat = from->di_aformat; 350 to->di_dmevmask = cpu_to_be32(from->di_dmevmask); 351 to->di_dmstate = cpu_to_be16(from->di_dmstate); 352 to->di_flags = cpu_to_be16(from->di_flags); 353 to->di_gen = cpu_to_be32(from->di_gen); 354 355 if (from->di_version == 3) { 356 to->di_changecount = cpu_to_be64(from->di_changecount); 357 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); 358 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); 359 to->di_flags2 = cpu_to_be64(from->di_flags2); 360 to->di_ino = cpu_to_be64(from->di_ino); 361 to->di_lsn = cpu_to_be64(from->di_lsn); 362 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); 363 uuid_copy(&to->di_uuid, &from->di_uuid); 364 to->di_flushiter = 0; 365 } else { 366 to->di_flushiter = cpu_to_be16(from->di_flushiter); 367 } 368 } 369 370 static bool 371 xfs_dinode_verify( 372 struct xfs_mount *mp, 373 struct xfs_inode *ip, 374 struct xfs_dinode *dip) 375 { 376 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) 377 return false; 378 379 /* only version 3 or greater inodes are extensively verified here */ 380 if (dip->di_version < 3) 381 return true; 382 383 if (!xfs_sb_version_hascrc(&mp->m_sb)) 384 return false; 385 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 386 XFS_DINODE_CRC_OFF)) 387 return false; 388 if (be64_to_cpu(dip->di_ino) != ip->i_ino) 389 return false; 390 if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) 391 return false; 392 return true; 393 } 394 395 void 396 xfs_dinode_calc_crc( 397 struct xfs_mount *mp, 398 struct xfs_dinode *dip) 399 { 400 __uint32_t crc; 401 402 if (dip->di_version < 3) 403 return; 404 405 ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); 406 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, 407 XFS_DINODE_CRC_OFF); 408 dip->di_crc = xfs_end_cksum(crc); 409 } 410 411 /* 412 * Read the disk inode attributes into the in-core inode structure. 413 * 414 * For version 5 superblocks, if we are initialising a new inode and we are not 415 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new 416 * inode core with a random generation number. If we are keeping inodes around, 417 * we need to read the inode cluster to get the existing generation number off 418 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode 419 * format) then log recovery is dependent on the di_flushiter field being 420 * initialised from the current on-disk value and hence we must also read the 421 * inode off disk. 422 */ 423 int 424 xfs_iread( 425 xfs_mount_t *mp, 426 xfs_trans_t *tp, 427 xfs_inode_t *ip, 428 uint iget_flags) 429 { 430 xfs_buf_t *bp; 431 xfs_dinode_t *dip; 432 int error; 433 434 /* 435 * Fill in the location information in the in-core inode. 436 */ 437 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); 438 if (error) 439 return error; 440 441 /* shortcut IO on inode allocation if possible */ 442 if ((iget_flags & XFS_IGET_CREATE) && 443 xfs_sb_version_hascrc(&mp->m_sb) && 444 !(mp->m_flags & XFS_MOUNT_IKEEP)) { 445 /* initialise the on-disk inode core */ 446 memset(&ip->i_d, 0, sizeof(ip->i_d)); 447 VFS_I(ip)->i_generation = prandom_u32(); 448 if (xfs_sb_version_hascrc(&mp->m_sb)) 449 ip->i_d.di_version = 3; 450 else 451 ip->i_d.di_version = 2; 452 return 0; 453 } 454 455 /* 456 * Get pointers to the on-disk inode and the buffer containing it. 457 */ 458 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); 459 if (error) 460 return error; 461 462 /* even unallocated inodes are verified */ 463 if (!xfs_dinode_verify(mp, ip, dip)) { 464 xfs_alert(mp, "%s: validation failed for inode %lld failed", 465 __func__, ip->i_ino); 466 467 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); 468 error = -EFSCORRUPTED; 469 goto out_brelse; 470 } 471 472 /* 473 * If the on-disk inode is already linked to a directory 474 * entry, copy all of the inode into the in-core inode. 475 * xfs_iformat_fork() handles copying in the inode format 476 * specific information. 477 * Otherwise, just get the truly permanent information. 478 */ 479 if (dip->di_mode) { 480 xfs_inode_from_disk(ip, dip); 481 error = xfs_iformat_fork(ip, dip); 482 if (error) { 483 #ifdef DEBUG 484 xfs_alert(mp, "%s: xfs_iformat() returned error %d", 485 __func__, error); 486 #endif /* DEBUG */ 487 goto out_brelse; 488 } 489 } else { 490 /* 491 * Partial initialisation of the in-core inode. Just the bits 492 * that xfs_ialloc won't overwrite or relies on being correct. 493 */ 494 ip->i_d.di_version = dip->di_version; 495 VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); 496 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); 497 498 /* 499 * Make sure to pull in the mode here as well in 500 * case the inode is released without being used. 501 * This ensures that xfs_inactive() will see that 502 * the inode is already free and not try to mess 503 * with the uninitialized part of it. 504 */ 505 VFS_I(ip)->i_mode = 0; 506 } 507 508 ASSERT(ip->i_d.di_version >= 2); 509 ip->i_delayed_blks = 0; 510 511 /* 512 * Mark the buffer containing the inode as something to keep 513 * around for a while. This helps to keep recently accessed 514 * meta-data in-core longer. 515 */ 516 xfs_buf_set_ref(bp, XFS_INO_REF); 517 518 /* 519 * Use xfs_trans_brelse() to release the buffer containing the on-disk 520 * inode, because it was acquired with xfs_trans_read_buf() in 521 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal 522 * brelse(). If we're within a transaction, then xfs_trans_brelse() 523 * will only release the buffer if it is not dirty within the 524 * transaction. It will be OK to release the buffer in this case, 525 * because inodes on disk are never destroyed and we will be locking the 526 * new in-core inode before putting it in the cache where other 527 * processes can find it. Thus we don't have to worry about the inode 528 * being changed just because we released the buffer. 529 */ 530 out_brelse: 531 xfs_trans_brelse(tp, bp); 532 return error; 533 } 534