/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/log2.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"

kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define	XFS_ITRUNC_MAX_EXTENTS	2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);

#ifdef DEBUG
/*
 * Make sure that the extents in the given memory buffer
 * are valid.
 */
STATIC void
xfs_validate_extents(
	xfs_ifork_t		*ifp,
	int			nrecs,
	xfs_exntfmt_t		fmt)
{
	xfs_bmbt_irec_t		irec;
	xfs_bmbt_rec_host_t	rec;
	int			i;

	for (i = 0; i < nrecs; i++) {
		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
		rec.l0 = get_unaligned(&ep->l0);
		rec.l1 = get_unaligned(&ep->l1);
		xfs_bmbt_get_all(&rec, &irec);
		if (fmt == XFS_EXTFMT_NOSTATE)
			ASSERT(irec.br_state == XFS_EXT_NORM);
	}
}
#else /* DEBUG */
#define xfs_validate_extents(ifp, nrecs, fmt)
#endif /* DEBUG */

/*
 * Check that none of the inodes in the buffer have a next
 * unlinked field of 0.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
	xfs_mount_t	*mp,
	xfs_buf_t	*bp)
{
	int		i;
	int		j;
	xfs_dinode_t	*dip;

	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

	for (i = 0; i < j; i++) {
		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					i * mp->m_sb.sb_inodesize);
		if (!dip->di_next_unlinked)  {
			xfs_alert(mp,
	"Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
				bp);
			ASSERT(dip->di_next_unlinked);
		}
	}
}
#endif

/*
 * Find the buffer associated with the given inode map.
 * We do basic validation checks on the buffer once it has been
 * retrieved from disk.
 */
STATIC int
xfs_imap_to_bp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	struct xfs_imap	*imap,
	xfs_buf_t	**bpp,
	uint		buf_flags,
	uint		iget_flags)
{
	int		error;
	int		i;
	int		ni;
	xfs_buf_t	*bp;

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
				   (int)imap->im_len, buf_flags, &bp);
	if (error) {
		if (error != EAGAIN) {
			xfs_warn(mp,
				"%s: xfs_trans_read_buf() returned error %d.",
				__func__, error);
		} else {
			ASSERT(buf_flags & XBF_TRYLOCK);
		}
		return error;
	}

	/*
	 * Validate the magic number and version of every inode in the buffer
	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
	 */
#ifdef DEBUG
	ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
#else	/* usual case */
	ni = 1;
#endif

	for (i = 0; i < ni; i++) {
		int		di_ok;
		xfs_dinode_t	*dip;

		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					(i << mp->m_sb.sb_inodelog));
		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
			    XFS_DINODE_GOOD_VERSION(dip->di_version);
		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
						XFS_ERRTAG_ITOBP_INOTOBP,
						XFS_RANDOM_ITOBP_INOTOBP))) {
			if (iget_flags & XFS_IGET_UNTRUSTED) {
				xfs_trans_brelse(tp, bp);
				return XFS_ERROR(EINVAL);
			}
			XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
						XFS_ERRLEVEL_HIGH, mp, dip);
#ifdef DEBUG
			xfs_emerg(mp,
				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
				(unsigned long long)imap->im_blkno, i,
				be16_to_cpu(dip->di_magic));
			ASSERT(0);
#endif
			xfs_trans_brelse(tp, bp);
			return XFS_ERROR(EFSCORRUPTED);
		}
	}

	xfs_inobp_check(mp, bp);
	*bpp = bp;
	return 0;
}

/*
 * This routine is called to map an inode number within a file
 * system to the buffer containing the on-disk version of the
 * inode.  It returns a pointer to the buffer containing the
 * on-disk inode in the bpp parameter, and in the dip parameter
 * it returns a pointer to the on-disk inode within that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
 *
 * Use xfs_imap() to determine the size and location of the
 * buffer to read from disk.
 */
int
xfs_inotobp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	xfs_dinode_t	**dipp,
	xfs_buf_t	**bpp,
	int		*offset,
	uint		imap_flags)
{
	struct xfs_imap	imap;
	xfs_buf_t	*bp;
	int		error;

	imap.im_blkno = 0;
	error = xfs_imap(mp, tp, ino, &imap, imap_flags);
	if (error)
		return error;

	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
	if (error)
		return error;

	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
	*bpp = bp;
	*offset = imap.im_boffset;
	return 0;
}
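
/*
 * Editor's note -- an illustrative sketch, not part of the original code.
 * A typical read-only caller resolves an inode number to its on-disk
 * image roughly as follows (variable names are hypothetical and error
 * handling abbreviated):
 *
 *	xfs_dinode_t	*dip;
 *	xfs_buf_t	*bp;
 *	int		offset;
 *
 *	error = xfs_inotobp(mp, tp, ino, &dip, &bp, &offset, 0);
 *	if (error)
 *		return error;
 *	... inspect dip->di_mode, dip->di_next_unlinked, etc. ...
 *	xfs_trans_brelse(tp, bp);	release the buffer when done
 */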

/*
 * This routine is called to map an inode to the buffer containing
 * the on-disk version of the inode.  It returns a pointer to the
 * buffer containing the on-disk inode in the bpp parameter, and in
 * the dip parameter it returns a pointer to the on-disk inode within
 * that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
 *
 * The inode is expected to have already been mapped to its buffer and read
 * in once, thus we can use the mapping information stored in the inode
 * rather than calling xfs_imap().  This allows us to avoid the overhead
 * of looking at the inode btree for small block file systems
 * (see xfs_imap()).
 */
int
xfs_itobp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	xfs_dinode_t	**dipp,
	xfs_buf_t	**bpp,
	uint		buf_flags)
{
	xfs_buf_t	*bp;
	int		error;

	ASSERT(ip->i_imap.im_blkno != 0);

	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
	if (error)
		return error;

	if (!bp) {
		ASSERT(buf_flags & XBF_TRYLOCK);
		ASSERT(tp == NULL);
		*bpp = NULL;
		return EAGAIN;
	}

	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
	*bpp = bp;
	return 0;
}

/*
 * Move inode type and inode format specific information from the
 * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 * this means set if_rdev to the proper value.  For files, directories,
 * and symlinks this means to bring in the in-line data or extent
 * pointers.  For a file in B-tree format, only the root is immediately
 * brought in-core.  The rest will be in-lined in if_extents when it
 * is first referenced (see xfs_iread_extents()).
 */
STATIC int
xfs_iformat(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip)
{
	xfs_attr_shortform_t	*atp;
	int			size;
	int			error;
	xfs_fsize_t		di_size;
	ip->i_df.if_ext_max =
		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	error = 0;

	if (unlikely(be32_to_cpu(dip->di_nextents) +
		     be16_to_cpu(dip->di_anextents) >
		     be64_to_cpu(dip->di_nblocks))) {
		xfs_warn(ip->i_mount,
			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
			(unsigned long long)ip->i_ino,
			(int)(be32_to_cpu(dip->di_nextents) +
			      be16_to_cpu(dip->di_anextents)),
			(unsigned long long)
				be64_to_cpu(dip->di_nblocks));
		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
		xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
			(unsigned long long)ip->i_ino,
			dip->di_forkoff);
		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
		     !ip->i_mount->m_rtdev_targp)) {
		xfs_warn(ip->i_mount,
			"corrupt dinode %Lu, has realtime flag set.",
			ip->i_ino);
		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	switch (ip->i_d.di_mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
					     ip->i_mount, dip);
			return XFS_ERROR(EFSCORRUPTED);
		}
		ip->i_d.di_size = 0;
		ip->i_size = 0;
		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
		break;

	case S_IFREG:
	case S_IFLNK:
	case S_IFDIR:
		switch (dip->di_format) {
		case XFS_DINODE_FMT_LOCAL:
			/*
			 * no local regular files yet
			 */
			if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
				xfs_warn(ip->i_mount,
			"corrupt inode %Lu (local format for regular file).",
					(unsigned long long) ip->i_ino);
				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			di_size = be64_to_cpu(dip->di_size);
			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
				xfs_warn(ip->i_mount,
			"corrupt inode %Lu (bad size %Ld for local inode).",
					(unsigned long long) ip->i_ino,
					(long long) di_size);
				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			size = (int)di_size;
			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
			break;
		case XFS_DINODE_FMT_EXTENTS:
			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
			break;
		case XFS_DINODE_FMT_BTREE:
			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
			break;
		default:
			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
					 ip->i_mount);
			return XFS_ERROR(EFSCORRUPTED);
		}
		break;

	default:
		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	if (error) {
		return error;
	}
	if (!XFS_DFORK_Q(dip))
		return 0;
	ASSERT(ip->i_afp == NULL);
	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
	ip->i_afp->if_ext_max =
		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	switch (dip->di_aformat) {
	case XFS_DINODE_FMT_LOCAL:
		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
		size = be16_to_cpu(atp->hdr.totsize);

		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
			xfs_warn(ip->i_mount,
				"corrupt inode %Lu (bad attr fork size %Ld).",
				(unsigned long long) ip->i_ino,
				(long long) size);
			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
					     XFS_ERRLEVEL_LOW,
					     ip->i_mount, dip);
			return XFS_ERROR(EFSCORRUPTED);
		}

		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
		break;
	case XFS_DINODE_FMT_BTREE:
		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
		break;
	default:
		error = XFS_ERROR(EFSCORRUPTED);
		break;
	}
	if (error) {
		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
		ip->i_afp = NULL;
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
	}
	return error;
}
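
/*
 * Editor's note -- a worked example of the if_ext_max calculation above;
 * the concrete numbers are an assumption for illustration, not taken from
 * this file.  With 256 byte inodes the data fork literal area is roughly
 * 156 bytes, so XFS_IFORK_DSIZE() / sizeof(xfs_bmbt_rec_t) (16 bytes per
 * packed extent record) allows about 9 extents in-inode before the data
 * fork has to be converted to btree format.
 */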

/*
 * The file is in-lined in the on-disk inode.
 * If it fits into if_inline_data, then copy
 * it there, otherwise allocate a buffer for it
 * and copy the data there.  Either way, set
 * if_data to point at the data.
 * If we allocate a buffer for the data, make
 * sure that its size is a multiple of 4 and
 * record the real size in i_real_bytes.
 */
STATIC int
xfs_iformat_local(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork,
	int		size)
{
	xfs_ifork_t	*ifp;
	int		real_size;

	/*
	 * If the size is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_warn(ip->i_mount,
	"corrupt inode %Lu (bad size %d for local fork, size = %d).",
			(unsigned long long) ip->i_ino, size,
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}
	ifp = XFS_IFORK_PTR(ip, whichfork);
	real_size = 0;
	if (size == 0)
		ifp->if_u1.if_data = NULL;
	else if (size <= sizeof(ifp->if_u2.if_inline_data))
		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
	else {
		real_size = roundup(size, 4);
		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
	}
	ifp->if_bytes = size;
	ifp->if_real_bytes = real_size;
	if (size)
		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFINLINE;
	return 0;
}

/*
 * The file consists of a set of extents all
 * of which fit into the on-disk inode.
 * If there are few enough extents to fit into
 * the if_inline_ext, then copy them there.
 * Otherwise allocate a buffer for them and copy
 * them into it.  Either way, set if_extents
 * to point at the extents.
 */
STATIC int
xfs_iformat_extents(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork)
{
	xfs_bmbt_rec_t	*dp;
	xfs_ifork_t	*ifp;
	int		nex;
	int		size;
	int		i;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
	size = nex * (uint)sizeof(xfs_bmbt_rec_t);

	/*
	 * If the number of extents is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
			(unsigned long long) ip->i_ino, nex);
		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_real_bytes = 0;
	if (nex == 0)
		ifp->if_u1.if_extents = NULL;
	else if (nex <= XFS_INLINE_EXTS)
		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
	else
		xfs_iext_add(ifp, 0, nex);

	ifp->if_bytes = size;
	if (size) {
		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
		for (i = 0; i < nex; i++, dp++) {
			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
			ep->l0 = get_unaligned_be64(&dp->l0);
			ep->l1 = get_unaligned_be64(&dp->l1);
		}
		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
		if (whichfork != XFS_DATA_FORK ||
			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
				if (unlikely(xfs_check_nostate_extents(
				    ifp, 0, nex))) {
					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
							 XFS_ERRLEVEL_LOW,
							 ip->i_mount);
					return XFS_ERROR(EFSCORRUPTED);
				}
	}
	ifp->if_flags |= XFS_IFEXTENTS;
	return 0;
}
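
/*
 * Editor's note on the extent copy loop above: each on-disk extent is a
 * 128-bit record carried as two 64-bit big-endian words (l0/l1) that pack
 * the file offset, start block, block count and extent state.  The incore
 * xfs_bmbt_rec_host_t keeps the same packing in CPU byte order, which is
 * why the loop only byte-swaps with get_unaligned_be64() and leaves the
 * decoding to helpers such as xfs_bmbt_get_all() (as used by
 * xfs_validate_extents() earlier in this file).
 */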

/*
 * The file has too many extents to fit into
 * the inode, so they are in B-tree format.
 * Allocate a buffer for the root of the B-tree
 * and copy the root into it.  The i_extents
 * field will remain NULL until all of the
 * extents are read in (when they are needed).
 */
STATIC int
xfs_iformat_btree(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip,
	int			whichfork)
{
	xfs_bmdr_block_t	*dfp;
	xfs_ifork_t		*ifp;
	/* REFERENCED */
	int			nrecs;
	int			size;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
	size = XFS_BMAP_BROOT_SPACE(dfp);
	nrecs = be16_to_cpu(dfp->bb_numrecs);

	/*
	 * blow out if -- fork has fewer extents than can fit in
	 * fork (fork shouldn't be a btree format), root btree
	 * block has more records than can fit into the fork,
	 * or the number of extents is greater than the number of
	 * blocks.
	 */
	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
	    || XFS_BMDR_SPACE_CALC(nrecs) >
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
	    || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
		xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
			(unsigned long long) ip->i_ino);
		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_broot_bytes = size;
	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
	ASSERT(ifp->if_broot != NULL);
	/*
	 * Copy and convert from the on-disk structure
	 * to the in-memory structure.
	 */
	xfs_bmdr_to_bmbt(ip->i_mount, dfp,
			 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
			 ifp->if_broot, size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFBROOT;

	return 0;
}

STATIC void
xfs_dinode_from_disk(
	xfs_icdinode_t	*to,
	xfs_dinode_t	*from)
{
	to->di_magic = be16_to_cpu(from->di_magic);
	to->di_mode = be16_to_cpu(from->di_mode);
	to->di_version = from->di_version;
	to->di_format = from->di_format;
	to->di_onlink = be16_to_cpu(from->di_onlink);
	to->di_uid = be32_to_cpu(from->di_uid);
	to->di_gid = be32_to_cpu(from->di_gid);
	to->di_nlink = be32_to_cpu(from->di_nlink);
	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = be16_to_cpu(from->di_flushiter);
	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
	to->di_size = be64_to_cpu(from->di_size);
	to->di_nblocks = be64_to_cpu(from->di_nblocks);
	to->di_extsize = be32_to_cpu(from->di_extsize);
	to->di_nextents = be32_to_cpu(from->di_nextents);
	to->di_anextents = be16_to_cpu(from->di_anextents);
	to->di_forkoff = from->di_forkoff;
	to->di_aformat = from->di_aformat;
	to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
	to->di_dmstate = be16_to_cpu(from->di_dmstate);
	to->di_flags = be16_to_cpu(from->di_flags);
	to->di_gen = be32_to_cpu(from->di_gen);
}

void
xfs_dinode_to_disk(
	xfs_dinode_t	*to,
	xfs_icdinode_t	*from)
{
	to->di_magic = cpu_to_be16(from->di_magic);
	to->di_mode = cpu_to_be16(from->di_mode);
	to->di_version = from->di_version;
	to->di_format = from->di_format;
	to->di_onlink = cpu_to_be16(from->di_onlink);
	to->di_uid = cpu_to_be32(from->di_uid);
	to->di_gid = cpu_to_be32(from->di_gid);
	to->di_nlink = cpu_to_be32(from->di_nlink);
	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = cpu_to_be16(from->di_flushiter);
	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
	to->di_size = cpu_to_be64(from->di_size);
	to->di_nblocks = cpu_to_be64(from->di_nblocks);
	to->di_extsize = cpu_to_be32(from->di_extsize);
	to->di_nextents = cpu_to_be32(from->di_nextents);
	to->di_anextents = cpu_to_be16(from->di_anextents);
	to->di_forkoff = from->di_forkoff;
	to->di_aformat = from->di_aformat;
	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
	to->di_dmstate = cpu_to_be16(from->di_dmstate);
	to->di_flags = cpu_to_be16(from->di_flags);
	to->di_gen = cpu_to_be32(from->di_gen);
}

STATIC uint
_xfs_dic2xflags(
	__uint16_t		di_flags)
{
	uint			flags = 0;

	if (di_flags & XFS_DIFLAG_ANY) {
		if (di_flags & XFS_DIFLAG_REALTIME)
			flags |= XFS_XFLAG_REALTIME;
		if (di_flags & XFS_DIFLAG_PREALLOC)
			flags |= XFS_XFLAG_PREALLOC;
		if (di_flags & XFS_DIFLAG_IMMUTABLE)
			flags |= XFS_XFLAG_IMMUTABLE;
		if (di_flags & XFS_DIFLAG_APPEND)
			flags |= XFS_XFLAG_APPEND;
		if (di_flags & XFS_DIFLAG_SYNC)
			flags |= XFS_XFLAG_SYNC;
		if (di_flags & XFS_DIFLAG_NOATIME)
			flags |= XFS_XFLAG_NOATIME;
		if (di_flags & XFS_DIFLAG_NODUMP)
			flags |= XFS_XFLAG_NODUMP;
		if (di_flags & XFS_DIFLAG_RTINHERIT)
			flags |= XFS_XFLAG_RTINHERIT;
		if (di_flags & XFS_DIFLAG_PROJINHERIT)
			flags |= XFS_XFLAG_PROJINHERIT;
		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
			flags |= XFS_XFLAG_NOSYMLINKS;
		if (di_flags & XFS_DIFLAG_EXTSIZE)
			flags |= XFS_XFLAG_EXTSIZE;
		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= XFS_XFLAG_EXTSZINHERIT;
		if (di_flags & XFS_DIFLAG_NODEFRAG)
			flags |= XFS_XFLAG_NODEFRAG;
		if (di_flags & XFS_DIFLAG_FILESTREAM)
			flags |= XFS_XFLAG_FILESTREAM;
	}

	return flags;
}

uint
xfs_ip2xflags(
	xfs_inode_t		*ip)
{
	xfs_icdinode_t		*dic = &ip->i_d;

	return _xfs_dic2xflags(dic->di_flags) |
				(XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
	xfs_dinode_t		*dip)
{
	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}
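
/*
 * Editor's illustration (not original code): the two helpers above give
 * the same XFS_XFLAG_* view of an inode whether you start from the incore
 * or the on-disk form, e.g.
 *
 *	if (xfs_ip2xflags(ip) & XFS_XFLAG_REALTIME)
 *		... file data lives on the realtime device ...
 *
 * with xfs_dic2xflags(dip) as the equivalent for a raw on-disk inode.
 */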

/*
 * Read the disk inode attributes into the in-core inode structure.
 */
int
xfs_iread(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	uint		iget_flags)
{
	xfs_buf_t	*bp;
	xfs_dinode_t	*dip;
	int		error;

	/*
	 * Fill in the location information in the in-core inode.
	 */
	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
	if (error)
		return error;

	/*
	 * Get pointers to the on-disk inode and the buffer containing it.
	 */
	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
			       XBF_LOCK, iget_flags);
	if (error)
		return error;
	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);

	/*
	 * If we got something that isn't an inode it means someone
	 * (nfs or dmi) has a stale handle.
	 */
	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
#ifdef DEBUG
		xfs_alert(mp,
			"%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
			__func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
#endif /* DEBUG */
		error = XFS_ERROR(EINVAL);
		goto out_brelse;
	}

	/*
	 * If the on-disk inode is already linked to a directory
	 * entry, copy all of the inode into the in-core inode.
	 * xfs_iformat() handles copying in the inode format
	 * specific information.
	 * Otherwise, just get the truly permanent information.
	 */
	if (dip->di_mode) {
		xfs_dinode_from_disk(&ip->i_d, dip);
		error = xfs_iformat(ip, dip);
		if (error)  {
#ifdef DEBUG
			xfs_alert(mp, "%s: xfs_iformat() returned error %d",
				__func__, error);
#endif /* DEBUG */
			goto out_brelse;
		}
	} else {
		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
		ip->i_d.di_version = dip->di_version;
		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
		/*
		 * Make sure to pull in the mode here as well in
		 * case the inode is released without being used.
		 * This ensures that xfs_inactive() will see that
		 * the inode is already free and not try to mess
		 * with the uninitialized part of it.
		 */
		ip->i_d.di_mode = 0;
		/*
		 * Initialize the per-fork minima and maxima for a new
		 * inode here.  xfs_iformat will do it for old inodes.
		 */
		ip->i_df.if_ext_max =
			XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	}

	/*
	 * The inode format changed when we moved the link count and
	 * made it 32 bits long.  If this is an old format inode,
	 * convert it in memory to look like a new one.  If it gets
	 * flushed to disk we will convert back before flushing or
	 * logging it.  We zero out the new projid field and the old link
	 * count field.  We'll handle clearing the pad field (the remains
	 * of the old uuid field) when we actually convert the inode to
	 * the new format. We don't change the version number so that we
	 * can distinguish this from a real new format inode.
	 */
	if (ip->i_d.di_version == 1) {
		ip->i_d.di_nlink = ip->i_d.di_onlink;
		ip->i_d.di_onlink = 0;
		xfs_set_projid(ip, 0);
	}

	ip->i_delayed_blks = 0;
	ip->i_size = ip->i_d.di_size;

	/*
	 * Mark the buffer containing the inode as something to keep
	 * around for a while.  This helps to keep recently accessed
	 * meta-data in-core longer.
	 */
	xfs_buf_set_ref(bp, XFS_INO_REF);

	/*
	 * Use xfs_trans_brelse() to release the buffer containing the
	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
	 * in xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
	 * will only release the buffer if it is not dirty within the
	 * transaction.  It will be OK to release the buffer in this case,
	 * because inodes on disk are never destroyed and we will be
	 * locking the new in-core inode before putting it in the hash
	 * table where other processes can find it.  Thus we don't have
	 * to worry about the inode being changed just because we released
	 * the buffer.
	 */
 out_brelse:
	xfs_trans_brelse(tp, bp);
	return error;
}
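
/*
 * Editor's note on the version 1 handling above: a v1 inode stores its
 * link count on disk in the 16-bit di_onlink field, while the incore
 * inode always works with the 32-bit di_nlink.  The copy here is purely
 * incore; di_version deliberately stays at 1 so the flush path knows to
 * convert back, and xfs_bump_ino_vers2() performs the permanent upgrade
 * when one is actually needed.
 */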

/*
 * Read in extents from a btree-format inode.
 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 */
int
xfs_iread_extents(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	int		whichfork)
{
	int		error;
	xfs_ifork_t	*ifp;
	xfs_extnum_t	nextents;

	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
				 ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
	ifp = XFS_IFORK_PTR(ip, whichfork);

	/*
	 * We know that the size is valid (it's checked in iformat_btree)
	 */
	ifp->if_bytes = ifp->if_real_bytes = 0;
	ifp->if_flags |= XFS_IFEXTENTS;
	xfs_iext_add(ifp, 0, nextents);
	error = xfs_bmap_read_extents(tp, ip, whichfork);
	if (error) {
		xfs_iext_destroy(ifp);
		ifp->if_flags &= ~XFS_IFEXTENTS;
		return error;
	}
	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
	return 0;
}

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget()
 * to obtain the in-core version of the allocated inode.  Finally,
 * fill in the inode and log its initial contents.  In this case,
 * ialloc_context would be set to NULL and call_again set to false.
 *
 * If xfs_dialloc() does not have an available inode,
 * it will replenish its supply by doing an allocation. Since we can
 * only do one allocation within a transaction without deadlocks, we
 * must commit the current transaction before returning the inode itself.
 * In this case, therefore, we will set call_again to true and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
int
xfs_ialloc(
	xfs_trans_t	*tp,
	xfs_inode_t	*pip,
	mode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	prid_t		prid,
	int		okalloc,
	xfs_buf_t	**ialloc_context,
	boolean_t	*call_again,
	xfs_inode_t	**ipp)
{
	xfs_ino_t	ino;
	xfs_inode_t	*ip;
	uint		flags;
	int		error;
	timespec_t	tv;
	int		filestreams = 0;

	/*
	 * Call the space management code to pick
	 * the on-disk inode to be allocated.
	 */
	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
			    ialloc_context, call_again, &ino);
	if (error)
		return error;
	if (*call_again || ino == NULLFSINO) {
		*ipp = NULL;
		return 0;
	}
	ASSERT(*ialloc_context == NULL);

	/*
	 * Get the in-core inode with the lock held exclusively.
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
			 XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	ASSERT(ip != NULL);

	ip->i_d.di_mode = (__uint16_t)mode;
	ip->i_d.di_onlink = 0;
	ip->i_d.di_nlink = nlink;
	ASSERT(ip->i_d.di_nlink == nlink);
	ip->i_d.di_uid = current_fsuid();
	ip->i_d.di_gid = current_fsgid();
	xfs_set_projid(ip, prid);
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

	/*
	 * If the superblock version is up to where we support new format
	 * inodes and this is currently an old format inode, then change
	 * the inode version number now.  This way we only do the conversion
	 * here rather than here and in the flush/logging code.
	 */
	if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
	    ip->i_d.di_version == 1) {
		ip->i_d.di_version = 2;
		/*
		 * We've already zeroed the old link count, the projid field,
		 * and the pad field.
		 */
	}

	/*
	 * Project ids won't be stored on disk if we are using a version 1 inode.
	 */
	if ((prid != 0) && (ip->i_d.di_version == 1))
		xfs_bump_ino_vers2(tp, ip);

	if (pip && XFS_INHERIT_GID(pip)) {
		ip->i_d.di_gid = pip->i_d.di_gid;
		if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
			ip->i_d.di_mode |= S_ISGID;
		}
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if ((irix_sgid_inherit) &&
	    (ip->i_d.di_mode & S_ISGID) &&
	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
		ip->i_d.di_mode &= ~S_ISGID;
	}

	ip->i_d.di_size = 0;
	ip->i_size = 0;
	ip->i_d.di_nextents = 0;
	ASSERT(ip->i_d.di_nblocks == 0);

	nanotime(&tv);
	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
	ip->i_d.di_atime = ip->i_d.di_mtime;
	ip->i_d.di_ctime = ip->i_d.di_mtime;

	/*
	 * di_gen will have been taken care of in xfs_iread.
	 */
	ip->i_d.di_extsize = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;
	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
		ip->i_df.if_u2.if_rdev = rdev;
		ip->i_df.if_flags = 0;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
		/*
		 * we can't set up filestreams until after the VFS inode
		 * is set up properly.
		 */
		if (pip && xfs_inode_is_filestream(pip))
			filestreams = 1;
		/* fall through */
	case S_IFDIR:
		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
			uint	di_flags = 0;

			if (S_ISDIR(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			} else if (S_ISREG(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_REALTIME;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSIZE;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			}
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
			    xfs_inherit_noatime)
				di_flags |= XFS_DIFLAG_NOATIME;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
			    xfs_inherit_nodump)
				di_flags |= XFS_DIFLAG_NODUMP;
			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
			    xfs_inherit_sync)
				di_flags |= XFS_DIFLAG_SYNC;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
			    xfs_inherit_nosymlinks)
				di_flags |= XFS_DIFLAG_NOSYMLINKS;
			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
			    xfs_inherit_nodefrag)
				di_flags |= XFS_DIFLAG_NODEFRAG;
			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
				di_flags |= XFS_DIFLAG_FILESTREAM;
			ip->i_d.di_flags |= di_flags;
		}
		/* FALLTHROUGH */
	case S_IFLNK:
		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_flags = XFS_IFEXTENTS;
		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
		ip->i_df.if_u1.if_extents = NULL;
		break;
	default:
		ASSERT(0);
	}
	/*
	 * Attribute fork settings for new inode.
	 */
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_anextents = 0;

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup inode ops and unlock */
	xfs_setup_inode(ip);

	/* now we have set up the vfs inode we can associate the filestream */
	if (filestreams) {
		error = xfs_filestream_associate(pip, ip);
		if (error < 0)
			return -error;
		if (!error)
			xfs_iflags_set(ip, XFS_IFILESTREAM);
	}

	*ipp = ip;
	return 0;
}
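
/*
 * Editor's sketch of the call_again protocol described above (names are
 * hypothetical; error handling abbreviated).  A caller loops roughly
 * like this:
 *
 *	boolean_t	call_again = B_FALSE;
 *	xfs_buf_t	*ialloc_context = NULL;
 *
 *	error = xfs_ialloc(tp, dp, mode, 1, 0, prid, okalloc,
 *			   &ialloc_context, &call_again, &ip);
 *	if (!error && call_again) {
 *		commit tp while holding ialloc_context across the commit,
 *		start a new transaction, then call xfs_ialloc() again;
 *		the second call returns the locked incore inode.
 *	}
 */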

/*
 * Check to make sure that there are no blocks allocated to the
 * file beyond the size of the file.  We don't check this for
 * files with fixed size extents or real time extents, but we
 * at least do it for regular files.
 */
#ifdef DEBUG
STATIC void
xfs_isize_check(
	struct xfs_inode	*ip,
	xfs_fsize_t		isize)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		map_first;
	int			nimaps;
	xfs_bmbt_irec_t		imaps[2];
	int			error;

	if (!S_ISREG(ip->i_d.di_mode))
		return;

	if (XFS_IS_REALTIME_INODE(ip))
		return;

	if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
		return;

	nimaps = 2;
	map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	/*
	 * The filesystem could be shutting down, so bmapi may return
	 * an error.
	 */
	error = xfs_bmapi_read(ip, map_first,
			 (XFS_B_TO_FSB(mp,
				(xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
			 imaps, &nimaps, XFS_BMAPI_ENTIRE);
	if (error)
		return;
	ASSERT(nimaps == 1);
	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
}
#else	/* DEBUG */
#define xfs_isize_check(ip, isize)
#endif	/* DEBUG */

/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fsize_t		new_size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp = *tpp;
	struct xfs_trans	*ntp;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	xfs_fileoff_t		first_unmap_block;
	xfs_fileoff_t		last_block;
	xfs_filblks_t		unmap_len;
	int			committed;
	int			error = 0;
	int			done = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
	ASSERT(new_size <= ip->i_size);
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.  If the first block to be removed is
	 * beyond the maximum file size (ie it is the same as last_block),
	 * then there is nothing to do.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
	if (first_unmap_block == last_block)
		return 0;

	ASSERT(first_unmap_block < last_block);
	unmap_len = last_block - first_unmap_block + 1;
	while (!done) {
		xfs_bmap_init(&free_list, &first_block);
		error = xfs_bunmapi(tp, ip,
				    first_unmap_block, unmap_len,
				    xfs_bmapi_aflag(whichfork),
				    XFS_ITRUNC_MAX_EXTENTS,
				    &first_block, &free_list,
				    &done);
		if (error)
			goto out_bmap_cancel;

		/*
		 * Duplicate the transaction that has the permanent
		 * reservation and commit the old transaction.
		 */
		error = xfs_bmap_finish(&tp, &free_list, &committed);
		if (committed)
			xfs_trans_ijoin(tp, ip, 0);
		if (error)
			goto out_bmap_cancel;

		if (committed) {
			/*
			 * Mark the inode dirty so it will be logged and
			 * moved forward in the log as part of every commit.
			 */
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		}

		ntp = xfs_trans_dup(tp);
		error = xfs_trans_commit(tp, 0);
		tp = ntp;

		xfs_trans_ijoin(tp, ip, 0);

		if (error)
			goto out;

		/*
		 * Transaction commit worked ok so we can drop the extra ticket
		 * reference that we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(tp->t_ticket);
		error = xfs_trans_reserve(tp, 0,
					XFS_ITRUNCATE_LOG_RES(mp), 0,
					XFS_TRANS_PERM_LOG_RES,
					XFS_ITRUNCATE_LOG_COUNT);
		if (error)
			goto out;
	}

out:
	*tpp = tp;
	return error;
out_bmap_cancel:
	/*
	 * If the bunmapi call encounters an error, return to the caller where
	 * the transaction can be properly aborted.  We just need to make sure
	 * we're not holding any resources that we were not when we came in.
	 */
	xfs_bmap_cancel(&free_list);
	goto out;
}
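
/*
 * Editor's note on the loop above: the dup/commit/reserve sequence is the
 * standard XFS transaction roll.  xfs_trans_dup() carries the permanent
 * log reservation and an extra ticket reference over to the new
 * transaction, the old one is committed, and xfs_trans_reserve() tops the
 * new transaction back up.  Re-joining the inode after each step keeps it
 * locked and held for the whole multi-transaction truncate.
 */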

int
xfs_itruncate_data(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	xfs_fsize_t		new_size)
{
	int			error;

	trace_xfs_itruncate_data_start(ip, new_size);

	/*
	 * The first thing we do is set the size to new_size permanently on
	 * disk.  This way we don't have to worry about anyone ever being able
	 * to look at the data being freed even in the face of a crash.
	 * What we're getting around here is the case where we free a block, it
	 * is allocated to another file, it is written to, and then we crash.
	 * If the new data gets written to the file but the log buffers
	 * containing the free and reallocation don't, then we'd end up with
	 * garbage in the blocks being freed.  As long as we make the new_size
	 * permanent before actually freeing any blocks it doesn't matter if
	 * they get written to.
	 */
	if (ip->i_d.di_nextents > 0) {
		/*
		 * If we are not changing the file size then do not update
		 * the on-disk file size - we may be called from
		 * xfs_inactive_free_eofblocks().  If we update the on-disk
		 * file size and then the system crashes before the contents
		 * of the file are flushed to disk then the files may be
		 * full of holes (ie NULL files bug).
		 */
		if (ip->i_size != new_size) {
			ip->i_d.di_size = new_size;
			ip->i_size = new_size;
			xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
		}
	}

	error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
	if (error)
		return error;

	/*
	 * If we are not changing the file size then do not update the on-disk
	 * file size - we may be called from xfs_inactive_free_eofblocks().
	 * If we update the on-disk file size and then the system crashes
	 * before the contents of the file are flushed to disk then the files
	 * may be full of holes (ie NULL files bug).
	 */
	xfs_isize_check(ip, new_size);
	if (ip->i_size != new_size) {
		ip->i_d.di_size = new_size;
		ip->i_size = new_size;
	}

	ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
	ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_data_end(ip, new_size);
	return 0;
}

/*
 * This is called when the inode's link count goes to 0.
 * We place the on-disk inode on a list in the AGI.  It
 * will be pulled from this list when the inode is freed.
 */
int
xfs_iunlink(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_agi_t	*agi;
	xfs_dinode_t	*dip;
	xfs_buf_t	*agibp;
	xfs_buf_t	*ibp;
	xfs_agino_t	agino;
	short		bucket_index;
	int		offset;
	int		error;

	ASSERT(ip->i_d.di_nlink == 0);
	ASSERT(ip->i_d.di_mode != 0);

	mp = tp->t_mountp;

	/*
	 * Get the agi buffer first.  It ensures lock ordering
	 * on the list.
	 */
	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
	if (error)
		return error;
	agi = XFS_BUF_TO_AGI(agibp);

	/*
	 * Get the index into the agi hash table for the
	 * list this inode will go on.
	 */
	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	ASSERT(agino != 0);
	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
	ASSERT(agi->agi_unlinked[bucket_index]);
	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);

	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
		/*
		 * There is already another inode in the bucket we need
		 * to add ourselves to.  Add us at the front of the list.
		 * Here we put the head pointer into our next pointer,
		 * and then we fall through to point the head at us.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
		if (error)
			return error;

		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
		offset = ip->i_imap.im_boffset +
			offsetof(xfs_dinode_t, di_next_unlinked);
		xfs_trans_inode_buf(tp, ibp);
		xfs_trans_log_buf(tp, ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, ibp);
	}

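	/*
	 * Editor's illustration of the insertion in progress (bucket heads
	 * live in the AGI, next pointers in each on-disk inode):
	 *
	 *	before:  agi_unlinked[b] -> A -> B -> NULLAGINO
	 *	after:   agi_unlinked[b] -> ip -> A -> B -> NULLAGINO
	 *
	 * The inode's di_next_unlinked now carries the old head (or was
	 * already NULLAGINO for an empty bucket); all that remains is to
	 * point the bucket head at us.
	 */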
	/*
	 * Point the bucket head pointer at the inode being inserted.
	 */
	ASSERT(agino != 0);
	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
	offset = offsetof(xfs_agi_t, agi_unlinked) +
		(sizeof(xfs_agino_t) * bucket_index);
	xfs_trans_log_buf(tp, agibp, offset,
			  (offset + sizeof(xfs_agino_t) - 1));
	return 0;
}

/*
 * Pull the on-disk inode from the AGI unlinked list.
 */
STATIC int
xfs_iunlink_remove(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_ino_t	next_ino;
	xfs_mount_t	*mp;
	xfs_agi_t	*agi;
	xfs_dinode_t	*dip;
	xfs_buf_t	*agibp;
	xfs_buf_t	*ibp;
	xfs_agnumber_t	agno;
	xfs_agino_t	agino;
	xfs_agino_t	next_agino;
	xfs_buf_t	*last_ibp;
	xfs_dinode_t	*last_dip = NULL;
	short		bucket_index;
	int		offset, last_offset = 0;
	int		error;

	mp = tp->t_mountp;
	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);

	/*
	 * Get the agi buffer first.  It ensures lock ordering
	 * on the list.
	 */
	error = xfs_read_agi(mp, tp, agno, &agibp);
	if (error)
		return error;

	agi = XFS_BUF_TO_AGI(agibp);

	/*
	 * Get the index into the agi hash table for the
	 * list this inode will go on.
	 */
	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	ASSERT(agino != 0);
	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
	ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
	ASSERT(agi->agi_unlinked[bucket_index]);

	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
		/*
		 * We're at the head of the list.  Get the inode's
		 * on-disk buffer to see if there is anyone after us
		 * on the list.  Only modify our next pointer if it
		 * is not already NULLAGINO.  This saves us the overhead
		 * of dealing with the buffer when there is no need to
		 * change it.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
		if (error) {
			xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
				__func__, error);
			return error;
		}
		next_agino = be32_to_cpu(dip->di_next_unlinked);
		ASSERT(next_agino != 0);
		if (next_agino != NULLAGINO) {
			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
			offset = ip->i_imap.im_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the bucket head pointer at the next inode.
		 */
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
		offset = offsetof(xfs_agi_t, agi_unlinked) +
			(sizeof(xfs_agino_t) * bucket_index);
		xfs_trans_log_buf(tp, agibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
	} else {
		/*
		 * We need to search the list for the inode being freed.
		 */
		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
		last_ibp = NULL;
		while (next_agino != agino) {
			/*
			 * If the last inode wasn't the one pointing to
			 * us, then release its buffer since we're not
			 * going to do anything with it.
			 */
			if (last_ibp != NULL) {
				xfs_trans_brelse(tp, last_ibp);
			}
			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
			error = xfs_inotobp(mp, tp, next_ino, &last_dip,
					    &last_ibp, &last_offset, 0);
			if (error) {
				xfs_warn(mp,
					"%s: xfs_inotobp() returned error %d.",
					__func__, error);
				return error;
			}
			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
			ASSERT(next_agino != NULLAGINO);
			ASSERT(next_agino != 0);
		}
		/*
		 * Now last_ibp points to the buffer previous to us on
		 * the unlinked list.  Pull us from the list.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
		if (error) {
			xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
				__func__, error);
			return error;
		}
		next_agino = be32_to_cpu(dip->di_next_unlinked);
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		if (next_agino != NULLAGINO) {
			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
			offset = ip->i_imap.im_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the previous inode on the list to the next inode.
		 */
		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
		ASSERT(next_agino != 0);
		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
		xfs_trans_inode_buf(tp, last_ibp);
		xfs_trans_log_buf(tp, last_ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, last_ibp);
	}
	return 0;
}

/*
 * A big issue when freeing the inode cluster is that we _cannot_ skip any
 * inodes that are in memory - they all must be marked stale and attached to
 * the cluster buffer.
 */
STATIC int
xfs_ifree_cluster(
	xfs_inode_t	*free_ip,
	xfs_trans_t	*tp,
	xfs_ino_t	inum)
{
	xfs_mount_t		*mp = free_ip->i_mount;
	int			blks_per_cluster;
	int			nbufs;
	int			ninodes;
	int			i, j;
	xfs_daddr_t		blkno;
	xfs_buf_t		*bp;
	xfs_inode_t		*ip;
	xfs_inode_log_item_t	*iip;
	xfs_log_item_t		*lip;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
		blks_per_cluster = 1;
		ninodes = mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp);
	} else {
		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
					mp->m_sb.sb_blocksize;
		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
	}

	for (j = 0; j < nbufs; j++, inum += ninodes) {
		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
					 XFS_INO_TO_AGBNO(mp, inum));

		/*
		 * We obtain and lock the backing buffer first in the process
		 * here, as we have to ensure that any dirty inode that we
		 * can't get the flush lock on is attached to the buffer.
		 * If we scan the in-memory inodes first, then buffer IO can
		 * complete before we get a lock on it, and hence we may fail
		 * to mark all the active inodes on the buffer stale.
		 */
		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
					mp->m_bsize * blks_per_cluster,
					XBF_LOCK);

		if (!bp)
			return ENOMEM;
		/*
		 * Walk the inodes already attached to the buffer and mark them
		 * stale.
		 * These will all have the flush locks held, so an
		 * in-memory inode walk can't lock them. By marking them all
		 * stale first, we will not attempt to lock them in the loop
		 * below as the XFS_ISTALE flag will be set.
		 */
		lip = bp->b_fspriv;
		while (lip) {
			if (lip->li_type == XFS_LI_INODE) {
				iip = (xfs_inode_log_item_t *)lip;
				ASSERT(iip->ili_logged == 1);
				lip->li_cb = xfs_istale_done;
				xfs_trans_ail_copy_lsn(mp->m_ail,
							&iip->ili_flush_lsn,
							&iip->ili_item.li_lsn);
				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
			}
			lip = lip->li_bio_list;
		}


		/*
		 * For each inode in memory attempt to add it to the inode
		 * buffer and set it up for being staled on buffer IO
		 * completion.  This is safe as we've locked out tail pushing
		 * and flushing by locking the buffer.
		 *
		 * We have already marked every inode that was part of a
		 * transaction stale above, which means there is no point in
		 * even trying to lock them.
		 */
		for (i = 0; i < ninodes; i++) {
retry:
			rcu_read_lock();
			ip = radix_tree_lookup(&pag->pag_ici_root,
					XFS_INO_TO_AGINO(mp, (inum + i)));

			/* Inode not in memory, nothing to do */
			if (!ip) {
				rcu_read_unlock();
				continue;
			}

			/*
			 * because this is an RCU protected lookup, we could
			 * find a recently freed or even reallocated inode
			 * during the lookup. We need to check under the
			 * i_flags_lock for a valid inode here. Skip it if it
			 * is not valid, the wrong inode or stale.
			 */
			spin_lock(&ip->i_flags_lock);
			if (ip->i_ino != inum + i ||
			    __xfs_iflags_test(ip, XFS_ISTALE)) {
				spin_unlock(&ip->i_flags_lock);
				rcu_read_unlock();
				continue;
			}
			spin_unlock(&ip->i_flags_lock);

			/*
			 * Don't try to lock/unlock the current inode, but we
			 * _cannot_ skip the other inodes that we did not find
			 * in the list attached to the buffer and are not
			 * already marked stale. If we can't lock it, back off
			 * and retry.
			 */
			if (ip != free_ip &&
			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
				rcu_read_unlock();
				delay(1);
				goto retry;
			}
			rcu_read_unlock();

			xfs_iflock(ip);
			xfs_iflags_set(ip, XFS_ISTALE);

			/*
			 * we don't need to attach clean inodes or those only
			 * with unlogged changes (which we throw away, anyway).
			 */
			iip = ip->i_itemp;
			if (!iip || xfs_inode_clean(ip)) {
				ASSERT(ip != free_ip);
				ip->i_update_core = 0;
				xfs_ifunlock(ip);
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				continue;
			}

			iip->ili_last_fields = iip->ili_format.ilf_fields;
			iip->ili_format.ilf_fields = 0;
			iip->ili_logged = 1;
			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
						&iip->ili_item.li_lsn);

			xfs_buf_attach_iodone(bp, xfs_istale_done,
						  &iip->ili_item);

			if (ip != free_ip)
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

		xfs_trans_stale_inode_buf(tp, bp);
		xfs_trans_binval(tp, bp);
	}

	xfs_perag_put(pag);
	return 0;
}
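
/*
 * Editor's note -- a worked example of the cluster geometry above; the
 * numbers are an assumption for illustration, not taken from this file.
 * With 4k filesystem blocks, 256 byte inodes (sb_inopblock = 16) and an
 * 8k inode cluster, blks_per_cluster = 2 and ninodes = 32 per buffer; a
 * 64-inode allocation chunk of XFS_IALLOC_BLOCKS(mp) = 4 blocks is then
 * walked as nbufs = 2 cluster buffers.
 */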
We need to remove the inode from 1809 * that list atomically with respect to freeing it here. 1810 */ 1811 int 1812 xfs_ifree( 1813 xfs_trans_t *tp, 1814 xfs_inode_t *ip, 1815 xfs_bmap_free_t *flist) 1816 { 1817 int error; 1818 int delete; 1819 xfs_ino_t first_ino; 1820 xfs_dinode_t *dip; 1821 xfs_buf_t *ibp; 1822 1823 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 1824 ASSERT(ip->i_d.di_nlink == 0); 1825 ASSERT(ip->i_d.di_nextents == 0); 1826 ASSERT(ip->i_d.di_anextents == 0); 1827 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || 1828 (!S_ISREG(ip->i_d.di_mode))); 1829 ASSERT(ip->i_d.di_nblocks == 0); 1830 1831 /* 1832 * Pull the on-disk inode from the AGI unlinked list. 1833 */ 1834 error = xfs_iunlink_remove(tp, ip); 1835 if (error != 0) { 1836 return error; 1837 } 1838 1839 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 1840 if (error != 0) { 1841 return error; 1842 } 1843 ip->i_d.di_mode = 0; /* mark incore inode as free */ 1844 ip->i_d.di_flags = 0; 1845 ip->i_d.di_dmevmask = 0; 1846 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 1847 ip->i_df.if_ext_max = 1848 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 1849 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 1850 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 1851 /* 1852 * Bump the generation count so no one will be confused 1853 * by reincarnations of this inode. 1854 */ 1855 ip->i_d.di_gen++; 1856 1857 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1858 1859 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK); 1860 if (error) 1861 return error; 1862 1863 /* 1864 * Clear the on-disk di_mode. This is to prevent xfs_bulkstat 1865 * from picking up this inode when it is reclaimed (its incore state 1866 * initialzed but not flushed to disk yet). The in-core di_mode is 1867 * already cleared and a corresponding transaction logged. 1868 * The hack here just synchronizes the in-core to on-disk 1869 * di_mode value in advance before the actual inode sync to disk. 1870 * This is OK because the inode is already unlinked and would never 1871 * change its di_mode again for this inode generation. 1872 * This is a temporary hack that would require a proper fix 1873 * in the future. 1874 */ 1875 dip->di_mode = 0; 1876 1877 if (delete) { 1878 error = xfs_ifree_cluster(ip, tp, first_ino); 1879 } 1880 1881 return error; 1882 } 1883 1884 /* 1885 * Reallocate the space for if_broot based on the number of records 1886 * being added or deleted as indicated in rec_diff. Move the records 1887 * and pointers in if_broot to fit the new size. When shrinking this 1888 * will eliminate holes between the records and pointers created by 1889 * the caller. When growing this will create holes to be filled in 1890 * by the caller. 1891 * 1892 * The caller must not request to add more records than would fit in 1893 * the on-disk inode root. If the if_broot is currently NULL, then 1894 * if we adding records one will be allocated. The caller must also 1895 * not request that the number of records go below zero, although 1896 * it can go to zero. 1897 * 1898 * ip -- the inode whose if_broot area is changing 1899 * ext_diff -- the change in the number of records, positive or negative, 1900 * requested for the if_broot array. 
1901 */ 1902 void 1903 xfs_iroot_realloc( 1904 xfs_inode_t *ip, 1905 int rec_diff, 1906 int whichfork) 1907 { 1908 struct xfs_mount *mp = ip->i_mount; 1909 int cur_max; 1910 xfs_ifork_t *ifp; 1911 struct xfs_btree_block *new_broot; 1912 int new_max; 1913 size_t new_size; 1914 char *np; 1915 char *op; 1916 1917 /* 1918 * Handle the degenerate case quietly. 1919 */ 1920 if (rec_diff == 0) { 1921 return; 1922 } 1923 1924 ifp = XFS_IFORK_PTR(ip, whichfork); 1925 if (rec_diff > 0) { 1926 /* 1927 * If there wasn't any memory allocated before, just 1928 * allocate it now and get out. 1929 */ 1930 if (ifp->if_broot_bytes == 0) { 1931 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); 1932 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 1933 ifp->if_broot_bytes = (int)new_size; 1934 return; 1935 } 1936 1937 /* 1938 * If there is already an existing if_broot, then we need 1939 * to realloc() it and shift the pointers to their new 1940 * location. The records don't change location because 1941 * they are kept butted up against the btree block header. 1942 */ 1943 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); 1944 new_max = cur_max + rec_diff; 1945 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 1946 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 1947 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 1948 KM_SLEEP | KM_NOFS); 1949 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 1950 ifp->if_broot_bytes); 1951 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 1952 (int)new_size); 1953 ifp->if_broot_bytes = (int)new_size; 1954 ASSERT(ifp->if_broot_bytes <= 1955 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 1956 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); 1957 return; 1958 } 1959 1960 /* 1961 * rec_diff is less than 0. In this case, we are shrinking the 1962 * if_broot buffer. It must already exist. If we go to zero 1963 * records, just get rid of the root and clear the status bit. 1964 */ 1965 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 1966 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); 1967 new_max = cur_max + rec_diff; 1968 ASSERT(new_max >= 0); 1969 if (new_max > 0) 1970 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 1971 else 1972 new_size = 0; 1973 if (new_size > 0) { 1974 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 1975 /* 1976 * First copy over the btree block header. 1977 */ 1978 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); 1979 } else { 1980 new_broot = NULL; 1981 ifp->if_flags &= ~XFS_IFBROOT; 1982 } 1983 1984 /* 1985 * Only copy the records and pointers if there are any. 1986 */ 1987 if (new_max > 0) { 1988 /* 1989 * First copy the records. 1990 */ 1991 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); 1992 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); 1993 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 1994 1995 /* 1996 * Then copy the pointers. 1997 */ 1998 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 1999 ifp->if_broot_bytes); 2000 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, 2001 (int)new_size); 2002 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2003 } 2004 kmem_free(ifp->if_broot); 2005 ifp->if_broot = new_broot; 2006 ifp->if_broot_bytes = (int)new_size; 2007 ASSERT(ifp->if_broot_bytes <= 2008 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2009 return; 2010 } 2011 2012 2013 /* 2014 * This is called when the amount of space needed for if_data 2015 * is increased or decreased. 
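 * (Illustrative summary, added editorially: with the 4-byte logging
 * granularity enforced below, the heap copy is sized as
 *	real_size = roundup(new_size, 4);
 * while anything that fits in if_u2.if_inline_data stays inline with
 * if_real_bytes == 0.)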
The change in size is indicated by 2016 * the number of bytes that need to be added or deleted in the 2017 * byte_diff parameter. 2018 * 2019 * If the amount of space needed has decreased below the size of the 2020 * inline buffer, then switch to using the inline buffer. Otherwise, 2021 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2022 * to what is needed. 2023 * 2024 * ip -- the inode whose if_data area is changing 2025 * byte_diff -- the change in the number of bytes, positive or negative, 2026 * requested for the if_data array. 2027 */ 2028 void 2029 xfs_idata_realloc( 2030 xfs_inode_t *ip, 2031 int byte_diff, 2032 int whichfork) 2033 { 2034 xfs_ifork_t *ifp; 2035 int new_size; 2036 int real_size; 2037 2038 if (byte_diff == 0) { 2039 return; 2040 } 2041 2042 ifp = XFS_IFORK_PTR(ip, whichfork); 2043 new_size = (int)ifp->if_bytes + byte_diff; 2044 ASSERT(new_size >= 0); 2045 2046 if (new_size == 0) { 2047 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2048 kmem_free(ifp->if_u1.if_data); 2049 } 2050 ifp->if_u1.if_data = NULL; 2051 real_size = 0; 2052 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { 2053 /* 2054 * If the valid extents/data can fit in if_inline_ext/data, 2055 * copy them from the malloc'd vector and free it. 2056 */ 2057 if (ifp->if_u1.if_data == NULL) { 2058 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2059 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2060 ASSERT(ifp->if_real_bytes != 0); 2061 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, 2062 new_size); 2063 kmem_free(ifp->if_u1.if_data); 2064 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2065 } 2066 real_size = 0; 2067 } else { 2068 /* 2069 * Stuck with malloc/realloc. 2070 * For inline data, the underlying buffer must be 2071 * a multiple of 4 bytes in size so that it can be 2072 * logged and stay on word boundaries. We enforce 2073 * that here. 2074 */ 2075 real_size = roundup(new_size, 4); 2076 if (ifp->if_u1.if_data == NULL) { 2077 ASSERT(ifp->if_real_bytes == 0); 2078 ifp->if_u1.if_data = kmem_alloc(real_size, 2079 KM_SLEEP | KM_NOFS); 2080 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2081 /* 2082 * Only do the realloc if the underlying size 2083 * is really changing. 2084 */ 2085 if (ifp->if_real_bytes != real_size) { 2086 ifp->if_u1.if_data = 2087 kmem_realloc(ifp->if_u1.if_data, 2088 real_size, 2089 ifp->if_real_bytes, 2090 KM_SLEEP | KM_NOFS); 2091 } 2092 } else { 2093 ASSERT(ifp->if_real_bytes == 0); 2094 ifp->if_u1.if_data = kmem_alloc(real_size, 2095 KM_SLEEP | KM_NOFS); 2096 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 2097 ifp->if_bytes); 2098 } 2099 } 2100 ifp->if_real_bytes = real_size; 2101 ifp->if_bytes = new_size; 2102 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2103 } 2104 2105 void 2106 xfs_idestroy_fork( 2107 xfs_inode_t *ip, 2108 int whichfork) 2109 { 2110 xfs_ifork_t *ifp; 2111 2112 ifp = XFS_IFORK_PTR(ip, whichfork); 2113 if (ifp->if_broot != NULL) { 2114 kmem_free(ifp->if_broot); 2115 ifp->if_broot = NULL; 2116 } 2117 2118 /* 2119 * If the format is local, then we can't have an extents 2120 * array so just look for an inline data array. If we're 2121 * not local then we may or may not have an extents list, 2122 * so check and free it up if we do. 
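 *
 * (Editorial sketch of the expected teardown order, an assumption for
 * illustration rather than text from this file:
 *	xfs_idestroy_fork(ip, XFS_DATA_FORK);
 *	if (XFS_IFORK_Q(ip))
 *		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 * the XFS_ATTR_FORK case below also frees ip->i_afp back to
 * xfs_ifork_zone.)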
2123 */ 2124 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 2125 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && 2126 (ifp->if_u1.if_data != NULL)) { 2127 ASSERT(ifp->if_real_bytes != 0); 2128 kmem_free(ifp->if_u1.if_data); 2129 ifp->if_u1.if_data = NULL; 2130 ifp->if_real_bytes = 0; 2131 } 2132 } else if ((ifp->if_flags & XFS_IFEXTENTS) && 2133 ((ifp->if_flags & XFS_IFEXTIREC) || 2134 ((ifp->if_u1.if_extents != NULL) && 2135 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { 2136 ASSERT(ifp->if_real_bytes != 0); 2137 xfs_iext_destroy(ifp); 2138 } 2139 ASSERT(ifp->if_u1.if_extents == NULL || 2140 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); 2141 ASSERT(ifp->if_real_bytes == 0); 2142 if (whichfork == XFS_ATTR_FORK) { 2143 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 2144 ip->i_afp = NULL; 2145 } 2146 } 2147 2148 /* 2149 * This is called to unpin an inode. The caller must have the inode locked 2150 * in at least shared mode so that the buffer cannot be subsequently pinned 2151 * once someone is waiting for it to be unpinned. 2152 */ 2153 static void 2154 xfs_iunpin_nowait( 2155 struct xfs_inode *ip) 2156 { 2157 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2158 2159 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2160 2161 /* Give the log a push to start the unpinning I/O */ 2162 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2163 2164 } 2165 2166 void 2167 xfs_iunpin_wait( 2168 struct xfs_inode *ip) 2169 { 2170 if (xfs_ipincount(ip)) { 2171 xfs_iunpin_nowait(ip); 2172 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); 2173 } 2174 } 2175 2176 /* 2177 * xfs_iextents_copy() 2178 * 2179 * This is called to copy the REAL extents (as opposed to the delayed 2180 * allocation extents) from the inode into the given buffer. It 2181 * returns the number of bytes copied into the buffer. 2182 * 2183 * If there are no delayed allocation extents, then we can just 2184 * memcpy() the extents into the buffer. Otherwise, we need to 2185 * examine each extent in turn and skip those which are delayed. 2186 */ 2187 int 2188 xfs_iextents_copy( 2189 xfs_inode_t *ip, 2190 xfs_bmbt_rec_t *dp, 2191 int whichfork) 2192 { 2193 int copied; 2194 int i; 2195 xfs_ifork_t *ifp; 2196 int nrecs; 2197 xfs_fsblock_t start_block; 2198 2199 ifp = XFS_IFORK_PTR(ip, whichfork); 2200 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2201 ASSERT(ifp->if_bytes > 0); 2202 2203 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2204 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); 2205 ASSERT(nrecs > 0); 2206 2207 /* 2208 * There are some delayed allocation extents in the 2209 * inode, so copy the extents one at a time and skip 2210 * the delayed ones. There must be at least one 2211 * non-delayed extent. 2212 */ 2213 copied = 0; 2214 for (i = 0; i < nrecs; i++) { 2215 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 2216 start_block = xfs_bmbt_get_startblock(ep); 2217 if (isnullstartblock(start_block)) { 2218 /* 2219 * It's a delayed allocation extent, so skip it. 
2220 */ 2221 continue; 2222 } 2223 2224 /* Translate to on disk format */ 2225 put_unaligned(cpu_to_be64(ep->l0), &dp->l0); 2226 put_unaligned(cpu_to_be64(ep->l1), &dp->l1); 2227 dp++; 2228 copied++; 2229 } 2230 ASSERT(copied != 0); 2231 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); 2232 2233 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2234 } 2235 2236 /* 2237 * Each of the following cases stores data into the same region 2238 * of the on-disk inode, so only one of them can be valid at 2239 * any given time. While it is possible to have conflicting formats 2240 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is 2241 * in EXTENTS format, this can only happen when the fork has 2242 * changed formats after being modified but before being flushed. 2243 * In these cases, the format always takes precedence, because the 2244 * format indicates the current state of the fork. 2245 */ 2246 /*ARGSUSED*/ 2247 STATIC void 2248 xfs_iflush_fork( 2249 xfs_inode_t *ip, 2250 xfs_dinode_t *dip, 2251 xfs_inode_log_item_t *iip, 2252 int whichfork, 2253 xfs_buf_t *bp) 2254 { 2255 char *cp; 2256 xfs_ifork_t *ifp; 2257 xfs_mount_t *mp; 2258 #ifdef XFS_TRANS_DEBUG 2259 int first; 2260 #endif 2261 static const short brootflag[2] = 2262 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; 2263 static const short dataflag[2] = 2264 { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; 2265 static const short extflag[2] = 2266 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2267 2268 if (!iip) 2269 return; 2270 ifp = XFS_IFORK_PTR(ip, whichfork); 2271 /* 2272 * This can happen if we gave up in iformat in an error path, 2273 * for the attribute fork. 2274 */ 2275 if (!ifp) { 2276 ASSERT(whichfork == XFS_ATTR_FORK); 2277 return; 2278 } 2279 cp = XFS_DFORK_PTR(dip, whichfork); 2280 mp = ip->i_mount; 2281 switch (XFS_IFORK_FORMAT(ip, whichfork)) { 2282 case XFS_DINODE_FMT_LOCAL: 2283 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && 2284 (ifp->if_bytes > 0)) { 2285 ASSERT(ifp->if_u1.if_data != NULL); 2286 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2287 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); 2288 } 2289 break; 2290 2291 case XFS_DINODE_FMT_EXTENTS: 2292 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2293 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2294 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2295 (ifp->if_bytes > 0)) { 2296 ASSERT(xfs_iext_get_ext(ifp, 0)); 2297 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2298 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2299 whichfork); 2300 } 2301 break; 2302 2303 case XFS_DINODE_FMT_BTREE: 2304 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && 2305 (ifp->if_broot_bytes > 0)) { 2306 ASSERT(ifp->if_broot != NULL); 2307 ASSERT(ifp->if_broot_bytes <= 2308 (XFS_IFORK_SIZE(ip, whichfork) + 2309 XFS_BROOT_SIZE_ADJ)); 2310 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2311 (xfs_bmdr_block_t *)cp, 2312 XFS_DFORK_SIZE(dip, mp, whichfork)); 2313 } 2314 break; 2315 2316 case XFS_DINODE_FMT_DEV: 2317 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2318 ASSERT(whichfork == XFS_DATA_FORK); 2319 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); 2320 } 2321 break; 2322 2323 case XFS_DINODE_FMT_UUID: 2324 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2325 ASSERT(whichfork == XFS_DATA_FORK); 2326 memcpy(XFS_DFORK_DPTR(dip), 2327 &ip->i_df.if_u2.if_uuid, 2328 sizeof(uuid_t)); 2329 } 2330 break; 2331 2332 default: 2333 ASSERT(0); 2334 break; 2335 } 2336 } 2337 2338 STATIC int 2339 xfs_iflush_cluster( 2340 xfs_inode_t *ip, 2341 xfs_buf_t *bp) 
2342 { 2343 xfs_mount_t *mp = ip->i_mount; 2344 struct xfs_perag *pag; 2345 unsigned long first_index, mask; 2346 unsigned long inodes_per_cluster; 2347 int ilist_size; 2348 xfs_inode_t **ilist; 2349 xfs_inode_t *iq; 2350 int nr_found; 2351 int clcount = 0; 2352 int bufwasdelwri; 2353 int i; 2354 2355 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2356 2357 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2358 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2359 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2360 if (!ilist) 2361 goto out_put; 2362 2363 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2364 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2365 rcu_read_lock(); 2366 /* really need a gang lookup range call here */ 2367 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2368 first_index, inodes_per_cluster); 2369 if (nr_found == 0) 2370 goto out_free; 2371 2372 for (i = 0; i < nr_found; i++) { 2373 iq = ilist[i]; 2374 if (iq == ip) 2375 continue; 2376 2377 /* 2378 * because this is an RCU protected lookup, we could find a 2379 * recently freed or even reallocated inode during the lookup. 2380 * We need to check under the i_flags_lock for a valid inode 2381 * here. Skip it if it is not valid or the wrong inode. 2382 */ 2383 spin_lock(&ip->i_flags_lock); 2384 if (!ip->i_ino || 2385 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { 2386 spin_unlock(&ip->i_flags_lock); 2387 continue; 2388 } 2389 spin_unlock(&ip->i_flags_lock); 2390 2391 /* 2392 * Do an un-protected check to see if the inode is dirty and 2393 * is a candidate for flushing. These checks will be repeated 2394 * later after the appropriate locks are acquired. 2395 */ 2396 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) 2397 continue; 2398 2399 /* 2400 * Try to get locks. If any are unavailable or it is pinned, 2401 * then this inode cannot be flushed and is skipped. 2402 */ 2403 2404 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) 2405 continue; 2406 if (!xfs_iflock_nowait(iq)) { 2407 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2408 continue; 2409 } 2410 if (xfs_ipincount(iq)) { 2411 xfs_ifunlock(iq); 2412 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2413 continue; 2414 } 2415 2416 /* 2417 * arriving here means that this inode can be flushed. First 2418 * re-check that it's dirty before flushing. 2419 */ 2420 if (!xfs_inode_clean(iq)) { 2421 int error; 2422 error = xfs_iflush_int(iq, bp); 2423 if (error) { 2424 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2425 goto cluster_corrupt_out; 2426 } 2427 clcount++; 2428 } else { 2429 xfs_ifunlock(iq); 2430 } 2431 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2432 } 2433 2434 if (clcount) { 2435 XFS_STATS_INC(xs_icluster_flushcnt); 2436 XFS_STATS_ADD(xs_icluster_flushinode, clcount); 2437 } 2438 2439 out_free: 2440 rcu_read_unlock(); 2441 kmem_free(ilist); 2442 out_put: 2443 xfs_perag_put(pag); 2444 return 0; 2445 2446 2447 cluster_corrupt_out: 2448 /* 2449 * Corruption detected in the clustering loop. Invalidate the 2450 * inode buffer and shut down the filesystem. 2451 */ 2452 rcu_read_unlock(); 2453 /* 2454 * Clean up the buffer. If it was B_DELWRI, just release it -- 2455 * brelse can handle it with no problems. If not, shut down the 2456 * filesystem before releasing the buffer. 
2457 */ 2458 bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); 2459 if (bufwasdelwri) 2460 xfs_buf_relse(bp); 2461 2462 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2463 2464 if (!bufwasdelwri) { 2465 /* 2466 * Just like incore_relse: if we have b_iodone functions, 2467 * mark the buffer as an error and call them. Otherwise 2468 * mark it as stale and brelse. 2469 */ 2470 if (bp->b_iodone) { 2471 XFS_BUF_UNDONE(bp); 2472 xfs_buf_stale(bp); 2473 xfs_buf_ioerror(bp, EIO); 2474 xfs_buf_ioend(bp, 0); 2475 } else { 2476 xfs_buf_stale(bp); 2477 xfs_buf_relse(bp); 2478 } 2479 } 2480 2481 /* 2482 * Unlocks the flush lock 2483 */ 2484 xfs_iflush_abort(iq); 2485 kmem_free(ilist); 2486 xfs_perag_put(pag); 2487 return XFS_ERROR(EFSCORRUPTED); 2488 } 2489 2490 /* 2491 * xfs_iflush() will write a modified inode's changes out to the 2492 * inode's on disk home. The caller must have the inode lock held 2493 * in at least shared mode and the inode flush completion must be 2494 * active as well. The inode lock will still be held upon return from 2495 * the call and the caller is free to unlock it. 2496 * The inode flush will be completed when the inode reaches the disk. 2497 * The flags indicate how the inode's buffer should be written out. 2498 */ 2499 int 2500 xfs_iflush( 2501 xfs_inode_t *ip, 2502 uint flags) 2503 { 2504 xfs_inode_log_item_t *iip; 2505 xfs_buf_t *bp; 2506 xfs_dinode_t *dip; 2507 xfs_mount_t *mp; 2508 int error; 2509 2510 XFS_STATS_INC(xs_iflush_count); 2511 2512 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2513 ASSERT(!completion_done(&ip->i_flush)); 2514 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2515 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2516 2517 iip = ip->i_itemp; 2518 mp = ip->i_mount; 2519 2520 /* 2521 * We can't flush the inode until it is unpinned, so wait for it if we 2522 * are allowed to block. We know no one new can pin it, because we are 2523 * holding the inode lock shared and you need to hold it exclusively to 2524 * pin the inode. 2525 * 2526 * If we are not allowed to block, force the log out asynchronously so 2527 * that when we come back the inode will be unpinned. If other inodes 2528 * in the same cluster are dirty, they will probably write the inode 2529 * out for us if they occur after the log force completes. 2530 */ 2531 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { 2532 xfs_iunpin_nowait(ip); 2533 xfs_ifunlock(ip); 2534 return EAGAIN; 2535 } 2536 xfs_iunpin_wait(ip); 2537 2538 /* 2539 * For stale inodes we cannot rely on the backing buffer remaining 2540 * stale in cache for the remaining life of the stale inode and so 2541 * xfs_itobp() below may give us a buffer that no longer contains 2542 * inodes below. We have to check this after ensuring the inode is 2543 * unpinned so that it is safe to reclaim the stale inode after the 2544 * flush call. 2545 */ 2546 if (xfs_iflags_test(ip, XFS_ISTALE)) { 2547 xfs_ifunlock(ip); 2548 return 0; 2549 } 2550 2551 /* 2552 * This may have been unpinned because the filesystem is shutting 2553 * down forcibly. If that's the case we must not write this inode 2554 * to disk, because the log record didn't make it to disk! 2555 */ 2556 if (XFS_FORCED_SHUTDOWN(mp)) { 2557 ip->i_update_core = 0; 2558 if (iip) 2559 iip->ili_format.ilf_fields = 0; 2560 xfs_ifunlock(ip); 2561 return XFS_ERROR(EIO); 2562 } 2563 2564 /* 2565 * Get the buffer containing the on-disk inode. 2566 */ 2567 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2568 (flags & SYNC_TRYLOCK) ? 
XBF_TRYLOCK : XBF_LOCK); 2569 if (error || !bp) { 2570 xfs_ifunlock(ip); 2571 return error; 2572 } 2573 2574 /* 2575 * First flush out the inode that xfs_iflush was called with. 2576 */ 2577 error = xfs_iflush_int(ip, bp); 2578 if (error) 2579 goto corrupt_out; 2580 2581 /* 2582 * If the buffer is pinned then push on the log now so we won't 2583 * get stuck waiting in the write for too long. 2584 */ 2585 if (xfs_buf_ispinned(bp)) 2586 xfs_log_force(mp, 0); 2587 2588 /* 2589 * inode clustering: 2590 * see if other inodes can be gathered into this write 2591 */ 2592 error = xfs_iflush_cluster(ip, bp); 2593 if (error) 2594 goto cluster_corrupt_out; 2595 2596 if (flags & SYNC_WAIT) 2597 error = xfs_bwrite(bp); 2598 else 2599 xfs_buf_delwri_queue(bp); 2600 2601 xfs_buf_relse(bp); 2602 return error; 2603 2604 corrupt_out: 2605 xfs_buf_relse(bp); 2606 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2607 cluster_corrupt_out: 2608 /* 2609 * Unlocks the flush lock 2610 */ 2611 xfs_iflush_abort(ip); 2612 return XFS_ERROR(EFSCORRUPTED); 2613 } 2614 2615 2616 STATIC int 2617 xfs_iflush_int( 2618 xfs_inode_t *ip, 2619 xfs_buf_t *bp) 2620 { 2621 xfs_inode_log_item_t *iip; 2622 xfs_dinode_t *dip; 2623 xfs_mount_t *mp; 2624 #ifdef XFS_TRANS_DEBUG 2625 int first; 2626 #endif 2627 2628 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2629 ASSERT(!completion_done(&ip->i_flush)); 2630 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2631 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2632 2633 iip = ip->i_itemp; 2634 mp = ip->i_mount; 2635 2636 /* set *dip = inode's place in the buffer */ 2637 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2638 2639 /* 2640 * Clear i_update_core before copying out the data. 2641 * This is for coordination with our timestamp updates 2642 * that don't hold the inode lock. They will always 2643 * update the timestamps BEFORE setting i_update_core, 2644 * so if we clear i_update_core after they set it we 2645 * are guaranteed to see their updates to the timestamps. 2646 * I believe that this depends on strongly ordered memory 2647 * semantics, but we have that. We use the SYNCHRONIZE 2648 * macro to make sure that the compiler does not reorder 2649 * the i_update_core access below the data copy below. 2650 */ 2651 ip->i_update_core = 0; 2652 SYNCHRONIZE(); 2653 2654 /* 2655 * Make sure to get the latest timestamps from the Linux inode. 
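	 *
	 * (Editorial example of the ordering argument above:
	 *	timestamp updater:  write timestamps;  i_update_core = 1;
	 *	this flush path:    i_update_core = 0; SYNCHRONIZE(); copy core
	 * either the copy below observes the new timestamps, or
	 * i_update_core is still set afterwards and the inode stays dirty
	 * for a later flush.)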
2656 */ 2657 xfs_synchronize_times(ip); 2658 2659 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), 2660 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2661 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2662 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2663 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2664 goto corrupt_out; 2665 } 2666 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2667 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2668 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2669 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2670 __func__, ip->i_ino, ip, ip->i_d.di_magic); 2671 goto corrupt_out; 2672 } 2673 if (S_ISREG(ip->i_d.di_mode)) { 2674 if (XFS_TEST_ERROR( 2675 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2676 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2677 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2678 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2679 "%s: Bad regular inode %Lu, ptr 0x%p", 2680 __func__, ip->i_ino, ip); 2681 goto corrupt_out; 2682 } 2683 } else if (S_ISDIR(ip->i_d.di_mode)) { 2684 if (XFS_TEST_ERROR( 2685 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2686 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2687 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2688 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2689 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2690 "%s: Bad directory inode %Lu, ptr 0x%p", 2691 __func__, ip->i_ino, ip); 2692 goto corrupt_out; 2693 } 2694 } 2695 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2696 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2697 XFS_RANDOM_IFLUSH_5)) { 2698 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2699 "%s: detected corrupt incore inode %Lu, " 2700 "total extents = %d, nblocks = %Ld, ptr 0x%p", 2701 __func__, ip->i_ino, 2702 ip->i_d.di_nextents + ip->i_d.di_anextents, 2703 ip->i_d.di_nblocks, ip); 2704 goto corrupt_out; 2705 } 2706 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2707 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2708 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2709 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2710 __func__, ip->i_ino, ip->i_d.di_forkoff, ip); 2711 goto corrupt_out; 2712 } 2713 /* 2714 * bump the flush iteration count, used to detect flushes which 2715 * postdate a log record during recovery. 2716 */ 2717 2718 ip->i_d.di_flushiter++; 2719 2720 /* 2721 * Copy the dirty parts of the inode into the on-disk 2722 * inode. We always copy out the core of the inode, 2723 * because if the inode is dirty at all the core must 2724 * be. 2725 */ 2726 xfs_dinode_to_disk(dip, &ip->i_d); 2727 2728 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 2729 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 2730 ip->i_d.di_flushiter = 0; 2731 2732 /* 2733 * If this is really an old format inode and the superblock version 2734 * has not been updated to support only new format inodes, then 2735 * convert back to the old inode format. If the superblock version 2736 * has been updated, then make the conversion permanent. 2737 */ 2738 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); 2739 if (ip->i_d.di_version == 1) { 2740 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 2741 /* 2742 * Convert it back. 2743 */ 2744 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 2745 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); 2746 } else { 2747 /* 2748 * The superblock version has already been bumped, 2749 * so just make the conversion to the new inode 2750 * format permanent. 
2751 */ 2752 ip->i_d.di_version = 2; 2753 dip->di_version = 2; 2754 ip->i_d.di_onlink = 0; 2755 dip->di_onlink = 0; 2756 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 2757 memset(&(dip->di_pad[0]), 0, 2758 sizeof(dip->di_pad)); 2759 ASSERT(xfs_get_projid(ip) == 0); 2760 } 2761 } 2762 2763 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); 2764 if (XFS_IFORK_Q(ip)) 2765 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 2766 xfs_inobp_check(mp, bp); 2767 2768 /* 2769 * We've recorded everything logged in the inode, so we'd 2770 * like to clear the ilf_fields bits so we don't log and 2771 * flush things unnecessarily. However, we can't stop 2772 * logging all this information until the data we've copied 2773 * into the disk buffer is written to disk. If we did we might 2774 * overwrite the copy of the inode in the log with all the 2775 * data after re-logging only part of it, and in the face of 2776 * a crash we wouldn't have all the data we need to recover. 2777 * 2778 * What we do is move the bits to the ili_last_fields field. 2779 * When logging the inode, these bits are moved back to the 2780 * ilf_fields field. In the xfs_iflush_done() routine we 2781 * clear ili_last_fields, since we know that the information 2782 * those bits represent is permanently on disk. As long as 2783 * the flush completes before the inode is logged again, then 2784 * both ilf_fields and ili_last_fields will be cleared. 2785 * 2786 * We can play with the ilf_fields bits here, because the inode 2787 * lock must be held exclusively in order to set bits there 2788 * and the flush lock protects the ili_last_fields bits. 2789 * Set ili_logged so the flush done 2790 * routine can tell whether or not to look in the AIL. 2791 * Also, store the current LSN of the inode so that we can tell 2792 * whether the item has moved in the AIL from xfs_iflush_done(). 2793 * In order to read the lsn we need the AIL lock, because 2794 * it is a 64 bit value that cannot be read atomically. 2795 */ 2796 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 2797 iip->ili_last_fields = iip->ili_format.ilf_fields; 2798 iip->ili_format.ilf_fields = 0; 2799 iip->ili_logged = 1; 2800 2801 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 2802 &iip->ili_item.li_lsn); 2803 2804 /* 2805 * Attach the function xfs_iflush_done to the inode's 2806 * buffer. This will remove the inode from the AIL 2807 * and unlock the inode's flush lock when the inode is 2808 * completely written to disk. 2809 */ 2810 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); 2811 2812 ASSERT(bp->b_fspriv != NULL); 2813 ASSERT(bp->b_iodone != NULL); 2814 } else { 2815 /* 2816 * We're flushing an inode which is not in the AIL and has 2817 * not been logged but has i_update_core set. For this 2818 * case we can use a B_DELWRI flush and immediately drop 2819 * the inode flush lock because we can avoid the whole 2820 * AIL state thing. It's OK to drop the flush lock now, 2821 * because we've already locked the buffer and to do anything 2822 * you really need both. 2823 */ 2824 if (iip != NULL) { 2825 ASSERT(iip->ili_logged == 0); 2826 ASSERT(iip->ili_last_fields == 0); 2827 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); 2828 } 2829 xfs_ifunlock(ip); 2830 } 2831 2832 return 0; 2833 2834 corrupt_out: 2835 return XFS_ERROR(EFSCORRUPTED); 2836 } 2837 2838 /* 2839 * Return a pointer to the extent record at file index idx. 
2840 */ 2841 xfs_bmbt_rec_host_t * 2842 xfs_iext_get_ext( 2843 xfs_ifork_t *ifp, /* inode fork pointer */ 2844 xfs_extnum_t idx) /* index of target extent */ 2845 { 2846 ASSERT(idx >= 0); 2847 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); 2848 2849 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 2850 return ifp->if_u1.if_ext_irec->er_extbuf; 2851 } else if (ifp->if_flags & XFS_IFEXTIREC) { 2852 xfs_ext_irec_t *erp; /* irec pointer */ 2853 int erp_idx = 0; /* irec index */ 2854 xfs_extnum_t page_idx = idx; /* ext index in target list */ 2855 2856 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 2857 return &erp->er_extbuf[page_idx]; 2858 } else if (ifp->if_bytes) { 2859 return &ifp->if_u1.if_extents[idx]; 2860 } else { 2861 return NULL; 2862 } 2863 } 2864 2865 /* 2866 * Insert new item(s) into the extent records for incore inode 2867 * fork 'ifp'. 'count' new items are inserted at index 'idx'. 2868 */ 2869 void 2870 xfs_iext_insert( 2871 xfs_inode_t *ip, /* incore inode pointer */ 2872 xfs_extnum_t idx, /* starting index of new items */ 2873 xfs_extnum_t count, /* number of inserted items */ 2874 xfs_bmbt_irec_t *new, /* items to insert */ 2875 int state) /* type of extent conversion */ 2876 { 2877 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 2878 xfs_extnum_t i; /* extent record index */ 2879 2880 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); 2881 2882 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 2883 xfs_iext_add(ifp, idx, count); 2884 for (i = idx; i < idx + count; i++, new++) 2885 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); 2886 } 2887 2888 /* 2889 * This is called when the amount of space required for incore file 2890 * extents needs to be increased. The ext_diff parameter stores the 2891 * number of new extents being added and the idx parameter contains 2892 * the extent index where the new extents will be added. If the new 2893 * extents are being appended, then we just need to (re)allocate and 2894 * initialize the space. Otherwise, if the new extents are being 2895 * inserted into the middle of the existing entries, a bit more work 2896 * is required to make room for the new extents to be inserted. The 2897 * caller is responsible for filling in the new extent entries upon 2898 * return. 2899 */ 2900 void 2901 xfs_iext_add( 2902 xfs_ifork_t *ifp, /* inode fork pointer */ 2903 xfs_extnum_t idx, /* index to begin adding exts */ 2904 int ext_diff) /* number of extents to add */ 2905 { 2906 int byte_diff; /* new bytes being added */ 2907 int new_size; /* size of extents after adding */ 2908 xfs_extnum_t nextents; /* number of extents in file */ 2909 2910 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2911 ASSERT((idx >= 0) && (idx <= nextents)); 2912 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); 2913 new_size = ifp->if_bytes + byte_diff; 2914 /* 2915 * If the new number of extents (nextents + ext_diff) 2916 * fits inside the inode, then continue to use the inline 2917 * extent buffer. 2918 */ 2919 if (nextents + ext_diff <= XFS_INLINE_EXTS) { 2920 if (idx < nextents) { 2921 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], 2922 &ifp->if_u2.if_inline_ext[idx], 2923 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 2924 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); 2925 } 2926 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 2927 ifp->if_real_bytes = 0; 2928 } 2929 /* 2930 * Otherwise use a linear (direct) extent list. 
2931 * If the extents are currently inside the inode, 2932 * xfs_iext_realloc_direct will switch us from 2933 * inline to direct extent allocation mode. 2934 */ 2935 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { 2936 xfs_iext_realloc_direct(ifp, new_size); 2937 if (idx < nextents) { 2938 memmove(&ifp->if_u1.if_extents[idx + ext_diff], 2939 &ifp->if_u1.if_extents[idx], 2940 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 2941 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); 2942 } 2943 } 2944 /* Indirection array */ 2945 else { 2946 xfs_ext_irec_t *erp; 2947 int erp_idx = 0; 2948 int page_idx = idx; 2949 2950 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); 2951 if (ifp->if_flags & XFS_IFEXTIREC) { 2952 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); 2953 } else { 2954 xfs_iext_irec_init(ifp); 2955 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 2956 erp = ifp->if_u1.if_ext_irec; 2957 } 2958 /* Extents fit in target extent page */ 2959 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { 2960 if (page_idx < erp->er_extcount) { 2961 memmove(&erp->er_extbuf[page_idx + ext_diff], 2962 &erp->er_extbuf[page_idx], 2963 (erp->er_extcount - page_idx) * 2964 sizeof(xfs_bmbt_rec_t)); 2965 memset(&erp->er_extbuf[page_idx], 0, byte_diff); 2966 } 2967 erp->er_extcount += ext_diff; 2968 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 2969 } 2970 /* Insert a new extent page */ 2971 else if (erp) { 2972 xfs_iext_add_indirect_multi(ifp, 2973 erp_idx, page_idx, ext_diff); 2974 } 2975 /* 2976 * If extent(s) are being appended to the last page in 2977 * the indirection array and the new extent(s) don't fit 2978 * in the page, then erp is NULL and erp_idx is set to 2979 * the next index needed in the indirection array. 2980 */ 2981 else { 2982 int count = ext_diff; 2983 2984 while (count) { 2985 erp = xfs_iext_irec_new(ifp, erp_idx); 2986 erp->er_extcount = count; 2987 count -= MIN(count, (int)XFS_LINEAR_EXTS); 2988 if (count) { 2989 erp_idx++; 2990 } 2991 } 2992 } 2993 } 2994 ifp->if_bytes = new_size; 2995 } 2996 2997 /* 2998 * This is called when incore extents are being added to the indirection 2999 * array and the new extents do not fit in the target extent list. The 3000 * erp_idx parameter contains the irec index for the target extent list 3001 * in the indirection array, and the idx parameter contains the extent 3002 * index within the list. The number of extents being added is stored 3003 * in the count parameter. 
3004 * 3005 * |-------| |-------| 3006 * | | | | idx - number of extents before idx 3007 * | idx | | count | 3008 * | | | | count - number of extents being inserted at idx 3009 * |-------| |-------| 3010 * | count | | nex2 | nex2 - number of extents after idx + count 3011 * |-------| |-------| 3012 */ 3013 void 3014 xfs_iext_add_indirect_multi( 3015 xfs_ifork_t *ifp, /* inode fork pointer */ 3016 int erp_idx, /* target extent irec index */ 3017 xfs_extnum_t idx, /* index within target list */ 3018 int count) /* new extents being added */ 3019 { 3020 int byte_diff; /* new bytes being added */ 3021 xfs_ext_irec_t *erp; /* pointer to irec entry */ 3022 xfs_extnum_t ext_diff; /* number of extents to add */ 3023 xfs_extnum_t ext_cnt; /* new extents still needed */ 3024 xfs_extnum_t nex2; /* extents after idx + count */ 3025 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ 3026 int nlists; /* number of irec's (lists) */ 3027 3028 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3029 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3030 nex2 = erp->er_extcount - idx; 3031 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3032 3033 /* 3034 * Save second part of target extent list 3035 * (all extents past */ 3036 if (nex2) { 3037 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3038 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); 3039 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); 3040 erp->er_extcount -= nex2; 3041 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); 3042 memset(&erp->er_extbuf[idx], 0, byte_diff); 3043 } 3044 3045 /* 3046 * Add the new extents to the end of the target 3047 * list, then allocate new irec record(s) and 3048 * extent buffer(s) as needed to store the rest 3049 * of the new extents. 3050 */ 3051 ext_cnt = count; 3052 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); 3053 if (ext_diff) { 3054 erp->er_extcount += ext_diff; 3055 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3056 ext_cnt -= ext_diff; 3057 } 3058 while (ext_cnt) { 3059 erp_idx++; 3060 erp = xfs_iext_irec_new(ifp, erp_idx); 3061 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); 3062 erp->er_extcount = ext_diff; 3063 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3064 ext_cnt -= ext_diff; 3065 } 3066 3067 /* Add nex2 extents back to indirection array */ 3068 if (nex2) { 3069 xfs_extnum_t ext_avail; 3070 int i; 3071 3072 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3073 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; 3074 i = 0; 3075 /* 3076 * If nex2 extents fit in the current page, append 3077 * nex2_ep after the new extents. 3078 */ 3079 if (nex2 <= ext_avail) { 3080 i = erp->er_extcount; 3081 } 3082 /* 3083 * Otherwise, check if space is available in the 3084 * next page. 3085 */ 3086 else if ((erp_idx < nlists - 1) && 3087 (nex2 <= (ext_avail = XFS_LINEAR_EXTS - 3088 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { 3089 erp_idx++; 3090 erp++; 3091 /* Create a hole for nex2 extents */ 3092 memmove(&erp->er_extbuf[nex2], erp->er_extbuf, 3093 erp->er_extcount * sizeof(xfs_bmbt_rec_t)); 3094 } 3095 /* 3096 * Final choice, create a new extent page for 3097 * nex2 extents. 3098 */ 3099 else { 3100 erp_idx++; 3101 erp = xfs_iext_irec_new(ifp, erp_idx); 3102 } 3103 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); 3104 kmem_free(nex2_ep); 3105 erp->er_extcount += nex2; 3106 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); 3107 } 3108 } 3109 3110 /* 3111 * This is called when the amount of space required for incore file 3112 * extents needs to be decreased. 
The ext_diff parameter stores the 3113 * number of extents to be removed and the idx parameter contains 3114 * the extent index where the extents will be removed from. 3115 * 3116 * If the amount of space needed has decreased below the linear 3117 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous 3118 * extent array. Otherwise, use kmem_realloc() to adjust the 3119 * size to what is needed. 3120 */ 3121 void 3122 xfs_iext_remove( 3123 xfs_inode_t *ip, /* incore inode pointer */ 3124 xfs_extnum_t idx, /* index to begin removing exts */ 3125 int ext_diff, /* number of extents to remove */ 3126 int state) /* type of extent conversion */ 3127 { 3128 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 3129 xfs_extnum_t nextents; /* number of extents in file */ 3130 int new_size; /* size of extents after removal */ 3131 3132 trace_xfs_iext_remove(ip, idx, state, _RET_IP_); 3133 3134 ASSERT(ext_diff > 0); 3135 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3136 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); 3137 3138 if (new_size == 0) { 3139 xfs_iext_destroy(ifp); 3140 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3141 xfs_iext_remove_indirect(ifp, idx, ext_diff); 3142 } else if (ifp->if_real_bytes) { 3143 xfs_iext_remove_direct(ifp, idx, ext_diff); 3144 } else { 3145 xfs_iext_remove_inline(ifp, idx, ext_diff); 3146 } 3147 ifp->if_bytes = new_size; 3148 } 3149 3150 /* 3151 * This removes ext_diff extents from the inline buffer, beginning 3152 * at extent index idx. 3153 */ 3154 void 3155 xfs_iext_remove_inline( 3156 xfs_ifork_t *ifp, /* inode fork pointer */ 3157 xfs_extnum_t idx, /* index to begin removing exts */ 3158 int ext_diff) /* number of extents to remove */ 3159 { 3160 int nextents; /* number of extents in file */ 3161 3162 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3163 ASSERT(idx < XFS_INLINE_EXTS); 3164 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3165 ASSERT(((nextents - ext_diff) > 0) && 3166 (nextents - ext_diff) < XFS_INLINE_EXTS); 3167 3168 if (idx + ext_diff < nextents) { 3169 memmove(&ifp->if_u2.if_inline_ext[idx], 3170 &ifp->if_u2.if_inline_ext[idx + ext_diff], 3171 (nextents - (idx + ext_diff)) * 3172 sizeof(xfs_bmbt_rec_t)); 3173 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], 3174 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3175 } else { 3176 memset(&ifp->if_u2.if_inline_ext[idx], 0, 3177 ext_diff * sizeof(xfs_bmbt_rec_t)); 3178 } 3179 } 3180 3181 /* 3182 * This removes ext_diff extents from a linear (direct) extent list, 3183 * beginning at extent index idx. If the extents are being removed 3184 * from the end of the list (ie. truncate) then we just need to re- 3185 * allocate the list to remove the extra space. Otherwise, if the 3186 * extents are being removed from the middle of the existing extent 3187 * entries, then we first need to move the extent records beginning 3188 * at idx + ext_diff up in the list to overwrite the records being 3189 * removed, then remove the extra space via kmem_realloc. 
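 *
 * (Worked example, added for illustration: removing ext_diff = 2 extents
 * at idx = 3 from a 10-extent direct list moves records 5..9 down into
 * slots 3..7, zeroes the last two slots, and then trims the allocation
 * to 8 * sizeof(xfs_bmbt_rec_t) bytes.)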
3190 */ 3191 void 3192 xfs_iext_remove_direct( 3193 xfs_ifork_t *ifp, /* inode fork pointer */ 3194 xfs_extnum_t idx, /* index to begin removing exts */ 3195 int ext_diff) /* number of extents to remove */ 3196 { 3197 xfs_extnum_t nextents; /* number of extents in file */ 3198 int new_size; /* size of extents after removal */ 3199 3200 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3201 new_size = ifp->if_bytes - 3202 (ext_diff * sizeof(xfs_bmbt_rec_t)); 3203 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3204 3205 if (new_size == 0) { 3206 xfs_iext_destroy(ifp); 3207 return; 3208 } 3209 /* Move extents up in the list (if needed) */ 3210 if (idx + ext_diff < nextents) { 3211 memmove(&ifp->if_u1.if_extents[idx], 3212 &ifp->if_u1.if_extents[idx + ext_diff], 3213 (nextents - (idx + ext_diff)) * 3214 sizeof(xfs_bmbt_rec_t)); 3215 } 3216 memset(&ifp->if_u1.if_extents[nextents - ext_diff], 3217 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3218 /* 3219 * Reallocate the direct extent list. If the extents 3220 * will fit inside the inode then xfs_iext_realloc_direct 3221 * will switch from direct to inline extent allocation 3222 * mode for us. 3223 */ 3224 xfs_iext_realloc_direct(ifp, new_size); 3225 ifp->if_bytes = new_size; 3226 } 3227 3228 /* 3229 * This is called when incore extents are being removed from the 3230 * indirection array and the extents being removed span multiple extent 3231 * buffers. The idx parameter contains the file extent index where we 3232 * want to begin removing extents, and the count parameter contains 3233 * how many extents need to be removed. 3234 * 3235 * |-------| |-------| 3236 * | nex1 | | | nex1 - number of extents before idx 3237 * |-------| | count | 3238 * | | | | count - number of extents being removed at idx 3239 * | count | |-------| 3240 * | | | nex2 | nex2 - number of extents after idx + count 3241 * |-------| |-------| 3242 */ 3243 void 3244 xfs_iext_remove_indirect( 3245 xfs_ifork_t *ifp, /* inode fork pointer */ 3246 xfs_extnum_t idx, /* index to begin removing extents */ 3247 int count) /* number of extents to remove */ 3248 { 3249 xfs_ext_irec_t *erp; /* indirection array pointer */ 3250 int erp_idx = 0; /* indirection array index */ 3251 xfs_extnum_t ext_cnt; /* extents left to remove */ 3252 xfs_extnum_t ext_diff; /* extents to remove in current list */ 3253 xfs_extnum_t nex1; /* number of extents before idx */ 3254 xfs_extnum_t nex2; /* extents after idx + count */ 3255 int page_idx = idx; /* index in target extent list */ 3256 3257 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3258 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3259 ASSERT(erp != NULL); 3260 nex1 = page_idx; 3261 ext_cnt = count; 3262 while (ext_cnt) { 3263 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); 3264 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); 3265 /* 3266 * Check for deletion of entire list; 3267 * xfs_iext_irec_remove() updates extent offsets. 
3268 */ 3269 if (ext_diff == erp->er_extcount) { 3270 xfs_iext_irec_remove(ifp, erp_idx); 3271 ext_cnt -= ext_diff; 3272 nex1 = 0; 3273 if (ext_cnt) { 3274 ASSERT(erp_idx < ifp->if_real_bytes / 3275 XFS_IEXT_BUFSZ); 3276 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3277 nex1 = 0; 3278 continue; 3279 } else { 3280 break; 3281 } 3282 } 3283 /* Move extents up (if needed) */ 3284 if (nex2) { 3285 memmove(&erp->er_extbuf[nex1], 3286 &erp->er_extbuf[nex1 + ext_diff], 3287 nex2 * sizeof(xfs_bmbt_rec_t)); 3288 } 3289 /* Zero out rest of page */ 3290 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - 3291 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); 3292 /* Update remaining counters */ 3293 erp->er_extcount -= ext_diff; 3294 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); 3295 ext_cnt -= ext_diff; 3296 nex1 = 0; 3297 erp_idx++; 3298 erp++; 3299 } 3300 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); 3301 xfs_iext_irec_compact(ifp); 3302 } 3303 3304 /* 3305 * Create, destroy, or resize a linear (direct) block of extents. 3306 */ 3307 void 3308 xfs_iext_realloc_direct( 3309 xfs_ifork_t *ifp, /* inode fork pointer */ 3310 int new_size) /* new size of extents */ 3311 { 3312 int rnew_size; /* real new size of extents */ 3313 3314 rnew_size = new_size; 3315 3316 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || 3317 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && 3318 (new_size != ifp->if_real_bytes))); 3319 3320 /* Free extent records */ 3321 if (new_size == 0) { 3322 xfs_iext_destroy(ifp); 3323 } 3324 /* Resize direct extent list and zero any new bytes */ 3325 else if (ifp->if_real_bytes) { 3326 /* Check if extents will fit inside the inode */ 3327 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { 3328 xfs_iext_direct_to_inline(ifp, new_size / 3329 (uint)sizeof(xfs_bmbt_rec_t)); 3330 ifp->if_bytes = new_size; 3331 return; 3332 } 3333 if (!is_power_of_2(new_size)){ 3334 rnew_size = roundup_pow_of_two(new_size); 3335 } 3336 if (rnew_size != ifp->if_real_bytes) { 3337 ifp->if_u1.if_extents = 3338 kmem_realloc(ifp->if_u1.if_extents, 3339 rnew_size, 3340 ifp->if_real_bytes, KM_NOFS); 3341 } 3342 if (rnew_size > ifp->if_real_bytes) { 3343 memset(&ifp->if_u1.if_extents[ifp->if_bytes / 3344 (uint)sizeof(xfs_bmbt_rec_t)], 0, 3345 rnew_size - ifp->if_real_bytes); 3346 } 3347 } 3348 /* 3349 * Switch from the inline extent buffer to a direct 3350 * extent list. Be sure to include the inline extent 3351 * bytes in new_size. 3352 */ 3353 else { 3354 new_size += ifp->if_bytes; 3355 if (!is_power_of_2(new_size)) { 3356 rnew_size = roundup_pow_of_two(new_size); 3357 } 3358 xfs_iext_inline_to_direct(ifp, rnew_size); 3359 } 3360 ifp->if_real_bytes = rnew_size; 3361 ifp->if_bytes = new_size; 3362 } 3363 3364 /* 3365 * Switch from linear (direct) extent records to inline buffer. 3366 */ 3367 void 3368 xfs_iext_direct_to_inline( 3369 xfs_ifork_t *ifp, /* inode fork pointer */ 3370 xfs_extnum_t nextents) /* number of extents in file */ 3371 { 3372 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3373 ASSERT(nextents <= XFS_INLINE_EXTS); 3374 /* 3375 * The inline buffer was zeroed when we switched 3376 * from inline to direct extent allocation mode, 3377 * so we don't need to clear it here. 3378 */ 3379 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, 3380 nextents * sizeof(xfs_bmbt_rec_t)); 3381 kmem_free(ifp->if_u1.if_extents); 3382 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3383 ifp->if_real_bytes = 0; 3384 } 3385 3386 /* 3387 * Switch from inline buffer to linear (direct) extent records. 
3388 * new_size should already be rounded up to the next power of 2 3389 * by the caller (when appropriate), so use new_size as it is. 3390 * However, since new_size may be rounded up, we can't update 3391 * if_bytes here. It is the caller's responsibility to update 3392 * if_bytes upon return. 3393 */ 3394 void 3395 xfs_iext_inline_to_direct( 3396 xfs_ifork_t *ifp, /* inode fork pointer */ 3397 int new_size) /* number of extents in file */ 3398 { 3399 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); 3400 memset(ifp->if_u1.if_extents, 0, new_size); 3401 if (ifp->if_bytes) { 3402 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 3403 ifp->if_bytes); 3404 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3405 sizeof(xfs_bmbt_rec_t)); 3406 } 3407 ifp->if_real_bytes = new_size; 3408 } 3409 3410 /* 3411 * Resize an extent indirection array to new_size bytes. 3412 */ 3413 STATIC void 3414 xfs_iext_realloc_indirect( 3415 xfs_ifork_t *ifp, /* inode fork pointer */ 3416 int new_size) /* new indirection array size */ 3417 { 3418 int nlists; /* number of irec's (ex lists) */ 3419 int size; /* current indirection array size */ 3420 3421 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3422 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3423 size = nlists * sizeof(xfs_ext_irec_t); 3424 ASSERT(ifp->if_real_bytes); 3425 ASSERT((new_size >= 0) && (new_size != size)); 3426 if (new_size == 0) { 3427 xfs_iext_destroy(ifp); 3428 } else { 3429 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) 3430 kmem_realloc(ifp->if_u1.if_ext_irec, 3431 new_size, size, KM_NOFS); 3432 } 3433 } 3434 3435 /* 3436 * Switch from indirection array to linear (direct) extent allocations. 3437 */ 3438 STATIC void 3439 xfs_iext_indirect_to_direct( 3440 xfs_ifork_t *ifp) /* inode fork pointer */ 3441 { 3442 xfs_bmbt_rec_host_t *ep; /* extent record pointer */ 3443 xfs_extnum_t nextents; /* number of extents in file */ 3444 int size; /* size of file extents */ 3445 3446 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3447 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3448 ASSERT(nextents <= XFS_LINEAR_EXTS); 3449 size = nextents * sizeof(xfs_bmbt_rec_t); 3450 3451 xfs_iext_irec_compact_pages(ifp); 3452 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 3453 3454 ep = ifp->if_u1.if_ext_irec->er_extbuf; 3455 kmem_free(ifp->if_u1.if_ext_irec); 3456 ifp->if_flags &= ~XFS_IFEXTIREC; 3457 ifp->if_u1.if_extents = ep; 3458 ifp->if_bytes = size; 3459 if (nextents < XFS_LINEAR_EXTS) { 3460 xfs_iext_realloc_direct(ifp, size); 3461 } 3462 } 3463 3464 /* 3465 * Free incore file extents. 3466 */ 3467 void 3468 xfs_iext_destroy( 3469 xfs_ifork_t *ifp) /* inode fork pointer */ 3470 { 3471 if (ifp->if_flags & XFS_IFEXTIREC) { 3472 int erp_idx; 3473 int nlists; 3474 3475 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3476 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { 3477 xfs_iext_irec_remove(ifp, erp_idx); 3478 } 3479 ifp->if_flags &= ~XFS_IFEXTIREC; 3480 } else if (ifp->if_real_bytes) { 3481 kmem_free(ifp->if_u1.if_extents); 3482 } else if (ifp->if_bytes) { 3483 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3484 sizeof(xfs_bmbt_rec_t)); 3485 } 3486 ifp->if_u1.if_extents = NULL; 3487 ifp->if_real_bytes = 0; 3488 ifp->if_bytes = 0; 3489 } 3490 3491 /* 3492 * Return a pointer to the extent record for file system block bno. 
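 *
 * A hedged usage sketch (editorial, mirroring how the bmap code is
 * assumed to call this): look up the extent containing bno, or the next
 * one after it -
 *
 *	ep = xfs_iext_bno_to_ext(ifp, bno, &idx);
 *	if (ep == NULL)
 *		(bno lies beyond the last extent; nothing to return)
 *	else
 *		(ep and idx reference the extent containing or
 *		 following bno)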
3493 */ 3494 xfs_bmbt_rec_host_t * /* pointer to found extent record */ 3495 xfs_iext_bno_to_ext( 3496 xfs_ifork_t *ifp, /* inode fork pointer */ 3497 xfs_fileoff_t bno, /* block number to search for */ 3498 xfs_extnum_t *idxp) /* index of target extent */ 3499 { 3500 xfs_bmbt_rec_host_t *base; /* pointer to first extent */ 3501 xfs_filblks_t blockcount = 0; /* number of blocks in extent */ 3502 xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ 3503 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3504 int high; /* upper boundary in search */ 3505 xfs_extnum_t idx = 0; /* index of target extent */ 3506 int low; /* lower boundary in search */ 3507 xfs_extnum_t nextents; /* number of file extents */ 3508 xfs_fileoff_t startoff = 0; /* start offset of extent */ 3509 3510 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3511 if (nextents == 0) { 3512 *idxp = 0; 3513 return NULL; 3514 } 3515 low = 0; 3516 if (ifp->if_flags & XFS_IFEXTIREC) { 3517 /* Find target extent list */ 3518 int erp_idx = 0; 3519 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); 3520 base = erp->er_extbuf; 3521 high = erp->er_extcount - 1; 3522 } else { 3523 base = ifp->if_u1.if_extents; 3524 high = nextents - 1; 3525 } 3526 /* Binary search extent records */ 3527 while (low <= high) { 3528 idx = (low + high) >> 1; 3529 ep = base + idx; 3530 startoff = xfs_bmbt_get_startoff(ep); 3531 blockcount = xfs_bmbt_get_blockcount(ep); 3532 if (bno < startoff) { 3533 high = idx - 1; 3534 } else if (bno >= startoff + blockcount) { 3535 low = idx + 1; 3536 } else { 3537 /* Convert back to file-based extent index */ 3538 if (ifp->if_flags & XFS_IFEXTIREC) { 3539 idx += erp->er_extoff; 3540 } 3541 *idxp = idx; 3542 return ep; 3543 } 3544 } 3545 /* Convert back to file-based extent index */ 3546 if (ifp->if_flags & XFS_IFEXTIREC) { 3547 idx += erp->er_extoff; 3548 } 3549 if (bno >= startoff + blockcount) { 3550 if (++idx == nextents) { 3551 ep = NULL; 3552 } else { 3553 ep = xfs_iext_get_ext(ifp, idx); 3554 } 3555 } 3556 *idxp = idx; 3557 return ep; 3558 } 3559 3560 /* 3561 * Return a pointer to the indirection array entry containing the 3562 * extent record for filesystem block bno. Store the index of the 3563 * target irec in *erp_idxp. 3564 */ 3565 xfs_ext_irec_t * /* pointer to found extent record */ 3566 xfs_iext_bno_to_irec( 3567 xfs_ifork_t *ifp, /* inode fork pointer */ 3568 xfs_fileoff_t bno, /* block number to search for */ 3569 int *erp_idxp) /* irec index of target ext list */ 3570 { 3571 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3572 xfs_ext_irec_t *erp_next; /* next indirection array entry */ 3573 int erp_idx; /* indirection array index */ 3574 int nlists; /* number of extent irec's (lists) */ 3575 int high; /* binary search upper limit */ 3576 int low; /* binary search lower limit */ 3577 3578 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3579 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3580 erp_idx = 0; 3581 low = 0; 3582 high = nlists - 1; 3583 while (low <= high) { 3584 erp_idx = (low + high) >> 1; 3585 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3586 erp_next = erp_idx < nlists - 1 ? 
erp + 1 : NULL; 3587 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { 3588 high = erp_idx - 1; 3589 } else if (erp_next && bno >= 3590 xfs_bmbt_get_startoff(erp_next->er_extbuf)) { 3591 low = erp_idx + 1; 3592 } else { 3593 break; 3594 } 3595 } 3596 *erp_idxp = erp_idx; 3597 return erp; 3598 } 3599 3600 /* 3601 * Return a pointer to the indirection array entry containing the 3602 * extent record at file extent index *idxp. Store the index of the 3603 * target irec in *erp_idxp and store the page index of the target 3604 * extent record in *idxp. 3605 */ 3606 xfs_ext_irec_t * 3607 xfs_iext_idx_to_irec( 3608 xfs_ifork_t *ifp, /* inode fork pointer */ 3609 xfs_extnum_t *idxp, /* extent index (file -> page) */ 3610 int *erp_idxp, /* pointer to target irec */ 3611 int realloc) /* new bytes were just added */ 3612 { 3613 xfs_ext_irec_t *prev; /* pointer to previous irec */ 3614 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ 3615 int erp_idx; /* indirection array index */ 3616 int nlists; /* number of irec's (ex lists) */ 3617 int high; /* binary search upper limit */ 3618 int low; /* binary search lower limit */ 3619 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 3620 3621 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3622 ASSERT(page_idx >= 0); 3623 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); 3624 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); 3625 3626 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3627 erp_idx = 0; 3628 low = 0; 3629 high = nlists - 1; 3630 3631 /* Binary search extent irec's */ 3632 while (low <= high) { 3633 erp_idx = (low + high) >> 1; 3634 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3635 prev = erp_idx > 0 ? erp - 1 : NULL; 3636 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && 3637 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { 3638 high = erp_idx - 1; 3639 } else if (page_idx > erp->er_extoff + erp->er_extcount || 3640 (page_idx == erp->er_extoff + erp->er_extcount && 3641 !realloc)) { 3642 low = erp_idx + 1; 3643 } else if (page_idx == erp->er_extoff + erp->er_extcount && 3644 erp->er_extcount == XFS_LINEAR_EXTS) { 3645 ASSERT(realloc); 3646 page_idx = 0; 3647 erp_idx++; 3648 erp = erp_idx < nlists ? erp + 1 : NULL; 3649 break; 3650 } else { 3651 page_idx -= erp->er_extoff; 3652 break; 3653 } 3654 } 3655 *idxp = page_idx; 3656 *erp_idxp = erp_idx; 3657 return(erp); 3658 } 3659 3660 /* 3661 * Allocate and initialize an indirection array once the space needed 3662 * for incore extents increases above XFS_IEXT_BUFSZ. 
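 *
 * (Editorial layout sketch: immediately after initialization there is a
 * single irec,
 *	if_u1.if_ext_irec[0] = { .er_extbuf   = former direct list,
 *				 .er_extoff   = 0,
 *				 .er_extcount = nextents }
 * with er_extbuf grown to exactly XFS_IEXT_BUFSZ bytes; later growth
 * adds further XFS_IEXT_BUFSZ pages through xfs_iext_irec_new().)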
 */
void
xfs_iext_irec_init(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	xfs_extnum_t	nextents;	/* number of extents in file */

	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	ASSERT(nextents <= XFS_LINEAR_EXTS);

	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);

	if (nextents == 0) {
		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
	} else if (!ifp->if_real_bytes) {
		xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
		xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
	}
	erp->er_extbuf = ifp->if_u1.if_extents;
	erp->er_extcount = nextents;
	erp->er_extoff = 0;

	ifp->if_flags |= XFS_IFEXTIREC;
	ifp->if_real_bytes = XFS_IEXT_BUFSZ;
	ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
	ifp->if_u1.if_ext_irec = erp;
}

/*
 * Allocate and initialize a new entry in the indirection array.
 */
xfs_ext_irec_t *
xfs_iext_irec_new(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx)	/* index for new irec */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;

	/* Resize indirection array */
	xfs_iext_realloc_indirect(ifp, ++nlists *
				  sizeof(xfs_ext_irec_t));
	/*
	 * Move records down in the array so the
	 * new page can use erp_idx.
	 */
	erp = ifp->if_u1.if_ext_irec;
	for (i = nlists - 1; i > erp_idx; i--) {
		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
	}
	ASSERT(i == erp_idx);

	/* Initialize new extent record */
	erp = ifp->if_u1.if_ext_irec;
	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
	erp[erp_idx].er_extcount = 0;
	erp[erp_idx].er_extoff = erp_idx > 0 ?
		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
	return &erp[erp_idx];
}
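/*
 * Debug sketch (illustrative only; compiled out and not part of the
 * original file): the indirection array maintains the invariant that
 * each list's er_extoff equals the running sum of er_extcount over all
 * earlier lists, which xfs_iext_irec_new() preserves when it slots in
 * a new, empty list.  This checker makes that invariant explicit.  The
 * function name and guard macro are assumptions made for this example.
 */
#ifdef XFS_IEXT_DOC_EXAMPLES
STATIC void
xfs_iext_example_check_extoffs(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_extnum_t	extoff = 0;	/* running extent count */
	int		nlists;		/* number of extent lists */
	int		i;		/* loop counter */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	for (i = 0; i < nlists; i++) {
		ASSERT(ifp->if_u1.if_ext_irec[i].er_extoff == extoff);
		extoff += ifp->if_u1.if_ext_irec[i].er_extcount;
	}
}
#endif /* XFS_IEXT_DOC_EXAMPLES */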
 */
void
xfs_iext_irec_remove(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx)	/* irec index to remove */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp = &ifp->if_u1.if_ext_irec[erp_idx];
	if (erp->er_extbuf) {
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
					     -erp->er_extcount);
		kmem_free(erp->er_extbuf);
	}
	/* Compact extent records */
	erp = ifp->if_u1.if_ext_irec;
	for (i = erp_idx; i < nlists - 1; i++) {
		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
	}
	/*
	 * Manually free the last extent record from the indirection
	 * array. A call to xfs_iext_realloc_indirect() with a size
	 * of zero would result in a call to xfs_iext_destroy() which
	 * would in turn call this function again, creating a nasty
	 * infinite loop.
	 */
	if (--nlists) {
		xfs_iext_realloc_indirect(ifp,
			nlists * sizeof(xfs_ext_irec_t));
	} else {
		kmem_free(ifp->if_u1.if_ext_irec);
	}
	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
}

/*
 * This is called to clean up large amounts of unused memory allocated
 * by the indirection array. Before compacting anything though, verify
 * that the indirection array is still needed and switch back to the
 * linear extent list (or even the inline buffer) if possible. The
 * compaction policy is as follows:
 *
 * Full Compaction: Extents fit into a single page (or inline buffer)
 * Partial Compaction: Extents occupy less than 50% of allocated space
 * No Compaction: Extents occupy at least 50% of allocated space
 */
void
xfs_iext_irec_compact(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_extnum_t	nextents;	/* number of extents in file */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);

	if (nextents == 0) {
		xfs_iext_destroy(ifp);
	} else if (nextents <= XFS_INLINE_EXTS) {
		xfs_iext_indirect_to_direct(ifp);
		xfs_iext_direct_to_inline(ifp, nextents);
	} else if (nextents <= XFS_LINEAR_EXTS) {
		xfs_iext_indirect_to_direct(ifp);
	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
		xfs_iext_irec_compact_pages(ifp);
	}
}

/*
 * Combine extents from neighboring extent pages.
 */
void
xfs_iext_irec_compact_pages(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
	int		erp_idx = 0;	/* indirection array index */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	while (erp_idx < nlists - 1) {
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		erp_next = erp + 1;
		if (erp_next->er_extcount <=
		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
			memcpy(&erp->er_extbuf[erp->er_extcount],
				erp_next->er_extbuf, erp_next->er_extcount *
				sizeof(xfs_bmbt_rec_t));
			erp->er_extcount += erp_next->er_extcount;
			/*
			 * Free page before removing extent record
			 * so er_extoffs don't get modified in
			 * xfs_iext_irec_remove.
			 */
			kmem_free(erp_next->er_extbuf);
			erp_next->er_extbuf = NULL;
			xfs_iext_irec_remove(ifp, erp_idx + 1);
			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
		} else {
			erp_idx++;
		}
	}
}

/*
 * This is called to update the er_extoff field in the indirection
 * array when extents have been added or removed from one of the
 * extent lists. erp_idx contains the irec index to begin updating
 * at and ext_diff contains the number of extents that were added
 * or removed.
 */
void
xfs_iext_irec_update_extoffs(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx,	/* irec index to update */
	int		ext_diff)	/* number of new extents */
{
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	for (i = erp_idx; i < nlists; i++) {
		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
	}
}
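/*
 * Sketch (illustrative only; compiled out and not part of the original
 * file): the partial-compaction threshold from xfs_iext_irec_compact()
 * expressed as a predicate.  Page merging is only worthwhile once the
 * extents occupy less than half the space allocated to the extent
 * lists; the full-compaction cases (extents fit inline or in a single
 * page) are handled separately above.  The function name and guard
 * macro are assumptions made for this example.
 */
#ifdef XFS_IEXT_DOC_EXAMPLES
STATIC int
xfs_iext_example_should_merge_pages(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_extnum_t	nextents;	/* extents in the fork */
	int		nlists;		/* allocated extent lists */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	/* under 50% occupancy => merge neighboring pages */
	return nextents > XFS_LINEAR_EXTS &&
	       nextents < (nlists * XFS_LINEAR_EXTS) >> 1;
}
#endif /* XFS_IEXT_DOC_EXAMPLES */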