1 /* 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include <linux/log2.h> 19 20 #include "xfs.h" 21 #include "xfs_fs.h" 22 #include "xfs_types.h" 23 #include "xfs_bit.h" 24 #include "xfs_log.h" 25 #include "xfs_inum.h" 26 #include "xfs_trans.h" 27 #include "xfs_trans_priv.h" 28 #include "xfs_sb.h" 29 #include "xfs_ag.h" 30 #include "xfs_mount.h" 31 #include "xfs_bmap_btree.h" 32 #include "xfs_alloc_btree.h" 33 #include "xfs_ialloc_btree.h" 34 #include "xfs_attr_sf.h" 35 #include "xfs_dinode.h" 36 #include "xfs_inode.h" 37 #include "xfs_buf_item.h" 38 #include "xfs_inode_item.h" 39 #include "xfs_btree.h" 40 #include "xfs_btree_trace.h" 41 #include "xfs_alloc.h" 42 #include "xfs_ialloc.h" 43 #include "xfs_bmap.h" 44 #include "xfs_error.h" 45 #include "xfs_utils.h" 46 #include "xfs_quota.h" 47 #include "xfs_filestream.h" 48 #include "xfs_vnodeops.h" 49 #include "xfs_trace.h" 50 51 kmem_zone_t *xfs_ifork_zone; 52 kmem_zone_t *xfs_inode_zone; 53 54 /* 55 * Used in xfs_itruncate(). This is the maximum number of extents 56 * freed from a file in a single transaction. 57 */ 58 #define XFS_ITRUNC_MAX_EXTENTS 2 59 60 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 61 STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); 62 STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); 63 STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); 64 65 #ifdef DEBUG 66 /* 67 * Make sure that the extents in the given memory buffer 68 * are valid. 69 */ 70 STATIC void 71 xfs_validate_extents( 72 xfs_ifork_t *ifp, 73 int nrecs, 74 xfs_exntfmt_t fmt) 75 { 76 xfs_bmbt_irec_t irec; 77 xfs_bmbt_rec_host_t rec; 78 int i; 79 80 for (i = 0; i < nrecs; i++) { 81 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 82 rec.l0 = get_unaligned(&ep->l0); 83 rec.l1 = get_unaligned(&ep->l1); 84 xfs_bmbt_get_all(&rec, &irec); 85 if (fmt == XFS_EXTFMT_NOSTATE) 86 ASSERT(irec.br_state == XFS_EXT_NORM); 87 } 88 } 89 #else /* DEBUG */ 90 #define xfs_validate_extents(ifp, nrecs, fmt) 91 #endif /* DEBUG */ 92 93 /* 94 * Check that none of the inode's in the buffer have a next 95 * unlinked field of 0. 96 */ 97 #if defined(DEBUG) 98 void 99 xfs_inobp_check( 100 xfs_mount_t *mp, 101 xfs_buf_t *bp) 102 { 103 int i; 104 int j; 105 xfs_dinode_t *dip; 106 107 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; 108 109 for (i = 0; i < j; i++) { 110 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 111 i * mp->m_sb.sb_inodesize); 112 if (!dip->di_next_unlinked) { 113 xfs_alert(mp, 114 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.", 115 bp); 116 ASSERT(dip->di_next_unlinked); 117 } 118 } 119 } 120 #endif 121 122 /* 123 * Find the buffer associated with the given inode map 124 * We do basic validation checks on the buffer once it has been 125 * retrieved from disk. 
126 */ 127 STATIC int 128 xfs_imap_to_bp( 129 xfs_mount_t *mp, 130 xfs_trans_t *tp, 131 struct xfs_imap *imap, 132 xfs_buf_t **bpp, 133 uint buf_flags, 134 uint iget_flags) 135 { 136 int error; 137 int i; 138 int ni; 139 xfs_buf_t *bp; 140 141 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 142 (int)imap->im_len, buf_flags, &bp); 143 if (error) { 144 if (error != EAGAIN) { 145 xfs_warn(mp, 146 "%s: xfs_trans_read_buf() returned error %d.", 147 __func__, error); 148 } else { 149 ASSERT(buf_flags & XBF_TRYLOCK); 150 } 151 return error; 152 } 153 154 /* 155 * Validate the magic number and version of every inode in the buffer 156 * (if DEBUG kernel) or the first inode in the buffer, otherwise. 157 */ 158 #ifdef DEBUG 159 ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; 160 #else /* usual case */ 161 ni = 1; 162 #endif 163 164 for (i = 0; i < ni; i++) { 165 int di_ok; 166 xfs_dinode_t *dip; 167 168 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 169 (i << mp->m_sb.sb_inodelog)); 170 di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC && 171 XFS_DINODE_GOOD_VERSION(dip->di_version); 172 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 173 XFS_ERRTAG_ITOBP_INOTOBP, 174 XFS_RANDOM_ITOBP_INOTOBP))) { 175 if (iget_flags & XFS_IGET_UNTRUSTED) { 176 xfs_trans_brelse(tp, bp); 177 return XFS_ERROR(EINVAL); 178 } 179 XFS_CORRUPTION_ERROR("xfs_imap_to_bp", 180 XFS_ERRLEVEL_HIGH, mp, dip); 181 #ifdef DEBUG 182 xfs_emerg(mp, 183 "bad inode magic/vsn daddr %lld #%d (magic=%x)", 184 (unsigned long long)imap->im_blkno, i, 185 be16_to_cpu(dip->di_magic)); 186 ASSERT(0); 187 #endif 188 xfs_trans_brelse(tp, bp); 189 return XFS_ERROR(EFSCORRUPTED); 190 } 191 } 192 193 xfs_inobp_check(mp, bp); 194 195 /* 196 * Mark the buffer as an inode buffer now that it looks good 197 */ 198 XFS_BUF_SET_VTYPE(bp, B_FS_INO); 199 200 *bpp = bp; 201 return 0; 202 } 203 204 /* 205 * This routine is called to map an inode number within a file 206 * system to the buffer containing the on-disk version of the 207 * inode. It returns a pointer to the buffer containing the 208 * on-disk inode in the bpp parameter, and in the dip parameter 209 * it returns a pointer to the on-disk inode within that buffer. 210 * 211 * If a non-zero error is returned, then the contents of bpp and 212 * dipp are undefined. 213 * 214 * Use xfs_imap() to determine the size and location of the 215 * buffer to read from disk. 216 */ 217 int 218 xfs_inotobp( 219 xfs_mount_t *mp, 220 xfs_trans_t *tp, 221 xfs_ino_t ino, 222 xfs_dinode_t **dipp, 223 xfs_buf_t **bpp, 224 int *offset, 225 uint imap_flags) 226 { 227 struct xfs_imap imap; 228 xfs_buf_t *bp; 229 int error; 230 231 imap.im_blkno = 0; 232 error = xfs_imap(mp, tp, ino, &imap, imap_flags); 233 if (error) 234 return error; 235 236 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags); 237 if (error) 238 return error; 239 240 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 241 *bpp = bp; 242 *offset = imap.im_boffset; 243 return 0; 244 } 245 246 247 /* 248 * This routine is called to map an inode to the buffer containing 249 * the on-disk version of the inode. It returns a pointer to the 250 * buffer containing the on-disk inode in the bpp parameter, and in 251 * the dip parameter it returns a pointer to the on-disk inode within 252 * that buffer. 253 * 254 * If a non-zero error is returned, then the contents of bpp and 255 * dipp are undefined. 
256 * 257 * The inode is expected to already been mapped to its buffer and read 258 * in once, thus we can use the mapping information stored in the inode 259 * rather than calling xfs_imap(). This allows us to avoid the overhead 260 * of looking at the inode btree for small block file systems 261 * (see xfs_imap()). 262 */ 263 int 264 xfs_itobp( 265 xfs_mount_t *mp, 266 xfs_trans_t *tp, 267 xfs_inode_t *ip, 268 xfs_dinode_t **dipp, 269 xfs_buf_t **bpp, 270 uint buf_flags) 271 { 272 xfs_buf_t *bp; 273 int error; 274 275 ASSERT(ip->i_imap.im_blkno != 0); 276 277 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0); 278 if (error) 279 return error; 280 281 if (!bp) { 282 ASSERT(buf_flags & XBF_TRYLOCK); 283 ASSERT(tp == NULL); 284 *bpp = NULL; 285 return EAGAIN; 286 } 287 288 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 289 *bpp = bp; 290 return 0; 291 } 292 293 /* 294 * Move inode type and inode format specific information from the 295 * on-disk inode to the in-core inode. For fifos, devs, and sockets 296 * this means set if_rdev to the proper value. For files, directories, 297 * and symlinks this means to bring in the in-line data or extent 298 * pointers. For a file in B-tree format, only the root is immediately 299 * brought in-core. The rest will be in-lined in if_extents when it 300 * is first referenced (see xfs_iread_extents()). 301 */ 302 STATIC int 303 xfs_iformat( 304 xfs_inode_t *ip, 305 xfs_dinode_t *dip) 306 { 307 xfs_attr_shortform_t *atp; 308 int size; 309 int error; 310 xfs_fsize_t di_size; 311 ip->i_df.if_ext_max = 312 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 313 error = 0; 314 315 if (unlikely(be32_to_cpu(dip->di_nextents) + 316 be16_to_cpu(dip->di_anextents) > 317 be64_to_cpu(dip->di_nblocks))) { 318 xfs_warn(ip->i_mount, 319 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 320 (unsigned long long)ip->i_ino, 321 (int)(be32_to_cpu(dip->di_nextents) + 322 be16_to_cpu(dip->di_anextents)), 323 (unsigned long long) 324 be64_to_cpu(dip->di_nblocks)); 325 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, 326 ip->i_mount, dip); 327 return XFS_ERROR(EFSCORRUPTED); 328 } 329 330 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 331 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", 332 (unsigned long long)ip->i_ino, 333 dip->di_forkoff); 334 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 335 ip->i_mount, dip); 336 return XFS_ERROR(EFSCORRUPTED); 337 } 338 339 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && 340 !ip->i_mount->m_rtdev_targp)) { 341 xfs_warn(ip->i_mount, 342 "corrupt dinode %Lu, has realtime flag set.", 343 ip->i_ino); 344 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", 345 XFS_ERRLEVEL_LOW, ip->i_mount, dip); 346 return XFS_ERROR(EFSCORRUPTED); 347 } 348 349 switch (ip->i_d.di_mode & S_IFMT) { 350 case S_IFIFO: 351 case S_IFCHR: 352 case S_IFBLK: 353 case S_IFSOCK: 354 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { 355 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, 356 ip->i_mount, dip); 357 return XFS_ERROR(EFSCORRUPTED); 358 } 359 ip->i_d.di_size = 0; 360 ip->i_size = 0; 361 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); 362 break; 363 364 case S_IFREG: 365 case S_IFLNK: 366 case S_IFDIR: 367 switch (dip->di_format) { 368 case XFS_DINODE_FMT_LOCAL: 369 /* 370 * no local regular files yet 371 */ 372 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 373 xfs_warn(ip->i_mount, 374 "corrupt inode %Lu (local format for regular file).", 
375 (unsigned long long) ip->i_ino); 376 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 377 XFS_ERRLEVEL_LOW, 378 ip->i_mount, dip); 379 return XFS_ERROR(EFSCORRUPTED); 380 } 381 382 di_size = be64_to_cpu(dip->di_size); 383 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 384 xfs_warn(ip->i_mount, 385 "corrupt inode %Lu (bad size %Ld for local inode).", 386 (unsigned long long) ip->i_ino, 387 (long long) di_size); 388 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 389 XFS_ERRLEVEL_LOW, 390 ip->i_mount, dip); 391 return XFS_ERROR(EFSCORRUPTED); 392 } 393 394 size = (int)di_size; 395 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); 396 break; 397 case XFS_DINODE_FMT_EXTENTS: 398 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); 399 break; 400 case XFS_DINODE_FMT_BTREE: 401 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); 402 break; 403 default: 404 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, 405 ip->i_mount); 406 return XFS_ERROR(EFSCORRUPTED); 407 } 408 break; 409 410 default: 411 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); 412 return XFS_ERROR(EFSCORRUPTED); 413 } 414 if (error) { 415 return error; 416 } 417 if (!XFS_DFORK_Q(dip)) 418 return 0; 419 ASSERT(ip->i_afp == NULL); 420 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 421 ip->i_afp->if_ext_max = 422 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 423 switch (dip->di_aformat) { 424 case XFS_DINODE_FMT_LOCAL: 425 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 426 size = be16_to_cpu(atp->hdr.totsize); 427 428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { 429 xfs_warn(ip->i_mount, 430 "corrupt inode %Lu (bad attr fork size %Ld).", 431 (unsigned long long) ip->i_ino, 432 (long long) size); 433 XFS_CORRUPTION_ERROR("xfs_iformat(8)", 434 XFS_ERRLEVEL_LOW, 435 ip->i_mount, dip); 436 return XFS_ERROR(EFSCORRUPTED); 437 } 438 439 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); 440 break; 441 case XFS_DINODE_FMT_EXTENTS: 442 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); 443 break; 444 case XFS_DINODE_FMT_BTREE: 445 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); 446 break; 447 default: 448 error = XFS_ERROR(EFSCORRUPTED); 449 break; 450 } 451 if (error) { 452 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 453 ip->i_afp = NULL; 454 xfs_idestroy_fork(ip, XFS_DATA_FORK); 455 } 456 return error; 457 } 458 459 /* 460 * The file is in-lined in the on-disk inode. 461 * If it fits into if_inline_data, then copy 462 * it there, otherwise allocate a buffer for it 463 * and copy the data there. Either way, set 464 * if_data to point at the data. 465 * If we allocate a buffer for the data, make 466 * sure that its size is a multiple of 4 and 467 * record the real size in i_real_bytes. 468 */ 469 STATIC int 470 xfs_iformat_local( 471 xfs_inode_t *ip, 472 xfs_dinode_t *dip, 473 int whichfork, 474 int size) 475 { 476 xfs_ifork_t *ifp; 477 int real_size; 478 479 /* 480 * If the size is unreasonable, then something 481 * is wrong and we just bail out rather than crash in 482 * kmem_alloc() or memcpy() below. 
483 */ 484 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 485 xfs_warn(ip->i_mount, 486 "corrupt inode %Lu (bad size %d for local fork, size = %d).", 487 (unsigned long long) ip->i_ino, size, 488 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 489 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 490 ip->i_mount, dip); 491 return XFS_ERROR(EFSCORRUPTED); 492 } 493 ifp = XFS_IFORK_PTR(ip, whichfork); 494 real_size = 0; 495 if (size == 0) 496 ifp->if_u1.if_data = NULL; 497 else if (size <= sizeof(ifp->if_u2.if_inline_data)) 498 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 499 else { 500 real_size = roundup(size, 4); 501 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); 502 } 503 ifp->if_bytes = size; 504 ifp->if_real_bytes = real_size; 505 if (size) 506 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); 507 ifp->if_flags &= ~XFS_IFEXTENTS; 508 ifp->if_flags |= XFS_IFINLINE; 509 return 0; 510 } 511 512 /* 513 * The file consists of a set of extents all 514 * of which fit into the on-disk inode. 515 * If there are few enough extents to fit into 516 * the if_inline_ext, then copy them there. 517 * Otherwise allocate a buffer for them and copy 518 * them into it. Either way, set if_extents 519 * to point at the extents. 520 */ 521 STATIC int 522 xfs_iformat_extents( 523 xfs_inode_t *ip, 524 xfs_dinode_t *dip, 525 int whichfork) 526 { 527 xfs_bmbt_rec_t *dp; 528 xfs_ifork_t *ifp; 529 int nex; 530 int size; 531 int i; 532 533 ifp = XFS_IFORK_PTR(ip, whichfork); 534 nex = XFS_DFORK_NEXTENTS(dip, whichfork); 535 size = nex * (uint)sizeof(xfs_bmbt_rec_t); 536 537 /* 538 * If the number of extents is unreasonable, then something 539 * is wrong and we just bail out rather than crash in 540 * kmem_alloc() or memcpy() below. 541 */ 542 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 543 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", 544 (unsigned long long) ip->i_ino, nex); 545 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 546 ip->i_mount, dip); 547 return XFS_ERROR(EFSCORRUPTED); 548 } 549 550 ifp->if_real_bytes = 0; 551 if (nex == 0) 552 ifp->if_u1.if_extents = NULL; 553 else if (nex <= XFS_INLINE_EXTS) 554 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 555 else 556 xfs_iext_add(ifp, 0, nex); 557 558 ifp->if_bytes = size; 559 if (size) { 560 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); 561 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); 562 for (i = 0; i < nex; i++, dp++) { 563 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 564 ep->l0 = get_unaligned_be64(&dp->l0); 565 ep->l1 = get_unaligned_be64(&dp->l1); 566 } 567 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); 568 if (whichfork != XFS_DATA_FORK || 569 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) 570 if (unlikely(xfs_check_nostate_extents( 571 ifp, 0, nex))) { 572 XFS_ERROR_REPORT("xfs_iformat_extents(2)", 573 XFS_ERRLEVEL_LOW, 574 ip->i_mount); 575 return XFS_ERROR(EFSCORRUPTED); 576 } 577 } 578 ifp->if_flags |= XFS_IFEXTENTS; 579 return 0; 580 } 581 582 /* 583 * The file has too many extents to fit into 584 * the inode, so they are in B-tree format. 585 * Allocate a buffer for the root of the B-tree 586 * and copy the root into it. The i_extents 587 * field will remain NULL until all of the 588 * extents are read in (when they are needed). 
589 */ 590 STATIC int 591 xfs_iformat_btree( 592 xfs_inode_t *ip, 593 xfs_dinode_t *dip, 594 int whichfork) 595 { 596 xfs_bmdr_block_t *dfp; 597 xfs_ifork_t *ifp; 598 /* REFERENCED */ 599 int nrecs; 600 int size; 601 602 ifp = XFS_IFORK_PTR(ip, whichfork); 603 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); 604 size = XFS_BMAP_BROOT_SPACE(dfp); 605 nrecs = be16_to_cpu(dfp->bb_numrecs); 606 607 /* 608 * blow out if -- fork has less extents than can fit in 609 * fork (fork shouldn't be a btree format), root btree 610 * block has more records than can fit into the fork, 611 * or the number of extents is greater than the number of 612 * blocks. 613 */ 614 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max 615 || XFS_BMDR_SPACE_CALC(nrecs) > 616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 617 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 618 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", 619 (unsigned long long) ip->i_ino); 620 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 621 ip->i_mount, dip); 622 return XFS_ERROR(EFSCORRUPTED); 623 } 624 625 ifp->if_broot_bytes = size; 626 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); 627 ASSERT(ifp->if_broot != NULL); 628 /* 629 * Copy and convert from the on-disk structure 630 * to the in-memory structure. 631 */ 632 xfs_bmdr_to_bmbt(ip->i_mount, dfp, 633 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), 634 ifp->if_broot, size); 635 ifp->if_flags &= ~XFS_IFEXTENTS; 636 ifp->if_flags |= XFS_IFBROOT; 637 638 return 0; 639 } 640 641 STATIC void 642 xfs_dinode_from_disk( 643 xfs_icdinode_t *to, 644 xfs_dinode_t *from) 645 { 646 to->di_magic = be16_to_cpu(from->di_magic); 647 to->di_mode = be16_to_cpu(from->di_mode); 648 to->di_version = from ->di_version; 649 to->di_format = from->di_format; 650 to->di_onlink = be16_to_cpu(from->di_onlink); 651 to->di_uid = be32_to_cpu(from->di_uid); 652 to->di_gid = be32_to_cpu(from->di_gid); 653 to->di_nlink = be32_to_cpu(from->di_nlink); 654 to->di_projid_lo = be16_to_cpu(from->di_projid_lo); 655 to->di_projid_hi = be16_to_cpu(from->di_projid_hi); 656 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 657 to->di_flushiter = be16_to_cpu(from->di_flushiter); 658 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); 659 to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); 660 to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); 661 to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); 662 to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); 663 to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); 664 to->di_size = be64_to_cpu(from->di_size); 665 to->di_nblocks = be64_to_cpu(from->di_nblocks); 666 to->di_extsize = be32_to_cpu(from->di_extsize); 667 to->di_nextents = be32_to_cpu(from->di_nextents); 668 to->di_anextents = be16_to_cpu(from->di_anextents); 669 to->di_forkoff = from->di_forkoff; 670 to->di_aformat = from->di_aformat; 671 to->di_dmevmask = be32_to_cpu(from->di_dmevmask); 672 to->di_dmstate = be16_to_cpu(from->di_dmstate); 673 to->di_flags = be16_to_cpu(from->di_flags); 674 to->di_gen = be32_to_cpu(from->di_gen); 675 } 676 677 void 678 xfs_dinode_to_disk( 679 xfs_dinode_t *to, 680 xfs_icdinode_t *from) 681 { 682 to->di_magic = cpu_to_be16(from->di_magic); 683 to->di_mode = cpu_to_be16(from->di_mode); 684 to->di_version = from ->di_version; 685 to->di_format = from->di_format; 686 to->di_onlink = cpu_to_be16(from->di_onlink); 687 to->di_uid = cpu_to_be32(from->di_uid); 688 to->di_gid = cpu_to_be32(from->di_gid); 689 to->di_nlink = 
cpu_to_be32(from->di_nlink); 690 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 691 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 692 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 693 to->di_flushiter = cpu_to_be16(from->di_flushiter); 694 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 695 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); 696 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); 697 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); 698 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); 699 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); 700 to->di_size = cpu_to_be64(from->di_size); 701 to->di_nblocks = cpu_to_be64(from->di_nblocks); 702 to->di_extsize = cpu_to_be32(from->di_extsize); 703 to->di_nextents = cpu_to_be32(from->di_nextents); 704 to->di_anextents = cpu_to_be16(from->di_anextents); 705 to->di_forkoff = from->di_forkoff; 706 to->di_aformat = from->di_aformat; 707 to->di_dmevmask = cpu_to_be32(from->di_dmevmask); 708 to->di_dmstate = cpu_to_be16(from->di_dmstate); 709 to->di_flags = cpu_to_be16(from->di_flags); 710 to->di_gen = cpu_to_be32(from->di_gen); 711 } 712 713 STATIC uint 714 _xfs_dic2xflags( 715 __uint16_t di_flags) 716 { 717 uint flags = 0; 718 719 if (di_flags & XFS_DIFLAG_ANY) { 720 if (di_flags & XFS_DIFLAG_REALTIME) 721 flags |= XFS_XFLAG_REALTIME; 722 if (di_flags & XFS_DIFLAG_PREALLOC) 723 flags |= XFS_XFLAG_PREALLOC; 724 if (di_flags & XFS_DIFLAG_IMMUTABLE) 725 flags |= XFS_XFLAG_IMMUTABLE; 726 if (di_flags & XFS_DIFLAG_APPEND) 727 flags |= XFS_XFLAG_APPEND; 728 if (di_flags & XFS_DIFLAG_SYNC) 729 flags |= XFS_XFLAG_SYNC; 730 if (di_flags & XFS_DIFLAG_NOATIME) 731 flags |= XFS_XFLAG_NOATIME; 732 if (di_flags & XFS_DIFLAG_NODUMP) 733 flags |= XFS_XFLAG_NODUMP; 734 if (di_flags & XFS_DIFLAG_RTINHERIT) 735 flags |= XFS_XFLAG_RTINHERIT; 736 if (di_flags & XFS_DIFLAG_PROJINHERIT) 737 flags |= XFS_XFLAG_PROJINHERIT; 738 if (di_flags & XFS_DIFLAG_NOSYMLINKS) 739 flags |= XFS_XFLAG_NOSYMLINKS; 740 if (di_flags & XFS_DIFLAG_EXTSIZE) 741 flags |= XFS_XFLAG_EXTSIZE; 742 if (di_flags & XFS_DIFLAG_EXTSZINHERIT) 743 flags |= XFS_XFLAG_EXTSZINHERIT; 744 if (di_flags & XFS_DIFLAG_NODEFRAG) 745 flags |= XFS_XFLAG_NODEFRAG; 746 if (di_flags & XFS_DIFLAG_FILESTREAM) 747 flags |= XFS_XFLAG_FILESTREAM; 748 } 749 750 return flags; 751 } 752 753 uint 754 xfs_ip2xflags( 755 xfs_inode_t *ip) 756 { 757 xfs_icdinode_t *dic = &ip->i_d; 758 759 return _xfs_dic2xflags(dic->di_flags) | 760 (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); 761 } 762 763 uint 764 xfs_dic2xflags( 765 xfs_dinode_t *dip) 766 { 767 return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | 768 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); 769 } 770 771 /* 772 * Read the disk inode attributes into the in-core inode structure. 773 */ 774 int 775 xfs_iread( 776 xfs_mount_t *mp, 777 xfs_trans_t *tp, 778 xfs_inode_t *ip, 779 uint iget_flags) 780 { 781 xfs_buf_t *bp; 782 xfs_dinode_t *dip; 783 int error; 784 785 /* 786 * Fill in the location information in the in-core inode. 787 */ 788 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); 789 if (error) 790 return error; 791 792 /* 793 * Get pointers to the on-disk inode and the buffer containing it. 794 */ 795 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 796 XBF_LOCK, iget_flags); 797 if (error) 798 return error; 799 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 800 801 /* 802 * If we got something that isn't an inode it means someone 803 * (nfs or dmi) has a stale handle. 
804 */ 805 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { 806 #ifdef DEBUG 807 xfs_alert(mp, 808 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)", 809 __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC); 810 #endif /* DEBUG */ 811 error = XFS_ERROR(EINVAL); 812 goto out_brelse; 813 } 814 815 /* 816 * If the on-disk inode is already linked to a directory 817 * entry, copy all of the inode into the in-core inode. 818 * xfs_iformat() handles copying in the inode format 819 * specific information. 820 * Otherwise, just get the truly permanent information. 821 */ 822 if (dip->di_mode) { 823 xfs_dinode_from_disk(&ip->i_d, dip); 824 error = xfs_iformat(ip, dip); 825 if (error) { 826 #ifdef DEBUG 827 xfs_alert(mp, "%s: xfs_iformat() returned error %d", 828 __func__, error); 829 #endif /* DEBUG */ 830 goto out_brelse; 831 } 832 } else { 833 ip->i_d.di_magic = be16_to_cpu(dip->di_magic); 834 ip->i_d.di_version = dip->di_version; 835 ip->i_d.di_gen = be32_to_cpu(dip->di_gen); 836 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); 837 /* 838 * Make sure to pull in the mode here as well in 839 * case the inode is released without being used. 840 * This ensures that xfs_inactive() will see that 841 * the inode is already free and not try to mess 842 * with the uninitialized part of it. 843 */ 844 ip->i_d.di_mode = 0; 845 /* 846 * Initialize the per-fork minima and maxima for a new 847 * inode here. xfs_iformat will do it for old inodes. 848 */ 849 ip->i_df.if_ext_max = 850 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 851 } 852 853 /* 854 * The inode format changed when we moved the link count and 855 * made it 32 bits long. If this is an old format inode, 856 * convert it in memory to look like a new one. If it gets 857 * flushed to disk we will convert back before flushing or 858 * logging it. We zero out the new projid field and the old link 859 * count field. We'll handle clearing the pad field (the remains 860 * of the old uuid field) when we actually convert the inode to 861 * the new format. We don't change the version number so that we 862 * can distinguish this from a real new format inode. 863 */ 864 if (ip->i_d.di_version == 1) { 865 ip->i_d.di_nlink = ip->i_d.di_onlink; 866 ip->i_d.di_onlink = 0; 867 xfs_set_projid(ip, 0); 868 } 869 870 ip->i_delayed_blks = 0; 871 ip->i_size = ip->i_d.di_size; 872 873 /* 874 * Mark the buffer containing the inode as something to keep 875 * around for a while. This helps to keep recently accessed 876 * meta-data in-core longer. 877 */ 878 xfs_buf_set_ref(bp, XFS_INO_REF); 879 880 /* 881 * Use xfs_trans_brelse() to release the buffer containing the 882 * on-disk inode, because it was acquired with xfs_trans_read_buf() 883 * in xfs_itobp() above. If tp is NULL, this is just a normal 884 * brelse(). If we're within a transaction, then xfs_trans_brelse() 885 * will only release the buffer if it is not dirty within the 886 * transaction. It will be OK to release the buffer in this case, 887 * because inodes on disk are never destroyed and we will be 888 * locking the new in-core inode before putting it in the hash 889 * table where other processes can find it. Thus we don't have 890 * to worry about the inode being changed just because we released 891 * the buffer. 892 */ 893 out_brelse: 894 xfs_trans_brelse(tp, bp); 895 return error; 896 } 897 898 /* 899 * Read in extents from a btree-format inode. 900 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. 
901 */ 902 int 903 xfs_iread_extents( 904 xfs_trans_t *tp, 905 xfs_inode_t *ip, 906 int whichfork) 907 { 908 int error; 909 xfs_ifork_t *ifp; 910 xfs_extnum_t nextents; 911 912 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 913 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 914 ip->i_mount); 915 return XFS_ERROR(EFSCORRUPTED); 916 } 917 nextents = XFS_IFORK_NEXTENTS(ip, whichfork); 918 ifp = XFS_IFORK_PTR(ip, whichfork); 919 920 /* 921 * We know that the size is valid (it's checked in iformat_btree) 922 */ 923 ifp->if_lastex = NULLEXTNUM; 924 ifp->if_bytes = ifp->if_real_bytes = 0; 925 ifp->if_flags |= XFS_IFEXTENTS; 926 xfs_iext_add(ifp, 0, nextents); 927 error = xfs_bmap_read_extents(tp, ip, whichfork); 928 if (error) { 929 xfs_iext_destroy(ifp); 930 ifp->if_flags &= ~XFS_IFEXTENTS; 931 return error; 932 } 933 xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); 934 return 0; 935 } 936 937 /* 938 * Allocate an inode on disk and return a copy of its in-core version. 939 * The in-core inode is locked exclusively. Set mode, nlink, and rdev 940 * appropriately within the inode. The uid and gid for the inode are 941 * set according to the contents of the given cred structure. 942 * 943 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() 944 * has a free inode available, call xfs_iget() 945 * to obtain the in-core version of the allocated inode. Finally, 946 * fill in the inode and log its initial contents. In this case, 947 * ialloc_context would be set to NULL and call_again set to false. 948 * 949 * If xfs_dialloc() does not have an available inode, 950 * it will replenish its supply by doing an allocation. Since we can 951 * only do one allocation within a transaction without deadlocks, we 952 * must commit the current transaction before returning the inode itself. 953 * In this case, therefore, we will set call_again to true and return. 954 * The caller should then commit the current transaction, start a new 955 * transaction, and call xfs_ialloc() again to actually get the inode. 956 * 957 * To ensure that some other process does not grab the inode that 958 * was allocated during the first call to xfs_ialloc(), this routine 959 * also returns the [locked] bp pointing to the head of the freelist 960 * as ialloc_context. The caller should hold this buffer across 961 * the commit and pass it back into this routine on the second call. 962 * 963 * If we are allocating quota inodes, we do not have a parent inode 964 * to attach to or associate with (i.e. pip == NULL) because they 965 * are not linked into the directory structure - they are attached 966 * directly to the superblock - and so have no parent. 967 */ 968 int 969 xfs_ialloc( 970 xfs_trans_t *tp, 971 xfs_inode_t *pip, 972 mode_t mode, 973 xfs_nlink_t nlink, 974 xfs_dev_t rdev, 975 prid_t prid, 976 int okalloc, 977 xfs_buf_t **ialloc_context, 978 boolean_t *call_again, 979 xfs_inode_t **ipp) 980 { 981 xfs_ino_t ino; 982 xfs_inode_t *ip; 983 uint flags; 984 int error; 985 timespec_t tv; 986 int filestreams = 0; 987 988 /* 989 * Call the space management code to pick 990 * the on-disk inode to be allocated. 991 */ 992 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, 993 ialloc_context, call_again, &ino); 994 if (error) 995 return error; 996 if (*call_again || ino == NULLFSINO) { 997 *ipp = NULL; 998 return 0; 999 } 1000 ASSERT(*ialloc_context == NULL); 1001 1002 /* 1003 * Get the in-core inode with the lock held exclusively. 
1004 * This is because we're setting fields here we need 1005 * to prevent others from looking at until we're done. 1006 */ 1007 error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE, 1008 XFS_ILOCK_EXCL, &ip); 1009 if (error) 1010 return error; 1011 ASSERT(ip != NULL); 1012 1013 ip->i_d.di_mode = (__uint16_t)mode; 1014 ip->i_d.di_onlink = 0; 1015 ip->i_d.di_nlink = nlink; 1016 ASSERT(ip->i_d.di_nlink == nlink); 1017 ip->i_d.di_uid = current_fsuid(); 1018 ip->i_d.di_gid = current_fsgid(); 1019 xfs_set_projid(ip, prid); 1020 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 1021 1022 /* 1023 * If the superblock version is up to where we support new format 1024 * inodes and this is currently an old format inode, then change 1025 * the inode version number now. This way we only do the conversion 1026 * here rather than here and in the flush/logging code. 1027 */ 1028 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && 1029 ip->i_d.di_version == 1) { 1030 ip->i_d.di_version = 2; 1031 /* 1032 * We've already zeroed the old link count, the projid field, 1033 * and the pad field. 1034 */ 1035 } 1036 1037 /* 1038 * Project ids won't be stored on disk if we are using a version 1 inode. 1039 */ 1040 if ((prid != 0) && (ip->i_d.di_version == 1)) 1041 xfs_bump_ino_vers2(tp, ip); 1042 1043 if (pip && XFS_INHERIT_GID(pip)) { 1044 ip->i_d.di_gid = pip->i_d.di_gid; 1045 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { 1046 ip->i_d.di_mode |= S_ISGID; 1047 } 1048 } 1049 1050 /* 1051 * If the group ID of the new file does not match the effective group 1052 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared 1053 * (and only if the irix_sgid_inherit compatibility variable is set). 1054 */ 1055 if ((irix_sgid_inherit) && 1056 (ip->i_d.di_mode & S_ISGID) && 1057 (!in_group_p((gid_t)ip->i_d.di_gid))) { 1058 ip->i_d.di_mode &= ~S_ISGID; 1059 } 1060 1061 ip->i_d.di_size = 0; 1062 ip->i_size = 0; 1063 ip->i_d.di_nextents = 0; 1064 ASSERT(ip->i_d.di_nblocks == 0); 1065 1066 nanotime(&tv); 1067 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; 1068 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; 1069 ip->i_d.di_atime = ip->i_d.di_mtime; 1070 ip->i_d.di_ctime = ip->i_d.di_mtime; 1071 1072 /* 1073 * di_gen will have been taken care of in xfs_iread. 1074 */ 1075 ip->i_d.di_extsize = 0; 1076 ip->i_d.di_dmevmask = 0; 1077 ip->i_d.di_dmstate = 0; 1078 ip->i_d.di_flags = 0; 1079 flags = XFS_ILOG_CORE; 1080 switch (mode & S_IFMT) { 1081 case S_IFIFO: 1082 case S_IFCHR: 1083 case S_IFBLK: 1084 case S_IFSOCK: 1085 ip->i_d.di_format = XFS_DINODE_FMT_DEV; 1086 ip->i_df.if_u2.if_rdev = rdev; 1087 ip->i_df.if_flags = 0; 1088 flags |= XFS_ILOG_DEV; 1089 break; 1090 case S_IFREG: 1091 /* 1092 * we can't set up filestreams until after the VFS inode 1093 * is set up properly. 
1094 */ 1095 if (pip && xfs_inode_is_filestream(pip)) 1096 filestreams = 1; 1097 /* fall through */ 1098 case S_IFDIR: 1099 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1100 uint di_flags = 0; 1101 1102 if ((mode & S_IFMT) == S_IFDIR) { 1103 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 1104 di_flags |= XFS_DIFLAG_RTINHERIT; 1105 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { 1106 di_flags |= XFS_DIFLAG_EXTSZINHERIT; 1107 ip->i_d.di_extsize = pip->i_d.di_extsize; 1108 } 1109 } else if ((mode & S_IFMT) == S_IFREG) { 1110 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 1111 di_flags |= XFS_DIFLAG_REALTIME; 1112 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { 1113 di_flags |= XFS_DIFLAG_EXTSIZE; 1114 ip->i_d.di_extsize = pip->i_d.di_extsize; 1115 } 1116 } 1117 if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && 1118 xfs_inherit_noatime) 1119 di_flags |= XFS_DIFLAG_NOATIME; 1120 if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && 1121 xfs_inherit_nodump) 1122 di_flags |= XFS_DIFLAG_NODUMP; 1123 if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && 1124 xfs_inherit_sync) 1125 di_flags |= XFS_DIFLAG_SYNC; 1126 if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && 1127 xfs_inherit_nosymlinks) 1128 di_flags |= XFS_DIFLAG_NOSYMLINKS; 1129 if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1130 di_flags |= XFS_DIFLAG_PROJINHERIT; 1131 if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && 1132 xfs_inherit_nodefrag) 1133 di_flags |= XFS_DIFLAG_NODEFRAG; 1134 if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) 1135 di_flags |= XFS_DIFLAG_FILESTREAM; 1136 ip->i_d.di_flags |= di_flags; 1137 } 1138 /* FALLTHROUGH */ 1139 case S_IFLNK: 1140 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 1141 ip->i_df.if_flags = XFS_IFEXTENTS; 1142 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; 1143 ip->i_df.if_u1.if_extents = NULL; 1144 break; 1145 default: 1146 ASSERT(0); 1147 } 1148 /* 1149 * Attribute fork settings for new inode. 1150 */ 1151 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 1152 ip->i_d.di_anextents = 0; 1153 1154 /* 1155 * Log the new values stuffed into the inode. 1156 */ 1157 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL); 1158 xfs_trans_log_inode(tp, ip, flags); 1159 1160 /* now that we have an i_mode we can setup inode ops and unlock */ 1161 xfs_setup_inode(ip); 1162 1163 /* now we have set up the vfs inode we can associate the filestream */ 1164 if (filestreams) { 1165 error = xfs_filestream_associate(pip, ip); 1166 if (error < 0) 1167 return -error; 1168 if (!error) 1169 xfs_iflags_set(ip, XFS_IFILESTREAM); 1170 } 1171 1172 *ipp = ip; 1173 return 0; 1174 } 1175 1176 /* 1177 * Check to make sure that there are no blocks allocated to the 1178 * file beyond the size of the file. We don't check this for 1179 * files with fixed size extents or real time extents, but we 1180 * at least do it for regular files. 1181 */ 1182 #ifdef DEBUG 1183 void 1184 xfs_isize_check( 1185 xfs_mount_t *mp, 1186 xfs_inode_t *ip, 1187 xfs_fsize_t isize) 1188 { 1189 xfs_fileoff_t map_first; 1190 int nimaps; 1191 xfs_bmbt_irec_t imaps[2]; 1192 1193 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) 1194 return; 1195 1196 if (XFS_IS_REALTIME_INODE(ip)) 1197 return; 1198 1199 if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) 1200 return; 1201 1202 nimaps = 2; 1203 map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); 1204 /* 1205 * The filesystem could be shutting down, so bmapi may return 1206 * an error. 
1207 */ 1208 if (xfs_bmapi(NULL, ip, map_first, 1209 (XFS_B_TO_FSB(mp, 1210 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - 1211 map_first), 1212 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, 1213 NULL)) 1214 return; 1215 ASSERT(nimaps == 1); 1216 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); 1217 } 1218 #endif /* DEBUG */ 1219 1220 /* 1221 * Calculate the last possible buffered byte in a file. This must 1222 * include data that was buffered beyond the EOF by the write code. 1223 * This also needs to deal with overflowing the xfs_fsize_t type 1224 * which can happen for sizes near the limit. 1225 * 1226 * We also need to take into account any blocks beyond the EOF. It 1227 * may be the case that they were buffered by a write which failed. 1228 * In that case the pages will still be in memory, but the inode size 1229 * will never have been updated. 1230 */ 1231 STATIC xfs_fsize_t 1232 xfs_file_last_byte( 1233 xfs_inode_t *ip) 1234 { 1235 xfs_mount_t *mp; 1236 xfs_fsize_t last_byte; 1237 xfs_fileoff_t last_block; 1238 xfs_fileoff_t size_last_block; 1239 int error; 1240 1241 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); 1242 1243 mp = ip->i_mount; 1244 /* 1245 * Only check for blocks beyond the EOF if the extents have 1246 * been read in. This eliminates the need for the inode lock, 1247 * and it also saves us from looking when it really isn't 1248 * necessary. 1249 */ 1250 if (ip->i_df.if_flags & XFS_IFEXTENTS) { 1251 xfs_ilock(ip, XFS_ILOCK_SHARED); 1252 error = xfs_bmap_last_offset(NULL, ip, &last_block, 1253 XFS_DATA_FORK); 1254 xfs_iunlock(ip, XFS_ILOCK_SHARED); 1255 if (error) { 1256 last_block = 0; 1257 } 1258 } else { 1259 last_block = 0; 1260 } 1261 size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size); 1262 last_block = XFS_FILEOFF_MAX(last_block, size_last_block); 1263 1264 last_byte = XFS_FSB_TO_B(mp, last_block); 1265 if (last_byte < 0) { 1266 return XFS_MAXIOFFSET(mp); 1267 } 1268 last_byte += (1 << mp->m_writeio_log); 1269 if (last_byte < 0) { 1270 return XFS_MAXIOFFSET(mp); 1271 } 1272 return last_byte; 1273 } 1274 1275 /* 1276 * Start the truncation of the file to new_size. The new size 1277 * must be smaller than the current size. This routine will 1278 * clear the buffer and page caches of file data in the removed 1279 * range, and xfs_itruncate_finish() will remove the underlying 1280 * disk blocks. 1281 * 1282 * The inode must have its I/O lock locked EXCLUSIVELY, and it 1283 * must NOT have the inode lock held at all. This is because we're 1284 * calling into the buffer/page cache code and we can't hold the 1285 * inode lock when we do so. 1286 * 1287 * We need to wait for any direct I/Os in flight to complete before we 1288 * proceed with the truncate. This is needed to prevent the extents 1289 * being read or written by the direct I/Os from being removed while the 1290 * I/O is in flight as there is no other method of synchronising 1291 * direct I/O with the truncate operation. Also, because we hold 1292 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being 1293 * started until the truncate completes and drops the lock. Essentially, 1294 * the xfs_ioend_wait() call forms an I/O barrier that provides strict 1295 * ordering between direct I/Os and the truncate operation. 1296 * 1297 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE 1298 * or XFS_ITRUNC_MAYBE. 
The XFS_ITRUNC_MAYBE value should be used 1299 * in the case that the caller is locking things out of order and 1300 * may not be able to call xfs_itruncate_finish() with the inode lock 1301 * held without dropping the I/O lock. If the caller must drop the 1302 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() 1303 * must be called again with all the same restrictions as the initial 1304 * call. 1305 */ 1306 int 1307 xfs_itruncate_start( 1308 xfs_inode_t *ip, 1309 uint flags, 1310 xfs_fsize_t new_size) 1311 { 1312 xfs_fsize_t last_byte; 1313 xfs_off_t toss_start; 1314 xfs_mount_t *mp; 1315 int error = 0; 1316 1317 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1318 ASSERT((new_size == 0) || (new_size <= ip->i_size)); 1319 ASSERT((flags == XFS_ITRUNC_DEFINITE) || 1320 (flags == XFS_ITRUNC_MAYBE)); 1321 1322 mp = ip->i_mount; 1323 1324 /* wait for the completion of any pending DIOs */ 1325 if (new_size == 0 || new_size < ip->i_size) 1326 xfs_ioend_wait(ip); 1327 1328 /* 1329 * Call toss_pages or flushinval_pages to get rid of pages 1330 * overlapping the region being removed. We have to use 1331 * the less efficient flushinval_pages in the case that the 1332 * caller may not be able to finish the truncate without 1333 * dropping the inode's I/O lock. Make sure 1334 * to catch any pages brought in by buffers overlapping 1335 * the EOF by searching out beyond the isize by our 1336 * block size. We round new_size up to a block boundary 1337 * so that we don't toss things on the same block as 1338 * new_size but before it. 1339 * 1340 * Before calling toss_page or flushinval_pages, make sure to 1341 * call remapf() over the same region if the file is mapped. 1342 * This frees up mapped file references to the pages in the 1343 * given range and for the flushinval_pages case it ensures 1344 * that we get the latest mapped changes flushed out. 1345 */ 1346 toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1347 toss_start = XFS_FSB_TO_B(mp, toss_start); 1348 if (toss_start < 0) { 1349 /* 1350 * The place to start tossing is beyond our maximum 1351 * file size, so there is no way that the data extended 1352 * out there. 1353 */ 1354 return 0; 1355 } 1356 last_byte = xfs_file_last_byte(ip); 1357 trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte); 1358 if (last_byte > toss_start) { 1359 if (flags & XFS_ITRUNC_DEFINITE) { 1360 xfs_tosspages(ip, toss_start, 1361 -1, FI_REMAPF_LOCKED); 1362 } else { 1363 error = xfs_flushinval_pages(ip, toss_start, 1364 -1, FI_REMAPF_LOCKED); 1365 } 1366 } 1367 1368 #ifdef DEBUG 1369 if (new_size == 0) { 1370 ASSERT(VN_CACHED(VFS_I(ip)) == 0); 1371 } 1372 #endif 1373 return error; 1374 } 1375 1376 /* 1377 * Shrink the file to the given new_size. The new size must be smaller than 1378 * the current size. This will free up the underlying blocks in the removed 1379 * range after a call to xfs_itruncate_start() or xfs_atruncate_start(). 1380 * 1381 * The transaction passed to this routine must have made a permanent log 1382 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the 1383 * given transaction and start new ones, so make sure everything involved in 1384 * the transaction is tidy before calling here. Some transaction will be 1385 * returned to the caller to be committed. The incoming transaction must 1386 * already include the inode, and both inode locks must be held exclusively. 1387 * The inode must also be "held" within the transaction. 
On return the inode 1388 * will be "held" within the returned transaction. This routine does NOT 1389 * require any disk space to be reserved for it within the transaction. 1390 * 1391 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it 1392 * indicates the fork which is to be truncated. For the attribute fork we only 1393 * support truncation to size 0. 1394 * 1395 * We use the sync parameter to indicate whether or not the first transaction 1396 * we perform might have to be synchronous. For the attr fork, it needs to be 1397 * so if the unlink of the inode is not yet known to be permanent in the log. 1398 * This keeps us from freeing and reusing the blocks of the attribute fork 1399 * before the unlink of the inode becomes permanent. 1400 * 1401 * For the data fork, we normally have to run synchronously if we're being 1402 * called out of the inactive path or we're being called out of the create path 1403 * where we're truncating an existing file. Either way, the truncate needs to 1404 * be sync so blocks don't reappear in the file with altered data in case of a 1405 * crash. wsync filesystems can run the first case async because anything that 1406 * shrinks the inode has to run sync so by the time we're called here from 1407 * inactive, the inode size is permanently set to 0. 1408 * 1409 * Calls from the truncate path always need to be sync unless we're in a wsync 1410 * filesystem and the file has already been unlinked. 1411 * 1412 * The caller is responsible for correctly setting the sync parameter. It gets 1413 * too hard for us to guess here which path we're being called out of just 1414 * based on inode state. 1415 * 1416 * If we get an error, we must return with the inode locked and linked into the 1417 * current transaction. This keeps things simple for the higher level code, 1418 * because it always knows that the inode is locked and held in the transaction 1419 * that returns to it whether errors occur or not. We don't mark the inode 1420 * dirty on error so that transactions can be easily aborted if possible. 1421 */ 1422 int 1423 xfs_itruncate_finish( 1424 xfs_trans_t **tp, 1425 xfs_inode_t *ip, 1426 xfs_fsize_t new_size, 1427 int fork, 1428 int sync) 1429 { 1430 xfs_fsblock_t first_block; 1431 xfs_fileoff_t first_unmap_block; 1432 xfs_fileoff_t last_block; 1433 xfs_filblks_t unmap_len=0; 1434 xfs_mount_t *mp; 1435 xfs_trans_t *ntp; 1436 int done; 1437 int committed; 1438 xfs_bmap_free_t free_list; 1439 int error; 1440 1441 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 1442 ASSERT((new_size == 0) || (new_size <= ip->i_size)); 1443 ASSERT(*tp != NULL); 1444 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 1445 ASSERT(ip->i_transp == *tp); 1446 ASSERT(ip->i_itemp != NULL); 1447 ASSERT(ip->i_itemp->ili_lock_flags == 0); 1448 1449 1450 ntp = *tp; 1451 mp = (ntp)->t_mountp; 1452 ASSERT(! XFS_NOT_DQATTACHED(mp, ip)); 1453 1454 /* 1455 * We only support truncating the entire attribute fork. 1456 */ 1457 if (fork == XFS_ATTR_FORK) { 1458 new_size = 0LL; 1459 } 1460 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); 1461 trace_xfs_itruncate_finish_start(ip, new_size); 1462 1463 /* 1464 * The first thing we do is set the size to new_size permanently 1465 * on disk. This way we don't have to worry about anyone ever 1466 * being able to look at the data being freed even in the face 1467 * of a crash. 
What we're getting around here is the case where 1468 * we free a block, it is allocated to another file, it is written 1469 * to, and then we crash. If the new data gets written to the 1470 * file but the log buffers containing the free and reallocation 1471 * don't, then we'd end up with garbage in the blocks being freed. 1472 * As long as we make the new_size permanent before actually 1473 * freeing any blocks it doesn't matter if they get writtten to. 1474 * 1475 * The callers must signal into us whether or not the size 1476 * setting here must be synchronous. There are a few cases 1477 * where it doesn't have to be synchronous. Those cases 1478 * occur if the file is unlinked and we know the unlink is 1479 * permanent or if the blocks being truncated are guaranteed 1480 * to be beyond the inode eof (regardless of the link count) 1481 * and the eof value is permanent. Both of these cases occur 1482 * only on wsync-mounted filesystems. In those cases, we're 1483 * guaranteed that no user will ever see the data in the blocks 1484 * that are being truncated so the truncate can run async. 1485 * In the free beyond eof case, the file may wind up with 1486 * more blocks allocated to it than it needs if we crash 1487 * and that won't get fixed until the next time the file 1488 * is re-opened and closed but that's ok as that shouldn't 1489 * be too many blocks. 1490 * 1491 * However, we can't just make all wsync xactions run async 1492 * because there's one call out of the create path that needs 1493 * to run sync where it's truncating an existing file to size 1494 * 0 whose size is > 0. 1495 * 1496 * It's probably possible to come up with a test in this 1497 * routine that would correctly distinguish all the above 1498 * cases from the values of the function parameters and the 1499 * inode state but for sanity's sake, I've decided to let the 1500 * layers above just tell us. It's simpler to correctly figure 1501 * out in the layer above exactly under what conditions we 1502 * can run async and I think it's easier for others read and 1503 * follow the logic in case something has to be changed. 1504 * cscope is your friend -- rcc. 1505 * 1506 * The attribute fork is much simpler. 1507 * 1508 * For the attribute fork we allow the caller to tell us whether 1509 * the unlink of the inode that led to this call is yet permanent 1510 * in the on disk log. If it is not and we will be freeing extents 1511 * in this inode then we make the first transaction synchronous 1512 * to make sure that the unlink is permanent by the time we free 1513 * the blocks. 1514 */ 1515 if (fork == XFS_DATA_FORK) { 1516 if (ip->i_d.di_nextents > 0) { 1517 /* 1518 * If we are not changing the file size then do 1519 * not update the on-disk file size - we may be 1520 * called from xfs_inactive_free_eofblocks(). If we 1521 * update the on-disk file size and then the system 1522 * crashes before the contents of the file are 1523 * flushed to disk then the files may be full of 1524 * holes (ie NULL files bug). 
1525 */ 1526 if (ip->i_size != new_size) { 1527 ip->i_d.di_size = new_size; 1528 ip->i_size = new_size; 1529 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1530 } 1531 } 1532 } else if (sync) { 1533 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); 1534 if (ip->i_d.di_anextents > 0) 1535 xfs_trans_set_sync(ntp); 1536 } 1537 ASSERT(fork == XFS_DATA_FORK || 1538 (fork == XFS_ATTR_FORK && 1539 ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) || 1540 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC))))); 1541 1542 /* 1543 * Since it is possible for space to become allocated beyond 1544 * the end of the file (in a crash where the space is allocated 1545 * but the inode size is not yet updated), simply remove any 1546 * blocks which show up between the new EOF and the maximum 1547 * possible file size. If the first block to be removed is 1548 * beyond the maximum file size (ie it is the same as last_block), 1549 * then there is nothing to do. 1550 */ 1551 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); 1552 ASSERT(first_unmap_block <= last_block); 1553 done = 0; 1554 if (last_block == first_unmap_block) { 1555 done = 1; 1556 } else { 1557 unmap_len = last_block - first_unmap_block + 1; 1558 } 1559 while (!done) { 1560 /* 1561 * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() 1562 * will tell us whether it freed the entire range or 1563 * not. If this is a synchronous mount (wsync), 1564 * then we can tell bunmapi to keep all the 1565 * transactions asynchronous since the unlink 1566 * transaction that made this inode inactive has 1567 * already hit the disk. There's no danger of 1568 * the freed blocks being reused, there being a 1569 * crash, and the reused blocks suddenly reappearing 1570 * in this file with garbage in them once recovery 1571 * runs. 1572 */ 1573 xfs_bmap_init(&free_list, &first_block); 1574 error = xfs_bunmapi(ntp, ip, 1575 first_unmap_block, unmap_len, 1576 xfs_bmapi_aflag(fork), 1577 XFS_ITRUNC_MAX_EXTENTS, 1578 &first_block, &free_list, 1579 &done); 1580 if (error) { 1581 /* 1582 * If the bunmapi call encounters an error, 1583 * return to the caller where the transaction 1584 * can be properly aborted. We just need to 1585 * make sure we're not holding any resources 1586 * that we were not when we came in. 1587 */ 1588 xfs_bmap_cancel(&free_list); 1589 return error; 1590 } 1591 1592 /* 1593 * Duplicate the transaction that has the permanent 1594 * reservation and commit the old transaction. 1595 */ 1596 error = xfs_bmap_finish(tp, &free_list, &committed); 1597 ntp = *tp; 1598 if (committed) 1599 xfs_trans_ijoin(ntp, ip); 1600 1601 if (error) { 1602 /* 1603 * If the bmap finish call encounters an error, return 1604 * to the caller where the transaction can be properly 1605 * aborted. We just need to make sure we're not 1606 * holding any resources that we were not when we came 1607 * in. 1608 * 1609 * Aborting from this point might lose some blocks in 1610 * the file system, but oh well. 1611 */ 1612 xfs_bmap_cancel(&free_list); 1613 return error; 1614 } 1615 1616 if (committed) { 1617 /* 1618 * Mark the inode dirty so it will be logged and 1619 * moved forward in the log as part of every commit. 
1620 */ 1621 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1622 } 1623 1624 ntp = xfs_trans_dup(ntp); 1625 error = xfs_trans_commit(*tp, 0); 1626 *tp = ntp; 1627 1628 xfs_trans_ijoin(ntp, ip); 1629 1630 if (error) 1631 return error; 1632 /* 1633 * transaction commit worked ok so we can drop the extra ticket 1634 * reference that we gained in xfs_trans_dup() 1635 */ 1636 xfs_log_ticket_put(ntp->t_ticket); 1637 error = xfs_trans_reserve(ntp, 0, 1638 XFS_ITRUNCATE_LOG_RES(mp), 0, 1639 XFS_TRANS_PERM_LOG_RES, 1640 XFS_ITRUNCATE_LOG_COUNT); 1641 if (error) 1642 return error; 1643 } 1644 /* 1645 * Only update the size in the case of the data fork, but 1646 * always re-log the inode so that our permanent transaction 1647 * can keep on rolling it forward in the log. 1648 */ 1649 if (fork == XFS_DATA_FORK) { 1650 xfs_isize_check(mp, ip, new_size); 1651 /* 1652 * If we are not changing the file size then do 1653 * not update the on-disk file size - we may be 1654 * called from xfs_inactive_free_eofblocks(). If we 1655 * update the on-disk file size and then the system 1656 * crashes before the contents of the file are 1657 * flushed to disk then the files may be full of 1658 * holes (ie NULL files bug). 1659 */ 1660 if (ip->i_size != new_size) { 1661 ip->i_d.di_size = new_size; 1662 ip->i_size = new_size; 1663 } 1664 } 1665 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1666 ASSERT((new_size != 0) || 1667 (fork == XFS_ATTR_FORK) || 1668 (ip->i_delayed_blks == 0)); 1669 ASSERT((new_size != 0) || 1670 (fork == XFS_ATTR_FORK) || 1671 (ip->i_d.di_nextents == 0)); 1672 trace_xfs_itruncate_finish_end(ip, new_size); 1673 return 0; 1674 } 1675 1676 /* 1677 * This is called when the inode's link count goes to 0. 1678 * We place the on-disk inode on a list in the AGI. It 1679 * will be pulled from this list when the inode is freed. 1680 */ 1681 int 1682 xfs_iunlink( 1683 xfs_trans_t *tp, 1684 xfs_inode_t *ip) 1685 { 1686 xfs_mount_t *mp; 1687 xfs_agi_t *agi; 1688 xfs_dinode_t *dip; 1689 xfs_buf_t *agibp; 1690 xfs_buf_t *ibp; 1691 xfs_agino_t agino; 1692 short bucket_index; 1693 int offset; 1694 int error; 1695 1696 ASSERT(ip->i_d.di_nlink == 0); 1697 ASSERT(ip->i_d.di_mode != 0); 1698 ASSERT(ip->i_transp == tp); 1699 1700 mp = tp->t_mountp; 1701 1702 /* 1703 * Get the agi buffer first. It ensures lock ordering 1704 * on the list. 1705 */ 1706 error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); 1707 if (error) 1708 return error; 1709 agi = XFS_BUF_TO_AGI(agibp); 1710 1711 /* 1712 * Get the index into the agi hash table for the 1713 * list this inode will go on. 1714 */ 1715 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1716 ASSERT(agino != 0); 1717 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1718 ASSERT(agi->agi_unlinked[bucket_index]); 1719 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); 1720 1721 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { 1722 /* 1723 * There is already another inode in the bucket we need 1724 * to add ourselves to. Add us at the front of the list. 1725 * Here we put the head pointer into our next pointer, 1726 * and then we fall through to point the head at us. 
1727 */ 1728 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1729 if (error) 1730 return error; 1731 1732 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); 1733 /* both on-disk, don't endian flip twice */ 1734 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1735 offset = ip->i_imap.im_boffset + 1736 offsetof(xfs_dinode_t, di_next_unlinked); 1737 xfs_trans_inode_buf(tp, ibp); 1738 xfs_trans_log_buf(tp, ibp, offset, 1739 (offset + sizeof(xfs_agino_t) - 1)); 1740 xfs_inobp_check(mp, ibp); 1741 } 1742 1743 /* 1744 * Point the bucket head pointer at the inode being inserted. 1745 */ 1746 ASSERT(agino != 0); 1747 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); 1748 offset = offsetof(xfs_agi_t, agi_unlinked) + 1749 (sizeof(xfs_agino_t) * bucket_index); 1750 xfs_trans_log_buf(tp, agibp, offset, 1751 (offset + sizeof(xfs_agino_t) - 1)); 1752 return 0; 1753 } 1754 1755 /* 1756 * Pull the on-disk inode from the AGI unlinked list. 1757 */ 1758 STATIC int 1759 xfs_iunlink_remove( 1760 xfs_trans_t *tp, 1761 xfs_inode_t *ip) 1762 { 1763 xfs_ino_t next_ino; 1764 xfs_mount_t *mp; 1765 xfs_agi_t *agi; 1766 xfs_dinode_t *dip; 1767 xfs_buf_t *agibp; 1768 xfs_buf_t *ibp; 1769 xfs_agnumber_t agno; 1770 xfs_agino_t agino; 1771 xfs_agino_t next_agino; 1772 xfs_buf_t *last_ibp; 1773 xfs_dinode_t *last_dip = NULL; 1774 short bucket_index; 1775 int offset, last_offset = 0; 1776 int error; 1777 1778 mp = tp->t_mountp; 1779 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1780 1781 /* 1782 * Get the agi buffer first. It ensures lock ordering 1783 * on the list. 1784 */ 1785 error = xfs_read_agi(mp, tp, agno, &agibp); 1786 if (error) 1787 return error; 1788 1789 agi = XFS_BUF_TO_AGI(agibp); 1790 1791 /* 1792 * Get the index into the agi hash table for the 1793 * list this inode will go on. 1794 */ 1795 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1796 ASSERT(agino != 0); 1797 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1798 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); 1799 ASSERT(agi->agi_unlinked[bucket_index]); 1800 1801 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { 1802 /* 1803 * We're at the head of the list. Get the inode's 1804 * on-disk buffer to see if there is anyone after us 1805 * on the list. Only modify our next pointer if it 1806 * is not already NULLAGINO. This saves us the overhead 1807 * of dealing with the buffer when there is no need to 1808 * change it. 1809 */ 1810 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1811 if (error) { 1812 xfs_warn(mp, "%s: xfs_itobp() returned error %d.", 1813 __func__, error); 1814 return error; 1815 } 1816 next_agino = be32_to_cpu(dip->di_next_unlinked); 1817 ASSERT(next_agino != 0); 1818 if (next_agino != NULLAGINO) { 1819 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1820 offset = ip->i_imap.im_boffset + 1821 offsetof(xfs_dinode_t, di_next_unlinked); 1822 xfs_trans_inode_buf(tp, ibp); 1823 xfs_trans_log_buf(tp, ibp, offset, 1824 (offset + sizeof(xfs_agino_t) - 1)); 1825 xfs_inobp_check(mp, ibp); 1826 } else { 1827 xfs_trans_brelse(tp, ibp); 1828 } 1829 /* 1830 * Point the bucket head pointer at the next inode. 1831 */ 1832 ASSERT(next_agino != 0); 1833 ASSERT(next_agino != agino); 1834 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); 1835 offset = offsetof(xfs_agi_t, agi_unlinked) + 1836 (sizeof(xfs_agino_t) * bucket_index); 1837 xfs_trans_log_buf(tp, agibp, offset, 1838 (offset + sizeof(xfs_agino_t) - 1)); 1839 } else { 1840 /* 1841 * We need to search the list for the inode being freed. 
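 * i.e. walk the chain from the bucket head, reading each inode's
 * on-disk copy with xfs_inotobp(), until di_next_unlinked equals our
 * agino; the buffer for the previous inode is kept so the list can be
 * spliced around us below.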
		 */
		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
		last_ibp = NULL;
		while (next_agino != agino) {
			/*
			 * If the last inode wasn't the one pointing to
			 * us, then release its buffer since we're not
			 * going to do anything with it.
			 */
			if (last_ibp != NULL) {
				xfs_trans_brelse(tp, last_ibp);
			}
			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
			error = xfs_inotobp(mp, tp, next_ino, &last_dip,
					    &last_ibp, &last_offset, 0);
			if (error) {
				xfs_warn(mp,
					"%s: xfs_inotobp() returned error %d.",
					__func__, error);
				return error;
			}
			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
			ASSERT(next_agino != NULLAGINO);
			ASSERT(next_agino != 0);
		}
		/*
		 * Now last_ibp points to the buffer previous to us on
		 * the unlinked list.  Pull us from the list.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
		if (error) {
			xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
				__func__, error);
			return error;
		}
		next_agino = be32_to_cpu(dip->di_next_unlinked);
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		if (next_agino != NULLAGINO) {
			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
			offset = ip->i_imap.im_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the previous inode on the list to the next inode.
		 */
		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
		ASSERT(next_agino != 0);
		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
		xfs_trans_inode_buf(tp, last_ibp);
		xfs_trans_log_buf(tp, last_ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, last_ibp);
	}
	return 0;
}

/*
 * A big issue when freeing the inode cluster is that we _cannot_ skip any
 * inodes that are in memory - they all must be marked stale and attached to
 * the cluster buffer.
 */
STATIC void
xfs_ifree_cluster(
	xfs_inode_t	*free_ip,
	xfs_trans_t	*tp,
	xfs_ino_t	inum)
{
	xfs_mount_t		*mp = free_ip->i_mount;
	int			blks_per_cluster;
	int			nbufs;
	int			ninodes;
	int			i, j;
	xfs_daddr_t		blkno;
	xfs_buf_t		*bp;
	xfs_inode_t		*ip;
	xfs_inode_log_item_t	*iip;
	xfs_log_item_t		*lip;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
		blks_per_cluster = 1;
		ninodes = mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp);
	} else {
		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
					mp->m_sb.sb_blocksize;
		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
	}

	for (j = 0; j < nbufs; j++, inum += ninodes) {
		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
					 XFS_INO_TO_AGBNO(mp, inum));

		/*
		 * We obtain and lock the backing buffer first in the process
		 * here, as we have to ensure that any dirty inode that we
		 * can't get the flush lock on is attached to the buffer.
1948 * If we scan the in-memory inodes first, then buffer IO can 1949 * complete before we get a lock on it, and hence we may fail 1950 * to mark all the active inodes on the buffer stale. 1951 */ 1952 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 1953 mp->m_bsize * blks_per_cluster, 1954 XBF_LOCK); 1955 1956 /* 1957 * Walk the inodes already attached to the buffer and mark them 1958 * stale. These will all have the flush locks held, so an 1959 * in-memory inode walk can't lock them. By marking them all 1960 * stale first, we will not attempt to lock them in the loop 1961 * below as the XFS_ISTALE flag will be set. 1962 */ 1963 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1964 while (lip) { 1965 if (lip->li_type == XFS_LI_INODE) { 1966 iip = (xfs_inode_log_item_t *)lip; 1967 ASSERT(iip->ili_logged == 1); 1968 lip->li_cb = xfs_istale_done; 1969 xfs_trans_ail_copy_lsn(mp->m_ail, 1970 &iip->ili_flush_lsn, 1971 &iip->ili_item.li_lsn); 1972 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 1973 } 1974 lip = lip->li_bio_list; 1975 } 1976 1977 1978 /* 1979 * For each inode in memory attempt to add it to the inode 1980 * buffer and set it up for being staled on buffer IO 1981 * completion. This is safe as we've locked out tail pushing 1982 * and flushing by locking the buffer. 1983 * 1984 * We have already marked every inode that was part of a 1985 * transaction stale above, which means there is no point in 1986 * even trying to lock them. 1987 */ 1988 for (i = 0; i < ninodes; i++) { 1989 retry: 1990 rcu_read_lock(); 1991 ip = radix_tree_lookup(&pag->pag_ici_root, 1992 XFS_INO_TO_AGINO(mp, (inum + i))); 1993 1994 /* Inode not in memory, nothing to do */ 1995 if (!ip) { 1996 rcu_read_unlock(); 1997 continue; 1998 } 1999 2000 /* 2001 * because this is an RCU protected lookup, we could 2002 * find a recently freed or even reallocated inode 2003 * during the lookup. We need to check under the 2004 * i_flags_lock for a valid inode here. Skip it if it 2005 * is not valid, the wrong inode or stale. 2006 */ 2007 spin_lock(&ip->i_flags_lock); 2008 if (ip->i_ino != inum + i || 2009 __xfs_iflags_test(ip, XFS_ISTALE)) { 2010 spin_unlock(&ip->i_flags_lock); 2011 rcu_read_unlock(); 2012 continue; 2013 } 2014 spin_unlock(&ip->i_flags_lock); 2015 2016 /* 2017 * Don't try to lock/unlock the current inode, but we 2018 * _cannot_ skip the other inodes that we did not find 2019 * in the list attached to the buffer and are not 2020 * already marked stale. If we can't lock it, back off 2021 * and retry. 2022 */ 2023 if (ip != free_ip && 2024 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2025 rcu_read_unlock(); 2026 delay(1); 2027 goto retry; 2028 } 2029 rcu_read_unlock(); 2030 2031 xfs_iflock(ip); 2032 xfs_iflags_set(ip, XFS_ISTALE); 2033 2034 /* 2035 * we don't need to attach clean inodes or those only 2036 * with unlogged changes (which we throw away, anyway). 
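 * Such inodes are simply unlocked again below; only inodes with
 * logged changes get xfs_istale_done attached, so their log items are
 * cleaned out of the AIL when the cluster buffer I/O completes.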
			 */
			iip = ip->i_itemp;
			if (!iip || xfs_inode_clean(ip)) {
				ASSERT(ip != free_ip);
				ip->i_update_core = 0;
				xfs_ifunlock(ip);
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				continue;
			}

			iip->ili_last_fields = iip->ili_format.ilf_fields;
			iip->ili_format.ilf_fields = 0;
			iip->ili_logged = 1;
			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
						&iip->ili_item.li_lsn);

			xfs_buf_attach_iodone(bp, xfs_istale_done,
						  &iip->ili_item);

			if (ip != free_ip)
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

		xfs_trans_stale_inode_buf(tp, bp);
		xfs_trans_binval(tp, bp);
	}

	xfs_perag_put(pag);
}

/*
 * This is called to return an inode to the inode free list.
 * The inode should already be truncated to 0 length and have
 * no pages associated with it.  This routine also assumes that
 * the inode is already a part of the transaction.
 *
 * The on-disk copy of the inode will have been added to the list
 * of unlinked inodes in the AGI.  We need to remove the inode from
 * that list atomically with respect to freeing it here.
 */
int
xfs_ifree(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	xfs_bmap_free_t	*flist)
{
	int			error;
	int			delete;
	xfs_ino_t		first_ino;
	xfs_dinode_t		*dip;
	xfs_buf_t		*ibp;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(ip->i_transp == tp);
	ASSERT(ip->i_d.di_nlink == 0);
	ASSERT(ip->i_d.di_nextents == 0);
	ASSERT(ip->i_d.di_anextents == 0);
	ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
	       ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
	ASSERT(ip->i_d.di_nblocks == 0);

	/*
	 * Pull the on-disk inode from the AGI unlinked list.
	 */
	error = xfs_iunlink_remove(tp, ip);
	if (error != 0) {
		return error;
	}

	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
	if (error != 0) {
		return error;
	}
	ip->i_d.di_mode = 0;		/* mark incore inode as free */
	ip->i_d.di_flags = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
	ip->i_df.if_ext_max =
		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	/*
	 * Bump the generation count so no one will be confused
	 * by reincarnations of this inode.
	 */
	ip->i_d.di_gen++;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
	if (error)
		return error;

	/*
	 * Clear the on-disk di_mode.  This is to prevent xfs_bulkstat
	 * from picking up this inode when it is reclaimed (its incore state
	 * initialized but not flushed to disk yet).  The in-core di_mode is
	 * already cleared and a corresponding transaction logged.
	 * The hack here just synchronizes the in-core to on-disk
	 * di_mode value in advance before the actual inode sync to disk.
	 * This is OK because the inode is already unlinked and would never
	 * change its di_mode again for this inode generation.
	 * This is a temporary hack that would require a proper fix
	 * in the future.
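	 * The net effect is that both the incore and the on-disk copies of
	 * di_mode read as zero, so a bulkstat scan racing with reclaim of
	 * this inode treats it as free either way.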
	 */
	dip->di_mode = 0;

	if (delete) {
		xfs_ifree_cluster(ip, tp, first_ino);
	}

	return 0;
}

/*
 * Reallocate the space for if_broot based on the number of records
 * being added or deleted as indicated in rec_diff.  Move the records
 * and pointers in if_broot to fit the new size.  When shrinking this
 * will eliminate holes between the records and pointers created by
 * the caller.  When growing this will create holes to be filled in
 * by the caller.
 *
 * The caller must not request to add more records than would fit in
 * the on-disk inode root.  If the if_broot is currently NULL, then
 * if we are adding records one will be allocated.  The caller must also
 * not request that the number of records go below zero, although
 * it can go to zero.
 *
 * ip -- the inode whose if_broot area is changing
 * rec_diff -- the change in the number of records, positive or negative,
 *	 requested for the if_broot array.
 */
void
xfs_iroot_realloc(
	xfs_inode_t		*ip,
	int			rec_diff,
	int			whichfork)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			cur_max;
	xfs_ifork_t		*ifp;
	struct xfs_btree_block	*new_broot;
	int			new_max;
	size_t			new_size;
	char			*np;
	char			*op;

	/*
	 * Handle the degenerate case quietly.
	 */
	if (rec_diff == 0) {
		return;
	}

	ifp = XFS_IFORK_PTR(ip, whichfork);
	if (rec_diff > 0) {
		/*
		 * If there wasn't any memory allocated before, just
		 * allocate it now and get out.
		 */
		if (ifp->if_broot_bytes == 0) {
			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
			ifp->if_broot_bytes = (int)new_size;
			return;
		}

		/*
		 * If there is already an existing if_broot, then we need
		 * to realloc() it and shift the pointers to their new
		 * location.  The records don't change location because
		 * they are kept butted up against the btree block header.
		 */
		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
		new_max = cur_max + rec_diff;
		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
				KM_SLEEP | KM_NOFS);
		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     ifp->if_broot_bytes);
		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     (int)new_size);
		ifp->if_broot_bytes = (int)new_size;
		ASSERT(ifp->if_broot_bytes <=
			XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
		return;
	}

	/*
	 * rec_diff is less than 0.  In this case, we are shrinking the
	 * if_broot buffer.  It must already exist.  If we go to zero
	 * records, just get rid of the root and clear the status bit.
	 */
	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
	new_max = cur_max + rec_diff;
	ASSERT(new_max >= 0);
	if (new_max > 0)
		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
	else
		new_size = 0;
	if (new_size > 0) {
		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
		/*
		 * First copy over the btree block header.
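		 * The records stay butted up against the header, while the
		 * pointer array is addressed relative to the buffer size, so
		 * the records and pointers are copied separately below.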
2244 */ 2245 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); 2246 } else { 2247 new_broot = NULL; 2248 ifp->if_flags &= ~XFS_IFBROOT; 2249 } 2250 2251 /* 2252 * Only copy the records and pointers if there are any. 2253 */ 2254 if (new_max > 0) { 2255 /* 2256 * First copy the records. 2257 */ 2258 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); 2259 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); 2260 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2261 2262 /* 2263 * Then copy the pointers. 2264 */ 2265 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2266 ifp->if_broot_bytes); 2267 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, 2268 (int)new_size); 2269 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2270 } 2271 kmem_free(ifp->if_broot); 2272 ifp->if_broot = new_broot; 2273 ifp->if_broot_bytes = (int)new_size; 2274 ASSERT(ifp->if_broot_bytes <= 2275 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2276 return; 2277 } 2278 2279 2280 /* 2281 * This is called when the amount of space needed for if_data 2282 * is increased or decreased. The change in size is indicated by 2283 * the number of bytes that need to be added or deleted in the 2284 * byte_diff parameter. 2285 * 2286 * If the amount of space needed has decreased below the size of the 2287 * inline buffer, then switch to using the inline buffer. Otherwise, 2288 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2289 * to what is needed. 2290 * 2291 * ip -- the inode whose if_data area is changing 2292 * byte_diff -- the change in the number of bytes, positive or negative, 2293 * requested for the if_data array. 2294 */ 2295 void 2296 xfs_idata_realloc( 2297 xfs_inode_t *ip, 2298 int byte_diff, 2299 int whichfork) 2300 { 2301 xfs_ifork_t *ifp; 2302 int new_size; 2303 int real_size; 2304 2305 if (byte_diff == 0) { 2306 return; 2307 } 2308 2309 ifp = XFS_IFORK_PTR(ip, whichfork); 2310 new_size = (int)ifp->if_bytes + byte_diff; 2311 ASSERT(new_size >= 0); 2312 2313 if (new_size == 0) { 2314 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2315 kmem_free(ifp->if_u1.if_data); 2316 } 2317 ifp->if_u1.if_data = NULL; 2318 real_size = 0; 2319 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { 2320 /* 2321 * If the valid extents/data can fit in if_inline_ext/data, 2322 * copy them from the malloc'd vector and free it. 2323 */ 2324 if (ifp->if_u1.if_data == NULL) { 2325 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2326 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2327 ASSERT(ifp->if_real_bytes != 0); 2328 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, 2329 new_size); 2330 kmem_free(ifp->if_u1.if_data); 2331 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2332 } 2333 real_size = 0; 2334 } else { 2335 /* 2336 * Stuck with malloc/realloc. 2337 * For inline data, the underlying buffer must be 2338 * a multiple of 4 bytes in size so that it can be 2339 * logged and stay on word boundaries. We enforce 2340 * that here. 2341 */ 2342 real_size = roundup(new_size, 4); 2343 if (ifp->if_u1.if_data == NULL) { 2344 ASSERT(ifp->if_real_bytes == 0); 2345 ifp->if_u1.if_data = kmem_alloc(real_size, 2346 KM_SLEEP | KM_NOFS); 2347 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2348 /* 2349 * Only do the realloc if the underlying size 2350 * is really changing. 
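 * Since real_size is if_bytes rounded up to a multiple of 4, growing
 * if_bytes from, say, 6 to 8 leaves the allocation alone, while
 * growing it to 9 reallocates the buffer to 12 bytes.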
2351 */ 2352 if (ifp->if_real_bytes != real_size) { 2353 ifp->if_u1.if_data = 2354 kmem_realloc(ifp->if_u1.if_data, 2355 real_size, 2356 ifp->if_real_bytes, 2357 KM_SLEEP | KM_NOFS); 2358 } 2359 } else { 2360 ASSERT(ifp->if_real_bytes == 0); 2361 ifp->if_u1.if_data = kmem_alloc(real_size, 2362 KM_SLEEP | KM_NOFS); 2363 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 2364 ifp->if_bytes); 2365 } 2366 } 2367 ifp->if_real_bytes = real_size; 2368 ifp->if_bytes = new_size; 2369 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2370 } 2371 2372 void 2373 xfs_idestroy_fork( 2374 xfs_inode_t *ip, 2375 int whichfork) 2376 { 2377 xfs_ifork_t *ifp; 2378 2379 ifp = XFS_IFORK_PTR(ip, whichfork); 2380 if (ifp->if_broot != NULL) { 2381 kmem_free(ifp->if_broot); 2382 ifp->if_broot = NULL; 2383 } 2384 2385 /* 2386 * If the format is local, then we can't have an extents 2387 * array so just look for an inline data array. If we're 2388 * not local then we may or may not have an extents list, 2389 * so check and free it up if we do. 2390 */ 2391 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 2392 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && 2393 (ifp->if_u1.if_data != NULL)) { 2394 ASSERT(ifp->if_real_bytes != 0); 2395 kmem_free(ifp->if_u1.if_data); 2396 ifp->if_u1.if_data = NULL; 2397 ifp->if_real_bytes = 0; 2398 } 2399 } else if ((ifp->if_flags & XFS_IFEXTENTS) && 2400 ((ifp->if_flags & XFS_IFEXTIREC) || 2401 ((ifp->if_u1.if_extents != NULL) && 2402 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { 2403 ASSERT(ifp->if_real_bytes != 0); 2404 xfs_iext_destroy(ifp); 2405 } 2406 ASSERT(ifp->if_u1.if_extents == NULL || 2407 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); 2408 ASSERT(ifp->if_real_bytes == 0); 2409 if (whichfork == XFS_ATTR_FORK) { 2410 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 2411 ip->i_afp = NULL; 2412 } 2413 } 2414 2415 /* 2416 * This is called to unpin an inode. The caller must have the inode locked 2417 * in at least shared mode so that the buffer cannot be subsequently pinned 2418 * once someone is waiting for it to be unpinned. 2419 */ 2420 static void 2421 xfs_iunpin_nowait( 2422 struct xfs_inode *ip) 2423 { 2424 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2425 2426 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2427 2428 /* Give the log a push to start the unpinning I/O */ 2429 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2430 2431 } 2432 2433 void 2434 xfs_iunpin_wait( 2435 struct xfs_inode *ip) 2436 { 2437 if (xfs_ipincount(ip)) { 2438 xfs_iunpin_nowait(ip); 2439 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0)); 2440 } 2441 } 2442 2443 /* 2444 * xfs_iextents_copy() 2445 * 2446 * This is called to copy the REAL extents (as opposed to the delayed 2447 * allocation extents) from the inode into the given buffer. It 2448 * returns the number of bytes copied into the buffer. 2449 * 2450 * If there are no delayed allocation extents, then we can just 2451 * memcpy() the extents into the buffer. Otherwise, we need to 2452 * examine each extent in turn and skip those which are delayed. 
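 * A delayed allocation extent is recognised by isnullstartblock() on
 * its start block - it has no real disk block yet - and so must never
 * be written into the on-disk extent list.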
2453 */ 2454 int 2455 xfs_iextents_copy( 2456 xfs_inode_t *ip, 2457 xfs_bmbt_rec_t *dp, 2458 int whichfork) 2459 { 2460 int copied; 2461 int i; 2462 xfs_ifork_t *ifp; 2463 int nrecs; 2464 xfs_fsblock_t start_block; 2465 2466 ifp = XFS_IFORK_PTR(ip, whichfork); 2467 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2468 ASSERT(ifp->if_bytes > 0); 2469 2470 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2471 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); 2472 ASSERT(nrecs > 0); 2473 2474 /* 2475 * There are some delayed allocation extents in the 2476 * inode, so copy the extents one at a time and skip 2477 * the delayed ones. There must be at least one 2478 * non-delayed extent. 2479 */ 2480 copied = 0; 2481 for (i = 0; i < nrecs; i++) { 2482 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 2483 start_block = xfs_bmbt_get_startblock(ep); 2484 if (isnullstartblock(start_block)) { 2485 /* 2486 * It's a delayed allocation extent, so skip it. 2487 */ 2488 continue; 2489 } 2490 2491 /* Translate to on disk format */ 2492 put_unaligned(cpu_to_be64(ep->l0), &dp->l0); 2493 put_unaligned(cpu_to_be64(ep->l1), &dp->l1); 2494 dp++; 2495 copied++; 2496 } 2497 ASSERT(copied != 0); 2498 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); 2499 2500 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2501 } 2502 2503 /* 2504 * Each of the following cases stores data into the same region 2505 * of the on-disk inode, so only one of them can be valid at 2506 * any given time. While it is possible to have conflicting formats 2507 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is 2508 * in EXTENTS format, this can only happen when the fork has 2509 * changed formats after being modified but before being flushed. 2510 * In these cases, the format always takes precedence, because the 2511 * format indicates the current state of the fork. 2512 */ 2513 /*ARGSUSED*/ 2514 STATIC void 2515 xfs_iflush_fork( 2516 xfs_inode_t *ip, 2517 xfs_dinode_t *dip, 2518 xfs_inode_log_item_t *iip, 2519 int whichfork, 2520 xfs_buf_t *bp) 2521 { 2522 char *cp; 2523 xfs_ifork_t *ifp; 2524 xfs_mount_t *mp; 2525 #ifdef XFS_TRANS_DEBUG 2526 int first; 2527 #endif 2528 static const short brootflag[2] = 2529 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; 2530 static const short dataflag[2] = 2531 { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; 2532 static const short extflag[2] = 2533 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2534 2535 if (!iip) 2536 return; 2537 ifp = XFS_IFORK_PTR(ip, whichfork); 2538 /* 2539 * This can happen if we gave up in iformat in an error path, 2540 * for the attribute fork. 
2541 */ 2542 if (!ifp) { 2543 ASSERT(whichfork == XFS_ATTR_FORK); 2544 return; 2545 } 2546 cp = XFS_DFORK_PTR(dip, whichfork); 2547 mp = ip->i_mount; 2548 switch (XFS_IFORK_FORMAT(ip, whichfork)) { 2549 case XFS_DINODE_FMT_LOCAL: 2550 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && 2551 (ifp->if_bytes > 0)) { 2552 ASSERT(ifp->if_u1.if_data != NULL); 2553 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2554 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); 2555 } 2556 break; 2557 2558 case XFS_DINODE_FMT_EXTENTS: 2559 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2560 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2561 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) || 2562 (ifp->if_bytes == 0)); 2563 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) || 2564 (ifp->if_bytes > 0)); 2565 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2566 (ifp->if_bytes > 0)) { 2567 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2568 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2569 whichfork); 2570 } 2571 break; 2572 2573 case XFS_DINODE_FMT_BTREE: 2574 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && 2575 (ifp->if_broot_bytes > 0)) { 2576 ASSERT(ifp->if_broot != NULL); 2577 ASSERT(ifp->if_broot_bytes <= 2578 (XFS_IFORK_SIZE(ip, whichfork) + 2579 XFS_BROOT_SIZE_ADJ)); 2580 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2581 (xfs_bmdr_block_t *)cp, 2582 XFS_DFORK_SIZE(dip, mp, whichfork)); 2583 } 2584 break; 2585 2586 case XFS_DINODE_FMT_DEV: 2587 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2588 ASSERT(whichfork == XFS_DATA_FORK); 2589 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); 2590 } 2591 break; 2592 2593 case XFS_DINODE_FMT_UUID: 2594 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2595 ASSERT(whichfork == XFS_DATA_FORK); 2596 memcpy(XFS_DFORK_DPTR(dip), 2597 &ip->i_df.if_u2.if_uuid, 2598 sizeof(uuid_t)); 2599 } 2600 break; 2601 2602 default: 2603 ASSERT(0); 2604 break; 2605 } 2606 } 2607 2608 STATIC int 2609 xfs_iflush_cluster( 2610 xfs_inode_t *ip, 2611 xfs_buf_t *bp) 2612 { 2613 xfs_mount_t *mp = ip->i_mount; 2614 struct xfs_perag *pag; 2615 unsigned long first_index, mask; 2616 unsigned long inodes_per_cluster; 2617 int ilist_size; 2618 xfs_inode_t **ilist; 2619 xfs_inode_t *iq; 2620 int nr_found; 2621 int clcount = 0; 2622 int bufwasdelwri; 2623 int i; 2624 2625 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2626 2627 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2628 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2629 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2630 if (!ilist) 2631 goto out_put; 2632 2633 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2634 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2635 rcu_read_lock(); 2636 /* really need a gang lookup range call here */ 2637 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2638 first_index, inodes_per_cluster); 2639 if (nr_found == 0) 2640 goto out_free; 2641 2642 for (i = 0; i < nr_found; i++) { 2643 iq = ilist[i]; 2644 if (iq == ip) 2645 continue; 2646 2647 /* 2648 * because this is an RCU protected lookup, we could find a 2649 * recently freed or even reallocated inode during the lookup. 2650 * We need to check under the i_flags_lock for a valid inode 2651 * here. Skip it if it is not valid or the wrong inode. 
		 */
		spin_lock(&iq->i_flags_lock);
		if (!iq->i_ino ||
		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
			spin_unlock(&iq->i_flags_lock);
			continue;
		}
		spin_unlock(&iq->i_flags_lock);

		/*
		 * Do an un-protected check to see if the inode is dirty and
		 * is a candidate for flushing.  These checks will be repeated
		 * later after the appropriate locks are acquired.
		 */
		if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
			continue;

		/*
		 * Try to get locks.  If any are unavailable or it is pinned,
		 * then this inode cannot be flushed and is skipped.
		 */

		if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
			continue;
		if (!xfs_iflock_nowait(iq)) {
			xfs_iunlock(iq, XFS_ILOCK_SHARED);
			continue;
		}
		if (xfs_ipincount(iq)) {
			xfs_ifunlock(iq);
			xfs_iunlock(iq, XFS_ILOCK_SHARED);
			continue;
		}

		/*
		 * arriving here means that this inode can be flushed.  First
		 * re-check that it's dirty before flushing.
		 */
		if (!xfs_inode_clean(iq)) {
			int	error;
			error = xfs_iflush_int(iq, bp);
			if (error) {
				xfs_iunlock(iq, XFS_ILOCK_SHARED);
				goto cluster_corrupt_out;
			}
			clcount++;
		} else {
			xfs_ifunlock(iq);
		}
		xfs_iunlock(iq, XFS_ILOCK_SHARED);
	}

	if (clcount) {
		XFS_STATS_INC(xs_icluster_flushcnt);
		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
	}

out_free:
	rcu_read_unlock();
	kmem_free(ilist);
out_put:
	xfs_perag_put(pag);
	return 0;


cluster_corrupt_out:
	/*
	 * Corruption detected in the clustering loop.  Invalidate the
	 * inode buffer and shut down the filesystem.
	 */
	rcu_read_unlock();
	/*
	 * Clean up the buffer.  If it was B_DELWRI, just release it --
	 * brelse can handle it with no problems.  If not, shut down the
	 * filesystem before releasing the buffer.
	 */
	bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
	if (bufwasdelwri)
		xfs_buf_relse(bp);

	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);

	if (!bufwasdelwri) {
		/*
		 * Just like incore_relse: if we have b_iodone functions,
		 * mark the buffer as an error and call them.  Otherwise
		 * mark it as stale and brelse.
		 */
		if (XFS_BUF_IODONE_FUNC(bp)) {
			XFS_BUF_UNDONE(bp);
			XFS_BUF_STALE(bp);
			XFS_BUF_ERROR(bp, EIO);
			xfs_buf_ioend(bp, 0);
		} else {
			XFS_BUF_STALE(bp);
			xfs_buf_relse(bp);
		}
	}

	/*
	 * Unlocks the flush lock
	 */
	xfs_iflush_abort(iq);
	kmem_free(ilist);
	xfs_perag_put(pag);
	return XFS_ERROR(EFSCORRUPTED);
}

/*
 * xfs_iflush() will write a modified inode's changes out to the
 * inode's on disk home.  The caller must have the inode lock held
 * in at least shared mode and the inode flush completion must be
 * active as well.  The inode lock will still be held upon return from
 * the call and the caller is free to unlock it.
 * The inode flush will be completed when the inode reaches the disk.
 * The flags indicate how the inode's buffer should be written out.
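 * In practice flags is SYNC_WAIT for a blocking flush, which ends in
 * xfs_bwrite(), or 0 for a delayed write queued via xfs_bdwrite().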
 */
int
xfs_iflush(
	xfs_inode_t		*ip,
	uint			flags)
{
	xfs_inode_log_item_t	*iip;
	xfs_buf_t		*bp;
	xfs_dinode_t		*dip;
	xfs_mount_t		*mp;
	int			error;

	XFS_STATS_INC(xs_iflush_count);

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
	ASSERT(!completion_done(&ip->i_flush));
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       ip->i_d.di_nextents > ip->i_df.if_ext_max);

	iip = ip->i_itemp;
	mp = ip->i_mount;

	/*
	 * We can't flush the inode until it is unpinned, so wait for it if we
	 * are allowed to block.  We know no one new can pin it, because we are
	 * holding the inode lock shared and you need to hold it exclusively to
	 * pin the inode.
	 *
	 * If we are not allowed to block, force the log out asynchronously so
	 * that when we come back the inode will be unpinned.  If other inodes
	 * in the same cluster are dirty, they will probably write the inode
	 * out for us if they occur after the log force completes.
	 */
	if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
		xfs_iunpin_nowait(ip);
		xfs_ifunlock(ip);
		return EAGAIN;
	}
	xfs_iunpin_wait(ip);

	/*
	 * For stale inodes we cannot rely on the backing buffer remaining
	 * stale in cache for the remaining life of the stale inode and so
	 * xfs_itobp() below may give us a buffer that no longer contains
	 * inodes below.  We have to check this after ensuring the inode is
	 * unpinned so that it is safe to reclaim the stale inode after the
	 * flush call.
	 */
	if (xfs_iflags_test(ip, XFS_ISTALE)) {
		xfs_ifunlock(ip);
		return 0;
	}

	/*
	 * This may have been unpinned because the filesystem is shutting
	 * down forcibly.  If that's the case we must not write this inode
	 * to disk, because the log record didn't make it to disk!
	 */
	if (XFS_FORCED_SHUTDOWN(mp)) {
		ip->i_update_core = 0;
		if (iip)
			iip->ili_format.ilf_fields = 0;
		xfs_ifunlock(ip);
		return XFS_ERROR(EIO);
	}

	/*
	 * Get the buffer containing the on-disk inode.
	 */
	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
				(flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
	if (error || !bp) {
		xfs_ifunlock(ip);
		return error;
	}

	/*
	 * First flush out the inode that xfs_iflush was called with.
	 */
	error = xfs_iflush_int(ip, bp);
	if (error)
		goto corrupt_out;

	/*
	 * If the buffer is pinned then push on the log now so we won't
	 * get stuck waiting in the write for too long.
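	 * A pinned buffer cannot be written back until the log records that
	 * pinned it are on disk, so kicking the log here lets that I/O
	 * overlap with the cluster gathering that follows.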
2854 */ 2855 if (XFS_BUF_ISPINNED(bp)) 2856 xfs_log_force(mp, 0); 2857 2858 /* 2859 * inode clustering: 2860 * see if other inodes can be gathered into this write 2861 */ 2862 error = xfs_iflush_cluster(ip, bp); 2863 if (error) 2864 goto cluster_corrupt_out; 2865 2866 if (flags & SYNC_WAIT) 2867 error = xfs_bwrite(mp, bp); 2868 else 2869 xfs_bdwrite(mp, bp); 2870 return error; 2871 2872 corrupt_out: 2873 xfs_buf_relse(bp); 2874 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2875 cluster_corrupt_out: 2876 /* 2877 * Unlocks the flush lock 2878 */ 2879 xfs_iflush_abort(ip); 2880 return XFS_ERROR(EFSCORRUPTED); 2881 } 2882 2883 2884 STATIC int 2885 xfs_iflush_int( 2886 xfs_inode_t *ip, 2887 xfs_buf_t *bp) 2888 { 2889 xfs_inode_log_item_t *iip; 2890 xfs_dinode_t *dip; 2891 xfs_mount_t *mp; 2892 #ifdef XFS_TRANS_DEBUG 2893 int first; 2894 #endif 2895 2896 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2897 ASSERT(!completion_done(&ip->i_flush)); 2898 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2899 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2900 2901 iip = ip->i_itemp; 2902 mp = ip->i_mount; 2903 2904 /* set *dip = inode's place in the buffer */ 2905 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2906 2907 /* 2908 * Clear i_update_core before copying out the data. 2909 * This is for coordination with our timestamp updates 2910 * that don't hold the inode lock. They will always 2911 * update the timestamps BEFORE setting i_update_core, 2912 * so if we clear i_update_core after they set it we 2913 * are guaranteed to see their updates to the timestamps. 2914 * I believe that this depends on strongly ordered memory 2915 * semantics, but we have that. We use the SYNCHRONIZE 2916 * macro to make sure that the compiler does not reorder 2917 * the i_update_core access below the data copy below. 2918 */ 2919 ip->i_update_core = 0; 2920 SYNCHRONIZE(); 2921 2922 /* 2923 * Make sure to get the latest timestamps from the Linux inode. 
2924 */ 2925 xfs_synchronize_times(ip); 2926 2927 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2928 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2929 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2930 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2931 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2932 goto corrupt_out; 2933 } 2934 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2935 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2936 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2937 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2938 __func__, ip->i_ino, ip, ip->i_d.di_magic); 2939 goto corrupt_out; 2940 } 2941 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 2942 if (XFS_TEST_ERROR( 2943 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2944 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2945 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2946 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2947 "%s: Bad regular inode %Lu, ptr 0x%p", 2948 __func__, ip->i_ino, ip); 2949 goto corrupt_out; 2950 } 2951 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 2952 if (XFS_TEST_ERROR( 2953 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2954 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2955 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2956 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2957 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2958 "%s: Bad directory inode %Lu, ptr 0x%p", 2959 __func__, ip->i_ino, ip); 2960 goto corrupt_out; 2961 } 2962 } 2963 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2964 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2965 XFS_RANDOM_IFLUSH_5)) { 2966 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2967 "%s: detected corrupt incore inode %Lu, " 2968 "total extents = %d, nblocks = %Ld, ptr 0x%p", 2969 __func__, ip->i_ino, 2970 ip->i_d.di_nextents + ip->i_d.di_anextents, 2971 ip->i_d.di_nblocks, ip); 2972 goto corrupt_out; 2973 } 2974 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2975 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2976 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2977 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2978 __func__, ip->i_ino, ip->i_d.di_forkoff, ip); 2979 goto corrupt_out; 2980 } 2981 /* 2982 * bump the flush iteration count, used to detect flushes which 2983 * postdate a log record during recovery. 2984 */ 2985 2986 ip->i_d.di_flushiter++; 2987 2988 /* 2989 * Copy the dirty parts of the inode into the on-disk 2990 * inode. We always copy out the core of the inode, 2991 * because if the inode is dirty at all the core must 2992 * be. 2993 */ 2994 xfs_dinode_to_disk(dip, &ip->i_d); 2995 2996 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 2997 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 2998 ip->i_d.di_flushiter = 0; 2999 3000 /* 3001 * If this is really an old format inode and the superblock version 3002 * has not been updated to support only new format inodes, then 3003 * convert back to the old inode format. If the superblock version 3004 * has been updated, then make the conversion permanent. 3005 */ 3006 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); 3007 if (ip->i_d.di_version == 1) { 3008 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 3009 /* 3010 * Convert it back. 3011 */ 3012 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3013 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); 3014 } else { 3015 /* 3016 * The superblock version has already been bumped, 3017 * so just make the conversion to the new inode 3018 * format permanent. 
3019 */ 3020 ip->i_d.di_version = 2; 3021 dip->di_version = 2; 3022 ip->i_d.di_onlink = 0; 3023 dip->di_onlink = 0; 3024 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3025 memset(&(dip->di_pad[0]), 0, 3026 sizeof(dip->di_pad)); 3027 ASSERT(xfs_get_projid(ip) == 0); 3028 } 3029 } 3030 3031 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); 3032 if (XFS_IFORK_Q(ip)) 3033 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 3034 xfs_inobp_check(mp, bp); 3035 3036 /* 3037 * We've recorded everything logged in the inode, so we'd 3038 * like to clear the ilf_fields bits so we don't log and 3039 * flush things unnecessarily. However, we can't stop 3040 * logging all this information until the data we've copied 3041 * into the disk buffer is written to disk. If we did we might 3042 * overwrite the copy of the inode in the log with all the 3043 * data after re-logging only part of it, and in the face of 3044 * a crash we wouldn't have all the data we need to recover. 3045 * 3046 * What we do is move the bits to the ili_last_fields field. 3047 * When logging the inode, these bits are moved back to the 3048 * ilf_fields field. In the xfs_iflush_done() routine we 3049 * clear ili_last_fields, since we know that the information 3050 * those bits represent is permanently on disk. As long as 3051 * the flush completes before the inode is logged again, then 3052 * both ilf_fields and ili_last_fields will be cleared. 3053 * 3054 * We can play with the ilf_fields bits here, because the inode 3055 * lock must be held exclusively in order to set bits there 3056 * and the flush lock protects the ili_last_fields bits. 3057 * Set ili_logged so the flush done 3058 * routine can tell whether or not to look in the AIL. 3059 * Also, store the current LSN of the inode so that we can tell 3060 * whether the item has moved in the AIL from xfs_iflush_done(). 3061 * In order to read the lsn we need the AIL lock, because 3062 * it is a 64 bit value that cannot be read atomically. 3063 */ 3064 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 3065 iip->ili_last_fields = iip->ili_format.ilf_fields; 3066 iip->ili_format.ilf_fields = 0; 3067 iip->ili_logged = 1; 3068 3069 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3070 &iip->ili_item.li_lsn); 3071 3072 /* 3073 * Attach the function xfs_iflush_done to the inode's 3074 * buffer. This will remove the inode from the AIL 3075 * and unlock the inode's flush lock when the inode is 3076 * completely written to disk. 3077 */ 3078 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); 3079 3080 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 3081 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); 3082 } else { 3083 /* 3084 * We're flushing an inode which is not in the AIL and has 3085 * not been logged but has i_update_core set. For this 3086 * case we can use a B_DELWRI flush and immediately drop 3087 * the inode flush lock because we can avoid the whole 3088 * AIL state thing. It's OK to drop the flush lock now, 3089 * because we've already locked the buffer and to do anything 3090 * you really need both. 3091 */ 3092 if (iip != NULL) { 3093 ASSERT(iip->ili_logged == 0); 3094 ASSERT(iip->ili_last_fields == 0); 3095 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); 3096 } 3097 xfs_ifunlock(ip); 3098 } 3099 3100 return 0; 3101 3102 corrupt_out: 3103 return XFS_ERROR(EFSCORRUPTED); 3104 } 3105 3106 /* 3107 * Return a pointer to the extent record at file index idx. 
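 * Three cases are handled below: an indirection array with idx 0
 * (the first record of the first page), a general indirection array
 * lookup via xfs_iext_idx_to_irec(), and a flat if_extents array.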
3108 */ 3109 xfs_bmbt_rec_host_t * 3110 xfs_iext_get_ext( 3111 xfs_ifork_t *ifp, /* inode fork pointer */ 3112 xfs_extnum_t idx) /* index of target extent */ 3113 { 3114 ASSERT(idx >= 0); 3115 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 3116 return ifp->if_u1.if_ext_irec->er_extbuf; 3117 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3118 xfs_ext_irec_t *erp; /* irec pointer */ 3119 int erp_idx = 0; /* irec index */ 3120 xfs_extnum_t page_idx = idx; /* ext index in target list */ 3121 3122 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3123 return &erp->er_extbuf[page_idx]; 3124 } else if (ifp->if_bytes) { 3125 return &ifp->if_u1.if_extents[idx]; 3126 } else { 3127 return NULL; 3128 } 3129 } 3130 3131 /* 3132 * Insert new item(s) into the extent records for incore inode 3133 * fork 'ifp'. 'count' new items are inserted at index 'idx'. 3134 */ 3135 void 3136 xfs_iext_insert( 3137 xfs_inode_t *ip, /* incore inode pointer */ 3138 xfs_extnum_t idx, /* starting index of new items */ 3139 xfs_extnum_t count, /* number of inserted items */ 3140 xfs_bmbt_irec_t *new, /* items to insert */ 3141 int state) /* type of extent conversion */ 3142 { 3143 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 3144 xfs_extnum_t i; /* extent record index */ 3145 3146 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); 3147 3148 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3149 xfs_iext_add(ifp, idx, count); 3150 for (i = idx; i < idx + count; i++, new++) 3151 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); 3152 } 3153 3154 /* 3155 * This is called when the amount of space required for incore file 3156 * extents needs to be increased. The ext_diff parameter stores the 3157 * number of new extents being added and the idx parameter contains 3158 * the extent index where the new extents will be added. If the new 3159 * extents are being appended, then we just need to (re)allocate and 3160 * initialize the space. Otherwise, if the new extents are being 3161 * inserted into the middle of the existing entries, a bit more work 3162 * is required to make room for the new extents to be inserted. The 3163 * caller is responsible for filling in the new extent entries upon 3164 * return. 3165 */ 3166 void 3167 xfs_iext_add( 3168 xfs_ifork_t *ifp, /* inode fork pointer */ 3169 xfs_extnum_t idx, /* index to begin adding exts */ 3170 int ext_diff) /* number of extents to add */ 3171 { 3172 int byte_diff; /* new bytes being added */ 3173 int new_size; /* size of extents after adding */ 3174 xfs_extnum_t nextents; /* number of extents in file */ 3175 3176 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3177 ASSERT((idx >= 0) && (idx <= nextents)); 3178 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); 3179 new_size = ifp->if_bytes + byte_diff; 3180 /* 3181 * If the new number of extents (nextents + ext_diff) 3182 * fits inside the inode, then continue to use the inline 3183 * extent buffer. 3184 */ 3185 if (nextents + ext_diff <= XFS_INLINE_EXTS) { 3186 if (idx < nextents) { 3187 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], 3188 &ifp->if_u2.if_inline_ext[idx], 3189 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 3190 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); 3191 } 3192 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3193 ifp->if_real_bytes = 0; 3194 ifp->if_lastex = nextents + ext_diff; 3195 } 3196 /* 3197 * Otherwise use a linear (direct) extent list. 
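 * A direct list is a single heap-allocated extent buffer, used as
 * long as the total extent count stays at or below XFS_LINEAR_EXTS.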
3198 * If the extents are currently inside the inode, 3199 * xfs_iext_realloc_direct will switch us from 3200 * inline to direct extent allocation mode. 3201 */ 3202 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { 3203 xfs_iext_realloc_direct(ifp, new_size); 3204 if (idx < nextents) { 3205 memmove(&ifp->if_u1.if_extents[idx + ext_diff], 3206 &ifp->if_u1.if_extents[idx], 3207 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 3208 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); 3209 } 3210 } 3211 /* Indirection array */ 3212 else { 3213 xfs_ext_irec_t *erp; 3214 int erp_idx = 0; 3215 int page_idx = idx; 3216 3217 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); 3218 if (ifp->if_flags & XFS_IFEXTIREC) { 3219 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); 3220 } else { 3221 xfs_iext_irec_init(ifp); 3222 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3223 erp = ifp->if_u1.if_ext_irec; 3224 } 3225 /* Extents fit in target extent page */ 3226 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { 3227 if (page_idx < erp->er_extcount) { 3228 memmove(&erp->er_extbuf[page_idx + ext_diff], 3229 &erp->er_extbuf[page_idx], 3230 (erp->er_extcount - page_idx) * 3231 sizeof(xfs_bmbt_rec_t)); 3232 memset(&erp->er_extbuf[page_idx], 0, byte_diff); 3233 } 3234 erp->er_extcount += ext_diff; 3235 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3236 } 3237 /* Insert a new extent page */ 3238 else if (erp) { 3239 xfs_iext_add_indirect_multi(ifp, 3240 erp_idx, page_idx, ext_diff); 3241 } 3242 /* 3243 * If extent(s) are being appended to the last page in 3244 * the indirection array and the new extent(s) don't fit 3245 * in the page, then erp is NULL and erp_idx is set to 3246 * the next index needed in the indirection array. 3247 */ 3248 else { 3249 int count = ext_diff; 3250 3251 while (count) { 3252 erp = xfs_iext_irec_new(ifp, erp_idx); 3253 erp->er_extcount = count; 3254 count -= MIN(count, (int)XFS_LINEAR_EXTS); 3255 if (count) { 3256 erp_idx++; 3257 } 3258 } 3259 } 3260 } 3261 ifp->if_bytes = new_size; 3262 } 3263 3264 /* 3265 * This is called when incore extents are being added to the indirection 3266 * array and the new extents do not fit in the target extent list. The 3267 * erp_idx parameter contains the irec index for the target extent list 3268 * in the indirection array, and the idx parameter contains the extent 3269 * index within the list. The number of extents being added is stored 3270 * in the count parameter. 
3271 * 3272 * |-------| |-------| 3273 * | | | | idx - number of extents before idx 3274 * | idx | | count | 3275 * | | | | count - number of extents being inserted at idx 3276 * |-------| |-------| 3277 * | count | | nex2 | nex2 - number of extents after idx + count 3278 * |-------| |-------| 3279 */ 3280 void 3281 xfs_iext_add_indirect_multi( 3282 xfs_ifork_t *ifp, /* inode fork pointer */ 3283 int erp_idx, /* target extent irec index */ 3284 xfs_extnum_t idx, /* index within target list */ 3285 int count) /* new extents being added */ 3286 { 3287 int byte_diff; /* new bytes being added */ 3288 xfs_ext_irec_t *erp; /* pointer to irec entry */ 3289 xfs_extnum_t ext_diff; /* number of extents to add */ 3290 xfs_extnum_t ext_cnt; /* new extents still needed */ 3291 xfs_extnum_t nex2; /* extents after idx + count */ 3292 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ 3293 int nlists; /* number of irec's (lists) */ 3294 3295 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3296 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3297 nex2 = erp->er_extcount - idx; 3298 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3299 3300 /* 3301 * Save second part of target extent list 3302 * (all extents past */ 3303 if (nex2) { 3304 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3305 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); 3306 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); 3307 erp->er_extcount -= nex2; 3308 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); 3309 memset(&erp->er_extbuf[idx], 0, byte_diff); 3310 } 3311 3312 /* 3313 * Add the new extents to the end of the target 3314 * list, then allocate new irec record(s) and 3315 * extent buffer(s) as needed to store the rest 3316 * of the new extents. 3317 */ 3318 ext_cnt = count; 3319 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); 3320 if (ext_diff) { 3321 erp->er_extcount += ext_diff; 3322 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3323 ext_cnt -= ext_diff; 3324 } 3325 while (ext_cnt) { 3326 erp_idx++; 3327 erp = xfs_iext_irec_new(ifp, erp_idx); 3328 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); 3329 erp->er_extcount = ext_diff; 3330 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3331 ext_cnt -= ext_diff; 3332 } 3333 3334 /* Add nex2 extents back to indirection array */ 3335 if (nex2) { 3336 xfs_extnum_t ext_avail; 3337 int i; 3338 3339 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 3340 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; 3341 i = 0; 3342 /* 3343 * If nex2 extents fit in the current page, append 3344 * nex2_ep after the new extents. 3345 */ 3346 if (nex2 <= ext_avail) { 3347 i = erp->er_extcount; 3348 } 3349 /* 3350 * Otherwise, check if space is available in the 3351 * next page. 3352 */ 3353 else if ((erp_idx < nlists - 1) && 3354 (nex2 <= (ext_avail = XFS_LINEAR_EXTS - 3355 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { 3356 erp_idx++; 3357 erp++; 3358 /* Create a hole for nex2 extents */ 3359 memmove(&erp->er_extbuf[nex2], erp->er_extbuf, 3360 erp->er_extcount * sizeof(xfs_bmbt_rec_t)); 3361 } 3362 /* 3363 * Final choice, create a new extent page for 3364 * nex2 extents. 3365 */ 3366 else { 3367 erp_idx++; 3368 erp = xfs_iext_irec_new(ifp, erp_idx); 3369 } 3370 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); 3371 kmem_free(nex2_ep); 3372 erp->er_extcount += nex2; 3373 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); 3374 } 3375 } 3376 3377 /* 3378 * This is called when the amount of space required for incore file 3379 * extents needs to be decreased. 
The ext_diff parameter stores the 3380 * number of extents to be removed and the idx parameter contains 3381 * the extent index where the extents will be removed from. 3382 * 3383 * If the amount of space needed has decreased below the linear 3384 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous 3385 * extent array. Otherwise, use kmem_realloc() to adjust the 3386 * size to what is needed. 3387 */ 3388 void 3389 xfs_iext_remove( 3390 xfs_inode_t *ip, /* incore inode pointer */ 3391 xfs_extnum_t idx, /* index to begin removing exts */ 3392 int ext_diff, /* number of extents to remove */ 3393 int state) /* type of extent conversion */ 3394 { 3395 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 3396 xfs_extnum_t nextents; /* number of extents in file */ 3397 int new_size; /* size of extents after removal */ 3398 3399 trace_xfs_iext_remove(ip, idx, state, _RET_IP_); 3400 3401 ASSERT(ext_diff > 0); 3402 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3403 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); 3404 3405 if (new_size == 0) { 3406 xfs_iext_destroy(ifp); 3407 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3408 xfs_iext_remove_indirect(ifp, idx, ext_diff); 3409 } else if (ifp->if_real_bytes) { 3410 xfs_iext_remove_direct(ifp, idx, ext_diff); 3411 } else { 3412 xfs_iext_remove_inline(ifp, idx, ext_diff); 3413 } 3414 ifp->if_bytes = new_size; 3415 } 3416 3417 /* 3418 * This removes ext_diff extents from the inline buffer, beginning 3419 * at extent index idx. 3420 */ 3421 void 3422 xfs_iext_remove_inline( 3423 xfs_ifork_t *ifp, /* inode fork pointer */ 3424 xfs_extnum_t idx, /* index to begin removing exts */ 3425 int ext_diff) /* number of extents to remove */ 3426 { 3427 int nextents; /* number of extents in file */ 3428 3429 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3430 ASSERT(idx < XFS_INLINE_EXTS); 3431 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3432 ASSERT(((nextents - ext_diff) > 0) && 3433 (nextents - ext_diff) < XFS_INLINE_EXTS); 3434 3435 if (idx + ext_diff < nextents) { 3436 memmove(&ifp->if_u2.if_inline_ext[idx], 3437 &ifp->if_u2.if_inline_ext[idx + ext_diff], 3438 (nextents - (idx + ext_diff)) * 3439 sizeof(xfs_bmbt_rec_t)); 3440 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], 3441 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3442 } else { 3443 memset(&ifp->if_u2.if_inline_ext[idx], 0, 3444 ext_diff * sizeof(xfs_bmbt_rec_t)); 3445 } 3446 } 3447 3448 /* 3449 * This removes ext_diff extents from a linear (direct) extent list, 3450 * beginning at extent index idx. If the extents are being removed 3451 * from the end of the list (ie. truncate) then we just need to re- 3452 * allocate the list to remove the extra space. Otherwise, if the 3453 * extents are being removed from the middle of the existing extent 3454 * entries, then we first need to move the extent records beginning 3455 * at idx + ext_diff up in the list to overwrite the records being 3456 * removed, then remove the extra space via kmem_realloc. 
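 * For example, removing 2 extents at idx 3 from a 10 extent list
 * moves records 5..9 down over 3..7, zeroes the last two slots and
 * then trims the allocation to 8 records.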
3457 */ 3458 void 3459 xfs_iext_remove_direct( 3460 xfs_ifork_t *ifp, /* inode fork pointer */ 3461 xfs_extnum_t idx, /* index to begin removing exts */ 3462 int ext_diff) /* number of extents to remove */ 3463 { 3464 xfs_extnum_t nextents; /* number of extents in file */ 3465 int new_size; /* size of extents after removal */ 3466 3467 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3468 new_size = ifp->if_bytes - 3469 (ext_diff * sizeof(xfs_bmbt_rec_t)); 3470 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3471 3472 if (new_size == 0) { 3473 xfs_iext_destroy(ifp); 3474 return; 3475 } 3476 /* Move extents up in the list (if needed) */ 3477 if (idx + ext_diff < nextents) { 3478 memmove(&ifp->if_u1.if_extents[idx], 3479 &ifp->if_u1.if_extents[idx + ext_diff], 3480 (nextents - (idx + ext_diff)) * 3481 sizeof(xfs_bmbt_rec_t)); 3482 } 3483 memset(&ifp->if_u1.if_extents[nextents - ext_diff], 3484 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3485 /* 3486 * Reallocate the direct extent list. If the extents 3487 * will fit inside the inode then xfs_iext_realloc_direct 3488 * will switch from direct to inline extent allocation 3489 * mode for us. 3490 */ 3491 xfs_iext_realloc_direct(ifp, new_size); 3492 ifp->if_bytes = new_size; 3493 } 3494 3495 /* 3496 * This is called when incore extents are being removed from the 3497 * indirection array and the extents being removed span multiple extent 3498 * buffers. The idx parameter contains the file extent index where we 3499 * want to begin removing extents, and the count parameter contains 3500 * how many extents need to be removed. 3501 * 3502 * |-------| |-------| 3503 * | nex1 | | | nex1 - number of extents before idx 3504 * |-------| | count | 3505 * | | | | count - number of extents being removed at idx 3506 * | count | |-------| 3507 * | | | nex2 | nex2 - number of extents after idx + count 3508 * |-------| |-------| 3509 */ 3510 void 3511 xfs_iext_remove_indirect( 3512 xfs_ifork_t *ifp, /* inode fork pointer */ 3513 xfs_extnum_t idx, /* index to begin removing extents */ 3514 int count) /* number of extents to remove */ 3515 { 3516 xfs_ext_irec_t *erp; /* indirection array pointer */ 3517 int erp_idx = 0; /* indirection array index */ 3518 xfs_extnum_t ext_cnt; /* extents left to remove */ 3519 xfs_extnum_t ext_diff; /* extents to remove in current list */ 3520 xfs_extnum_t nex1; /* number of extents before idx */ 3521 xfs_extnum_t nex2; /* extents after idx + count */ 3522 int page_idx = idx; /* index in target extent list */ 3523 3524 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3525 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3526 ASSERT(erp != NULL); 3527 nex1 = page_idx; 3528 ext_cnt = count; 3529 while (ext_cnt) { 3530 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); 3531 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); 3532 /* 3533 * Check for deletion of entire list; 3534 * xfs_iext_irec_remove() updates extent offsets. 
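 * When a whole page is removed, the loop re-reads the irec at the
 * same erp_idx and continues, because the entries that followed it
 * have shifted down by one.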
3535 */ 3536 if (ext_diff == erp->er_extcount) { 3537 xfs_iext_irec_remove(ifp, erp_idx); 3538 ext_cnt -= ext_diff; 3539 nex1 = 0; 3540 if (ext_cnt) { 3541 ASSERT(erp_idx < ifp->if_real_bytes / 3542 XFS_IEXT_BUFSZ); 3543 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3544 nex1 = 0; 3545 continue; 3546 } else { 3547 break; 3548 } 3549 } 3550 /* Move extents up (if needed) */ 3551 if (nex2) { 3552 memmove(&erp->er_extbuf[nex1], 3553 &erp->er_extbuf[nex1 + ext_diff], 3554 nex2 * sizeof(xfs_bmbt_rec_t)); 3555 } 3556 /* Zero out rest of page */ 3557 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - 3558 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); 3559 /* Update remaining counters */ 3560 erp->er_extcount -= ext_diff; 3561 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); 3562 ext_cnt -= ext_diff; 3563 nex1 = 0; 3564 erp_idx++; 3565 erp++; 3566 } 3567 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); 3568 xfs_iext_irec_compact(ifp); 3569 } 3570 3571 /* 3572 * Create, destroy, or resize a linear (direct) block of extents. 3573 */ 3574 void 3575 xfs_iext_realloc_direct( 3576 xfs_ifork_t *ifp, /* inode fork pointer */ 3577 int new_size) /* new size of extents */ 3578 { 3579 int rnew_size; /* real new size of extents */ 3580 3581 rnew_size = new_size; 3582 3583 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || 3584 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && 3585 (new_size != ifp->if_real_bytes))); 3586 3587 /* Free extent records */ 3588 if (new_size == 0) { 3589 xfs_iext_destroy(ifp); 3590 } 3591 /* Resize direct extent list and zero any new bytes */ 3592 else if (ifp->if_real_bytes) { 3593 /* Check if extents will fit inside the inode */ 3594 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { 3595 xfs_iext_direct_to_inline(ifp, new_size / 3596 (uint)sizeof(xfs_bmbt_rec_t)); 3597 ifp->if_bytes = new_size; 3598 return; 3599 } 3600 if (!is_power_of_2(new_size)){ 3601 rnew_size = roundup_pow_of_two(new_size); 3602 } 3603 if (rnew_size != ifp->if_real_bytes) { 3604 ifp->if_u1.if_extents = 3605 kmem_realloc(ifp->if_u1.if_extents, 3606 rnew_size, 3607 ifp->if_real_bytes, KM_NOFS); 3608 } 3609 if (rnew_size > ifp->if_real_bytes) { 3610 memset(&ifp->if_u1.if_extents[ifp->if_bytes / 3611 (uint)sizeof(xfs_bmbt_rec_t)], 0, 3612 rnew_size - ifp->if_real_bytes); 3613 } 3614 } 3615 /* 3616 * Switch from the inline extent buffer to a direct 3617 * extent list. Be sure to include the inline extent 3618 * bytes in new_size. 3619 */ 3620 else { 3621 new_size += ifp->if_bytes; 3622 if (!is_power_of_2(new_size)) { 3623 rnew_size = roundup_pow_of_two(new_size); 3624 } 3625 xfs_iext_inline_to_direct(ifp, rnew_size); 3626 } 3627 ifp->if_real_bytes = rnew_size; 3628 ifp->if_bytes = new_size; 3629 } 3630 3631 /* 3632 * Switch from linear (direct) extent records to inline buffer. 3633 */ 3634 void 3635 xfs_iext_direct_to_inline( 3636 xfs_ifork_t *ifp, /* inode fork pointer */ 3637 xfs_extnum_t nextents) /* number of extents in file */ 3638 { 3639 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3640 ASSERT(nextents <= XFS_INLINE_EXTS); 3641 /* 3642 * The inline buffer was zeroed when we switched 3643 * from inline to direct extent allocation mode, 3644 * so we don't need to clear it here. 3645 */ 3646 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, 3647 nextents * sizeof(xfs_bmbt_rec_t)); 3648 kmem_free(ifp->if_u1.if_extents); 3649 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3650 ifp->if_real_bytes = 0; 3651 } 3652 3653 /* 3654 * Switch from inline buffer to linear (direct) extent records. 
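 * The new_size argument is a byte count, not an extent count:
 * xfs_iext_realloc_direct() passes the (power-of-2 rounded) size of
 * the whole extent list in bytes, and xfs_iext_irec_init() passes
 * XFS_IEXT_BUFSZ.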
3655 * new_size should already be rounded up to the next power of 2 3656 * by the caller (when appropriate), so use new_size as it is. 3657 * However, since new_size may be rounded up, we can't update 3658 * if_bytes here. It is the caller's responsibility to update 3659 * if_bytes upon return. 3660 */ 3661 void 3662 xfs_iext_inline_to_direct( 3663 xfs_ifork_t *ifp, /* inode fork pointer */ 3664 int new_size) /* new size of extent list, in bytes */ 3665 { 3666 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); 3667 memset(ifp->if_u1.if_extents, 0, new_size); 3668 if (ifp->if_bytes) { 3669 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 3670 ifp->if_bytes); 3671 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3672 sizeof(xfs_bmbt_rec_t)); 3673 } 3674 ifp->if_real_bytes = new_size; 3675 } 3676 3677 /* 3678 * Resize an extent indirection array to new_size bytes. 3679 */ 3680 STATIC void 3681 xfs_iext_realloc_indirect( 3682 xfs_ifork_t *ifp, /* inode fork pointer */ 3683 int new_size) /* new indirection array size */ 3684 { 3685 int nlists; /* number of irec's (ex lists) */ 3686 int size; /* current indirection array size */ 3687 3688 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3689 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3690 size = nlists * sizeof(xfs_ext_irec_t); 3691 ASSERT(ifp->if_real_bytes); 3692 ASSERT((new_size >= 0) && (new_size != size)); 3693 if (new_size == 0) { 3694 xfs_iext_destroy(ifp); 3695 } else { 3696 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) 3697 kmem_realloc(ifp->if_u1.if_ext_irec, 3698 new_size, size, KM_NOFS); 3699 } 3700 } 3701 3702 /* 3703 * Switch from indirection array to linear (direct) extent allocations. 3704 */ 3705 STATIC void 3706 xfs_iext_indirect_to_direct( 3707 xfs_ifork_t *ifp) /* inode fork pointer */ 3708 { 3709 xfs_bmbt_rec_host_t *ep; /* extent record pointer */ 3710 xfs_extnum_t nextents; /* number of extents in file */ 3711 int size; /* size of file extents */ 3712 3713 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3714 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3715 ASSERT(nextents <= XFS_LINEAR_EXTS); 3716 size = nextents * sizeof(xfs_bmbt_rec_t); 3717 3718 xfs_iext_irec_compact_pages(ifp); 3719 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 3720 3721 ep = ifp->if_u1.if_ext_irec->er_extbuf; 3722 kmem_free(ifp->if_u1.if_ext_irec); 3723 ifp->if_flags &= ~XFS_IFEXTIREC; 3724 ifp->if_u1.if_extents = ep; 3725 ifp->if_bytes = size; 3726 if (nextents < XFS_LINEAR_EXTS) { 3727 xfs_iext_realloc_direct(ifp, size); 3728 } 3729 } 3730 3731 /* 3732 * Free incore file extents. 3733 */ 3734 void 3735 xfs_iext_destroy( 3736 xfs_ifork_t *ifp) /* inode fork pointer */ 3737 { 3738 if (ifp->if_flags & XFS_IFEXTIREC) { 3739 int erp_idx; 3740 int nlists; 3741 3742 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3743 for (erp_idx = nlists - 1; erp_idx >= 0; erp_idx--) { 3744 xfs_iext_irec_remove(ifp, erp_idx); 3745 } 3746 ifp->if_flags &= ~XFS_IFEXTIREC; 3747 } else if (ifp->if_real_bytes) { 3748 kmem_free(ifp->if_u1.if_extents); 3749 } else if (ifp->if_bytes) { 3750 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3751 sizeof(xfs_bmbt_rec_t)); 3752 } 3753 ifp->if_u1.if_extents = NULL; 3754 ifp->if_real_bytes = 0; 3755 ifp->if_bytes = 0; 3756 } 3757 3758 /* 3759 * Return a pointer to the extent record for file system block bno.
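 * A binary search over the extent startoff/blockcount fields is used;
 * when bno falls in a hole between extents, the record returned is the
 * first one that starts beyond bno (or NULL if bno lies past the last
 * extent), and *idxp is set to that record's index in either case.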
3760 */ 3761 xfs_bmbt_rec_host_t * /* pointer to found extent record */ 3762 xfs_iext_bno_to_ext( 3763 xfs_ifork_t *ifp, /* inode fork pointer */ 3764 xfs_fileoff_t bno, /* block number to search for */ 3765 xfs_extnum_t *idxp) /* index of target extent */ 3766 { 3767 xfs_bmbt_rec_host_t *base; /* pointer to first extent */ 3768 xfs_filblks_t blockcount = 0; /* number of blocks in extent */ 3769 xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ 3770 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3771 int high; /* upper boundary in search */ 3772 xfs_extnum_t idx = 0; /* index of target extent */ 3773 int low; /* lower boundary in search */ 3774 xfs_extnum_t nextents; /* number of file extents */ 3775 xfs_fileoff_t startoff = 0; /* start offset of extent */ 3776 3777 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3778 if (nextents == 0) { 3779 *idxp = 0; 3780 return NULL; 3781 } 3782 low = 0; 3783 if (ifp->if_flags & XFS_IFEXTIREC) { 3784 /* Find target extent list */ 3785 int erp_idx = 0; 3786 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); 3787 base = erp->er_extbuf; 3788 high = erp->er_extcount - 1; 3789 } else { 3790 base = ifp->if_u1.if_extents; 3791 high = nextents - 1; 3792 } 3793 /* Binary search extent records */ 3794 while (low <= high) { 3795 idx = (low + high) >> 1; 3796 ep = base + idx; 3797 startoff = xfs_bmbt_get_startoff(ep); 3798 blockcount = xfs_bmbt_get_blockcount(ep); 3799 if (bno < startoff) { 3800 high = idx - 1; 3801 } else if (bno >= startoff + blockcount) { 3802 low = idx + 1; 3803 } else { 3804 /* Convert back to file-based extent index */ 3805 if (ifp->if_flags & XFS_IFEXTIREC) { 3806 idx += erp->er_extoff; 3807 } 3808 *idxp = idx; 3809 return ep; 3810 } 3811 } 3812 /* Convert back to file-based extent index */ 3813 if (ifp->if_flags & XFS_IFEXTIREC) { 3814 idx += erp->er_extoff; 3815 } 3816 if (bno >= startoff + blockcount) { 3817 if (++idx == nextents) { 3818 ep = NULL; 3819 } else { 3820 ep = xfs_iext_get_ext(ifp, idx); 3821 } 3822 } 3823 *idxp = idx; 3824 return ep; 3825 } 3826 3827 /* 3828 * Return a pointer to the indirection array entry containing the 3829 * extent record for filesystem block bno. Store the index of the 3830 * target irec in *erp_idxp. 3831 */ 3832 xfs_ext_irec_t * /* pointer to found extent record */ 3833 xfs_iext_bno_to_irec( 3834 xfs_ifork_t *ifp, /* inode fork pointer */ 3835 xfs_fileoff_t bno, /* block number to search for */ 3836 int *erp_idxp) /* irec index of target ext list */ 3837 { 3838 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3839 xfs_ext_irec_t *erp_next; /* next indirection array entry */ 3840 int erp_idx; /* indirection array index */ 3841 int nlists; /* number of extent irec's (lists) */ 3842 int high; /* binary search upper limit */ 3843 int low; /* binary search lower limit */ 3844 3845 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3846 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3847 erp_idx = 0; 3848 low = 0; 3849 high = nlists - 1; 3850 while (low <= high) { 3851 erp_idx = (low + high) >> 1; 3852 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3853 erp_next = erp_idx < nlists - 1 ? 
erp + 1 : NULL; 3854 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { 3855 high = erp_idx - 1; 3856 } else if (erp_next && bno >= 3857 xfs_bmbt_get_startoff(erp_next->er_extbuf)) { 3858 low = erp_idx + 1; 3859 } else { 3860 break; 3861 } 3862 } 3863 *erp_idxp = erp_idx; 3864 return erp; 3865 } 3866 3867 /* 3868 * Return a pointer to the indirection array entry containing the 3869 * extent record at file extent index *idxp. Store the index of the 3870 * target irec in *erp_idxp and store the page index of the target 3871 * extent record in *idxp. 3872 */ 3873 xfs_ext_irec_t * 3874 xfs_iext_idx_to_irec( 3875 xfs_ifork_t *ifp, /* inode fork pointer */ 3876 xfs_extnum_t *idxp, /* extent index (file -> page) */ 3877 int *erp_idxp, /* pointer to target irec */ 3878 int realloc) /* new bytes were just added */ 3879 { 3880 xfs_ext_irec_t *prev; /* pointer to previous irec */ 3881 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ 3882 int erp_idx; /* indirection array index */ 3883 int nlists; /* number of irec's (ex lists) */ 3884 int high; /* binary search upper limit */ 3885 int low; /* binary search lower limit */ 3886 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 3887 3888 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3889 ASSERT(page_idx >= 0 && page_idx <= 3890 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 3891 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3892 erp_idx = 0; 3893 low = 0; 3894 high = nlists - 1; 3895 3896 /* Binary search extent irec's */ 3897 while (low <= high) { 3898 erp_idx = (low + high) >> 1; 3899 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3900 prev = erp_idx > 0 ? erp - 1 : NULL; 3901 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && 3902 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { 3903 high = erp_idx - 1; 3904 } else if (page_idx > erp->er_extoff + erp->er_extcount || 3905 (page_idx == erp->er_extoff + erp->er_extcount && 3906 !realloc)) { 3907 low = erp_idx + 1; 3908 } else if (page_idx == erp->er_extoff + erp->er_extcount && 3909 erp->er_extcount == XFS_LINEAR_EXTS) { 3910 ASSERT(realloc); 3911 page_idx = 0; 3912 erp_idx++; 3913 erp = erp_idx < nlists ? erp + 1 : NULL; 3914 break; 3915 } else { 3916 page_idx -= erp->er_extoff; 3917 break; 3918 } 3919 } 3920 *idxp = page_idx; 3921 *erp_idxp = erp_idx; 3922 return(erp); 3923 } 3924 3925 /* 3926 * Allocate and initialize an indirection array once the space needed 3927 * for incore extents increases above XFS_IEXT_BUFSZ. 
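 * The buffer already holding the direct extent list (allocated or
 * grown to XFS_IEXT_BUFSZ as needed) is handed over to become the
 * er_extbuf of the single initial indirection entry, so no separate
 * buffer is allocated for it here.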
3928 */ 3929 void 3930 xfs_iext_irec_init( 3931 xfs_ifork_t *ifp) /* inode fork pointer */ 3932 { 3933 xfs_ext_irec_t *erp; /* indirection array pointer */ 3934 xfs_extnum_t nextents; /* number of extents in file */ 3935 3936 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3937 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3938 ASSERT(nextents <= XFS_LINEAR_EXTS); 3939 3940 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); 3941 3942 if (nextents == 0) { 3943 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); 3944 } else if (!ifp->if_real_bytes) { 3945 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); 3946 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { 3947 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); 3948 } 3949 erp->er_extbuf = ifp->if_u1.if_extents; 3950 erp->er_extcount = nextents; 3951 erp->er_extoff = 0; 3952 3953 ifp->if_flags |= XFS_IFEXTIREC; 3954 ifp->if_real_bytes = XFS_IEXT_BUFSZ; 3955 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); 3956 ifp->if_u1.if_ext_irec = erp; 3957 3958 return; 3959 } 3960 3961 /* 3962 * Allocate and initialize a new entry in the indirection array. 3963 */ 3964 xfs_ext_irec_t * 3965 xfs_iext_irec_new( 3966 xfs_ifork_t *ifp, /* inode fork pointer */ 3967 int erp_idx) /* index for new irec */ 3968 { 3969 xfs_ext_irec_t *erp; /* indirection array pointer */ 3970 int i; /* loop counter */ 3971 int nlists; /* number of irec's (ex lists) */ 3972 3973 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3974 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3975 3976 /* Resize indirection array */ 3977 xfs_iext_realloc_indirect(ifp, ++nlists * 3978 sizeof(xfs_ext_irec_t)); 3979 /* 3980 * Move records down in the array so the 3981 * new page can use erp_idx. 3982 */ 3983 erp = ifp->if_u1.if_ext_irec; 3984 for (i = nlists - 1; i > erp_idx; i--) { 3985 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); 3986 } 3987 ASSERT(i == erp_idx); 3988 3989 /* Initialize new extent record */ 3990 erp = ifp->if_u1.if_ext_irec; 3991 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); 3992 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 3993 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); 3994 erp[erp_idx].er_extcount = 0; 3995 erp[erp_idx].er_extoff = erp_idx > 0 ? 3996 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; 3997 return (&erp[erp_idx]); 3998 } 3999 4000 /* 4001 * Remove a record from the indirection array. 4002 */ 4003 void 4004 xfs_iext_irec_remove( 4005 xfs_ifork_t *ifp, /* inode fork pointer */ 4006 int erp_idx) /* irec index to remove */ 4007 { 4008 xfs_ext_irec_t *erp; /* indirection array pointer */ 4009 int i; /* loop counter */ 4010 int nlists; /* number of irec's (ex lists) */ 4011 4012 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4013 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4014 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4015 if (erp->er_extbuf) { 4016 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, 4017 -erp->er_extcount); 4018 kmem_free(erp->er_extbuf); 4019 } 4020 /* Compact extent records */ 4021 erp = ifp->if_u1.if_ext_irec; 4022 for (i = erp_idx; i < nlists - 1; i++) { 4023 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); 4024 } 4025 /* 4026 * Manually free the last extent record from the indirection 4027 * array. A call to xfs_iext_realloc_indirect() with a size 4028 * of zero would result in a call to xfs_iext_destroy() which 4029 * would in turn call this function again, creating a nasty 4030 * infinite loop. 
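 * Instead, once the last entry is gone the indirection array itself
 * is freed with kmem_free() and if_real_bytes drops to zero.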
4031 */ 4032 if (--nlists) { 4033 xfs_iext_realloc_indirect(ifp, 4034 nlists * sizeof(xfs_ext_irec_t)); 4035 } else { 4036 kmem_free(ifp->if_u1.if_ext_irec); 4037 } 4038 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 4039 } 4040 4041 /* 4042 * This is called to clean up large amounts of unused memory allocated 4043 * by the indirection array. Before compacting anything though, verify 4044 * that the indirection array is still needed and switch back to the 4045 * linear extent list (or even the inline buffer) if possible. The 4046 * compaction policy is as follows: 4047 * 4048 * Full Compaction: Extents fit into a single page (or inline buffer) 4049 * Partial Compaction: Extents occupy less than 50% of allocated space 4050 * No Compaction: Extents occupy at least 50% of allocated space 4051 */ 4052 void 4053 xfs_iext_irec_compact( 4054 xfs_ifork_t *ifp) /* inode fork pointer */ 4055 { 4056 xfs_extnum_t nextents; /* number of extents in file */ 4057 int nlists; /* number of irec's (ex lists) */ 4058 4059 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4060 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4061 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4062 4063 if (nextents == 0) { 4064 xfs_iext_destroy(ifp); 4065 } else if (nextents <= XFS_INLINE_EXTS) { 4066 xfs_iext_indirect_to_direct(ifp); 4067 xfs_iext_direct_to_inline(ifp, nextents); 4068 } else if (nextents <= XFS_LINEAR_EXTS) { 4069 xfs_iext_indirect_to_direct(ifp); 4070 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { 4071 xfs_iext_irec_compact_pages(ifp); 4072 } 4073 } 4074 4075 /* 4076 * Combine extents from neighboring extent pages. 4077 */ 4078 void 4079 xfs_iext_irec_compact_pages( 4080 xfs_ifork_t *ifp) /* inode fork pointer */ 4081 { 4082 xfs_ext_irec_t *erp, *erp_next; /* pointers to irec entries */ 4083 int erp_idx = 0; /* indirection array index */ 4084 int nlists; /* number of irec's (ex lists) */ 4085 4086 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4087 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4088 while (erp_idx < nlists - 1) { 4089 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4090 erp_next = erp + 1; 4091 if (erp_next->er_extcount <= 4092 (XFS_LINEAR_EXTS - erp->er_extcount)) { 4093 memcpy(&erp->er_extbuf[erp->er_extcount], 4094 erp_next->er_extbuf, erp_next->er_extcount * 4095 sizeof(xfs_bmbt_rec_t)); 4096 erp->er_extcount += erp_next->er_extcount; 4097 /* 4098 * Free page before removing extent record 4099 * so er_extoffs don't get modified in 4100 * xfs_iext_irec_remove. 4101 */ 4102 kmem_free(erp_next->er_extbuf); 4103 erp_next->er_extbuf = NULL; 4104 xfs_iext_irec_remove(ifp, erp_idx + 1); 4105 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4106 } else { 4107 erp_idx++; 4108 } 4109 } 4110 } 4111 4112 /* 4113 * This is called to update the er_extoff field in the indirection 4114 * array when extents have been added or removed from one of the 4115 * extent lists. erp_idx contains the irec index to begin updating 4116 * at and ext_diff contains the number of extents that were added 4117 * or removed. 4118 */ 4119 void 4120 xfs_iext_irec_update_extoffs( 4121 xfs_ifork_t *ifp, /* inode fork pointer */ 4122 int erp_idx, /* irec index to update */ 4123 int ext_diff) /* number of extents added or removed */ 4124 { 4125 int i; /* loop counter */ 4126 int nlists; /* number of irec's (ex lists) */ 4127 4128 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4129 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4130 for (i = erp_idx; i < nlists; i++) { 4131 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; 4132 } 4133 } 4134
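/*
 * A minimal, stand-alone sketch of the two-level lookup idea used by
 * xfs_iext_idx_to_irec(): binary-search the indirection entries by
 * their cumulative extent offset, then index into the matching extent
 * buffer. The names here (demo_irec, demo_idx_to_irec) are
 * illustrative stand-ins, not kernel definitions, and the
 * realloc/boundary handling of the original is omitted.
 */
#include <stdio.h>

typedef struct demo_irec {
	int	er_extoff;	/* file-relative index of first record */
	int	er_extcount;	/* records used in this buffer */
} demo_irec;

/* Map a file-relative extent index to an irec and an offset within it. */
static const demo_irec *
demo_idx_to_irec(const demo_irec *irecs, int nlists, int *idxp, int *erp_idxp)
{
	const demo_irec	*erp = NULL;
	int		erp_idx = 0;
	int		low = 0;
	int		high = nlists - 1;
	int		page_idx = *idxp;

	while (low <= high) {
		erp_idx = (low + high) >> 1;
		erp = &irecs[erp_idx];
		if (page_idx < erp->er_extoff) {
			high = erp_idx - 1;
		} else if (page_idx >= erp->er_extoff + erp->er_extcount) {
			low = erp_idx + 1;
		} else {
			page_idx -= erp->er_extoff;
			break;
		}
	}
	*idxp = page_idx;
	*erp_idxp = erp_idx;
	return erp;
}

int
main(void)
{
	/* Three extent buffers holding 4 + 2 + 3 records. */
	demo_irec	irecs[] = {
		{ 0, 4 },
		{ 4, 2 },
		{ 6, 3 },
	};
	int		idx = 7;
	int		erp_idx;

	demo_idx_to_irec(irecs, 3, &idx, &erp_idx);
	printf("file extent index 7 -> irec %d, buffer offset %d\n",
		erp_idx, idx);
	return 0;
}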