/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/log2.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_btree_trace.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_rw.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"

kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define	XFS_ITRUNC_MAX_EXTENTS	2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);

#ifdef DEBUG
/*
 * Make sure that the extents in the given memory buffer
 * are valid.
 */
STATIC void
xfs_validate_extents(
	xfs_ifork_t		*ifp,
	int			nrecs,
	xfs_exntfmt_t		fmt)
{
	xfs_bmbt_irec_t		irec;
	xfs_bmbt_rec_host_t	rec;
	int			i;

	for (i = 0; i < nrecs; i++) {
		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
		rec.l0 = get_unaligned(&ep->l0);
		rec.l1 = get_unaligned(&ep->l1);
		xfs_bmbt_get_all(&rec, &irec);
		if (fmt == XFS_EXTFMT_NOSTATE)
			ASSERT(irec.br_state == XFS_EXT_NORM);
	}
}
#else /* DEBUG */
#define xfs_validate_extents(ifp, nrecs, fmt)
#endif /* DEBUG */
/*
 * Check that none of the inodes in the buffer have a next
 * unlinked field of 0.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
	xfs_mount_t	*mp,
	xfs_buf_t	*bp)
{
	int		i;
	int		j;
	xfs_dinode_t	*dip;

	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

	for (i = 0; i < j; i++) {
		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					i * mp->m_sb.sb_inodesize);
		if (!dip->di_next_unlinked)  {
			xfs_fs_cmn_err(CE_ALERT, mp,
				"Detected a bogus zero next_unlinked field in incore inode buffer 0x%p.  About to pop an ASSERT.",
				bp);
			ASSERT(dip->di_next_unlinked);
		}
	}
}
#endif

/*
 * Find the buffer associated with the given inode map.
 * We do basic validation checks on the buffer once it has been
 * retrieved from disk.
 */
STATIC int
xfs_imap_to_bp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	struct xfs_imap	*imap,
	xfs_buf_t	**bpp,
	uint		buf_flags,
	uint		iget_flags)
{
	int		error;
	int		i;
	int		ni;
	xfs_buf_t	*bp;

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
				   (int)imap->im_len, buf_flags, &bp);
	if (error) {
		if (error != EAGAIN) {
			cmn_err(CE_WARN,
				"xfs_imap_to_bp: xfs_trans_read_buf() returned "
				"an error %d on %s.  Returning error.",
				error, mp->m_fsname);
		} else {
			ASSERT(buf_flags & XBF_TRYLOCK);
		}
		return error;
	}

	/*
	 * Validate the magic number and version of every inode in the buffer
	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
	 */
#ifdef DEBUG
	ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
#else	/* usual case */
	ni = 1;
#endif

	for (i = 0; i < ni; i++) {
		int		di_ok;
		xfs_dinode_t	*dip;

		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					(i << mp->m_sb.sb_inodelog));
		di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC &&
			    XFS_DINODE_GOOD_VERSION(dip->di_version);
		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
						XFS_ERRTAG_ITOBP_INOTOBP,
						XFS_RANDOM_ITOBP_INOTOBP))) {
			if (iget_flags & XFS_IGET_UNTRUSTED) {
				xfs_trans_brelse(tp, bp);
				return XFS_ERROR(EINVAL);
			}
			XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
						XFS_ERRLEVEL_HIGH, mp, dip);
#ifdef DEBUG
			cmn_err(CE_PANIC,
					"Device %s - bad inode magic/vsn "
					"daddr %lld #%d (magic=%x)",
				XFS_BUFTARG_NAME(mp->m_ddev_targp),
				(unsigned long long)imap->im_blkno, i,
				be16_to_cpu(dip->di_magic));
#endif
			xfs_trans_brelse(tp, bp);
			return XFS_ERROR(EFSCORRUPTED);
		}
	}

	xfs_inobp_check(mp, bp);

	/*
	 * Mark the buffer as an inode buffer now that it looks good
	 */
	XFS_BUF_SET_VTYPE(bp, B_FS_INO);

	*bpp = bp;
	return 0;
}

/*
 * This routine is called to map an inode number within a file
 * system to the buffer containing the on-disk version of the
 * inode.  It returns a pointer to the buffer containing the
 * on-disk inode in the bpp parameter, and in the dipp parameter
 * it returns a pointer to the on-disk inode within that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
 *
 * Use xfs_imap() to determine the size and location of the
 * buffer to read from disk.
 */
int
xfs_inotobp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	xfs_dinode_t	**dipp,
	xfs_buf_t	**bpp,
	int		*offset,
	uint		imap_flags)
{
	struct xfs_imap	imap;
	xfs_buf_t	*bp;
	int		error;

	imap.im_blkno = 0;
	error = xfs_imap(mp, tp, ino, &imap, imap_flags);
	if (error)
		return error;

	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
	if (error)
		return error;

	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
	*bpp = bp;
	*offset = imap.im_boffset;
	return 0;
}
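
/*
 * Illustrative sketch (an assumption added for documentation, not code
 * that is compiled here): a caller that wants to peek at the on-disk
 * inode for a known inode number would typically do:
 *
 *	xfs_dinode_t	*dip;
 *	xfs_buf_t	*bp;
 *	int		offset;
 *	int		error;
 *
 *	error = xfs_inotobp(mp, tp, ino, &dip, &bp, &offset, 0);
 *	if (error)
 *		return error;
 *	... examine *dip, which lives "offset" bytes into bp ...
 *	xfs_trans_brelse(tp, bp);
 *
 * The buffer must be released with xfs_trans_brelse() because it was
 * acquired through xfs_trans_read_buf() in xfs_imap_to_bp().
 */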
/*
 * This routine is called to map an inode to the buffer containing
 * the on-disk version of the inode.  It returns a pointer to the
 * buffer containing the on-disk inode in the bpp parameter, and in
 * the dipp parameter it returns a pointer to the on-disk inode within
 * that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
 *
 * The inode is expected to have already been mapped to its buffer and
 * read in once, thus we can use the mapping information stored in the
 * inode rather than calling xfs_imap().  This allows us to avoid the
 * overhead of looking at the inode btree for small block file systems
 * (see xfs_imap()).
 */
int
xfs_itobp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	xfs_dinode_t	**dipp,
	xfs_buf_t	**bpp,
	uint		buf_flags)
{
	xfs_buf_t	*bp;
	int		error;

	ASSERT(ip->i_imap.im_blkno != 0);

	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
	if (error)
		return error;

	if (!bp) {
		ASSERT(buf_flags & XBF_TRYLOCK);
		ASSERT(tp == NULL);
		*bpp = NULL;
		return EAGAIN;
	}

	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
	*bpp = bp;
	return 0;
}
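
/*
 * Illustrative sketch (assumption, not compiled here): callers that
 * cannot afford to sleep pass XBF_TRYLOCK and must be prepared for a
 * NULL buffer / EAGAIN result, e.g.:
 *
 *	error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK);
 *	if (error == EAGAIN)
 *		return error;	(the buffer was busy; try again later)
 *	if (error)
 *		return error;
 *
 * Note that the trylock variant is only legal outside a transaction
 * (tp == NULL), as the ASSERTs above enforce.
 */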
/*
 * Move inode type and inode format specific information from the
 * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 * this means set if_rdev to the proper value.  For files, directories,
 * and symlinks this means to bring in the in-line data or extent
 * pointers.  For a file in B-tree format, only the root is immediately
 * brought in-core.  The rest will be in-lined in if_extents when it
 * is first referenced (see xfs_iread_extents()).
 */
STATIC int
xfs_iformat(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip)
{
	xfs_attr_shortform_t	*atp;
	int			size;
	int			error;
	xfs_fsize_t		di_size;

	ip->i_df.if_ext_max =
		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	error = 0;

	if (unlikely(be32_to_cpu(dip->di_nextents) +
		     be16_to_cpu(dip->di_anextents) >
		     be64_to_cpu(dip->di_nblocks))) {
		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
			(unsigned long long)ip->i_ino,
			(int)(be32_to_cpu(dip->di_nextents) +
			      be16_to_cpu(dip->di_anextents)),
			(unsigned long long)
				be64_to_cpu(dip->di_nblocks));
		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
			"corrupt dinode %Lu, forkoff = 0x%x.",
			(unsigned long long)ip->i_ino,
			dip->di_forkoff);
		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
		     !ip->i_mount->m_rtdev_targp)) {
		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
			"corrupt dinode %Lu, has realtime flag set.",
			ip->i_ino);
		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	switch (ip->i_d.di_mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
					     ip->i_mount, dip);
			return XFS_ERROR(EFSCORRUPTED);
		}
		ip->i_d.di_size = 0;
		ip->i_size = 0;
		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
		break;

	case S_IFREG:
	case S_IFLNK:
	case S_IFDIR:
		switch (dip->di_format) {
		case XFS_DINODE_FMT_LOCAL:
			/*
			 * no local regular files yet
			 */
			if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
				xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
					"corrupt inode %Lu "
					"(local format for regular file).",
					(unsigned long long) ip->i_ino);
				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			di_size = be64_to_cpu(dip->di_size);
			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
				xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
					"corrupt inode %Lu "
					"(bad size %Ld for local inode).",
					(unsigned long long) ip->i_ino,
					(long long) di_size);
				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			size = (int)di_size;
			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
			break;
		case XFS_DINODE_FMT_EXTENTS:
			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
			break;
		case XFS_DINODE_FMT_BTREE:
			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
			break;
		default:
			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
					 ip->i_mount);
			return XFS_ERROR(EFSCORRUPTED);
		}
		break;

	default:
		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	if (error) {
		return error;
	}
	if (!XFS_DFORK_Q(dip))
		return 0;
	ASSERT(ip->i_afp == NULL);
	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
	ip->i_afp->if_ext_max =
		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	switch (dip->di_aformat) {
	case XFS_DINODE_FMT_LOCAL:
		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
		size = be16_to_cpu(atp->hdr.totsize);

		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
			xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
				"corrupt inode %Lu "
				"(bad attr fork size %Ld).",
				(unsigned long long) ip->i_ino,
				(long long) size);
			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
					     XFS_ERRLEVEL_LOW,
					     ip->i_mount, dip);
			return XFS_ERROR(EFSCORRUPTED);
		}

		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
		break;
	case XFS_DINODE_FMT_BTREE:
		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
		break;
	default:
		error = XFS_ERROR(EFSCORRUPTED);
		break;
	}
	if (error) {
		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
		ip->i_afp = NULL;
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
	}
	return error;
}
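
/*
 * Illustrative note (hedged; the exact numbers depend on the
 * superblock's inode size and the dinode core layout): the per-fork
 * extent maxima set above simply divide each fork's share of the
 * inode literal area by the size of one packed extent record, e.g.
 *
 *	if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
 *
 * With 256-byte inodes only a handful of 16-byte records fit inline,
 * which is why anything larger falls back to the extent-list or
 * B-tree formats handled by xfs_iformat_extents()/xfs_iformat_btree().
 */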
/*
 * The file is in-lined in the on-disk inode.
 * If it fits into if_inline_data, then copy
 * it there, otherwise allocate a buffer for it
 * and copy the data there.  Either way, set
 * if_data to point at the data.
 * If we allocate a buffer for the data, make
 * sure that its size is a multiple of 4 and
 * record the real size in if_real_bytes.
 */
STATIC int
xfs_iformat_local(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork,
	int		size)
{
	xfs_ifork_t	*ifp;
	int		real_size;

	/*
	 * If the size is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
			"corrupt inode %Lu "
			"(bad size %d for local fork, size = %d).",
			(unsigned long long) ip->i_ino, size,
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}
	ifp = XFS_IFORK_PTR(ip, whichfork);
	real_size = 0;
	if (size == 0)
		ifp->if_u1.if_data = NULL;
	else if (size <= sizeof(ifp->if_u2.if_inline_data))
		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
	else {
		real_size = roundup(size, 4);
		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
	}
	ifp->if_bytes = size;
	ifp->if_real_bytes = real_size;
	if (size)
		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFINLINE;
	return 0;
}

/*
 * The file consists of a set of extents all
 * of which fit into the on-disk inode.
 * If there are few enough extents to fit into
 * the if_inline_ext, then copy them there.
 * Otherwise allocate a buffer for them and copy
 * them into it.  Either way, set if_extents
 * to point at the extents.
 */
STATIC int
xfs_iformat_extents(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork)
{
	xfs_bmbt_rec_t	*dp;
	xfs_ifork_t	*ifp;
	int		nex;
	int		size;
	int		i;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
	size = nex * (uint)sizeof(xfs_bmbt_rec_t);

	/*
	 * If the number of extents is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
			"corrupt inode %Lu ((a)extents = %d).",
			(unsigned long long) ip->i_ino, nex);
		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_real_bytes = 0;
	if (nex == 0)
		ifp->if_u1.if_extents = NULL;
	else if (nex <= XFS_INLINE_EXTS)
		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
	else
		xfs_iext_add(ifp, 0, nex);

	ifp->if_bytes = size;
	if (size) {
		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
		for (i = 0; i < nex; i++, dp++) {
			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
			ep->l0 = get_unaligned_be64(&dp->l0);
			ep->l1 = get_unaligned_be64(&dp->l1);
		}
		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
		if (whichfork != XFS_DATA_FORK ||
			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
				if (unlikely(xfs_check_nostate_extents(
				    ifp, 0, nex))) {
					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
							 XFS_ERRLEVEL_LOW,
							 ip->i_mount);
					return XFS_ERROR(EFSCORRUPTED);
				}
	}
	ifp->if_flags |= XFS_IFEXTENTS;
	return 0;
}
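
/*
 * Illustrative sketch (an assumption, not code compiled here): once a
 * fork is in local format, consumers read straight out of if_data.
 * A symlink read, for instance, reduces to roughly:
 *
 *	xfs_ifork_t	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 *
 *	ASSERT(ifp->if_flags & XFS_IFINLINE);
 *	memcpy(link, ifp->if_u1.if_data, ifp->if_bytes);
 *	link[ifp->if_bytes] = '\0';
 *
 * No buffer reads are needed because xfs_iformat_local() already
 * copied the payload out of the dinode literal area.
 */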
/*
 * The file has too many extents to fit into
 * the inode, so they are in B-tree format.
 * Allocate a buffer for the root of the B-tree
 * and copy the root into it.  The i_extents
 * field will remain NULL until all of the
 * extents are read in (when they are needed).
 */
STATIC int
xfs_iformat_btree(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip,
	int			whichfork)
{
	xfs_bmdr_block_t	*dfp;
	xfs_ifork_t		*ifp;
	/* REFERENCED */
	int			nrecs;
	int			size;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
	size = XFS_BMAP_BROOT_SPACE(dfp);
	nrecs = be16_to_cpu(dfp->bb_numrecs);

	/*
	 * blow out if -- fork has fewer extents than can fit in
	 * fork (fork shouldn't be a btree format), root btree
	 * block has more records than can fit into the fork,
	 * or the number of extents is greater than the number of
	 * blocks.
	 */
	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
	    || XFS_BMDR_SPACE_CALC(nrecs) >
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
	    || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
			"corrupt inode %Lu (btree).",
			(unsigned long long) ip->i_ino);
		XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
				 ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_broot_bytes = size;
	ifp->if_broot = kmem_alloc(size, KM_SLEEP);
	ASSERT(ifp->if_broot != NULL);
	/*
	 * Copy and convert from the on-disk structure
	 * to the in-memory structure.
	 */
	xfs_bmdr_to_bmbt(ip->i_mount, dfp,
			 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
			 ifp->if_broot, size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFBROOT;

	return 0;
}

STATIC void
xfs_dinode_from_disk(
	xfs_icdinode_t	*to,
	xfs_dinode_t	*from)
{
	to->di_magic = be16_to_cpu(from->di_magic);
	to->di_mode = be16_to_cpu(from->di_mode);
	to->di_version = from->di_version;
	to->di_format = from->di_format;
	to->di_onlink = be16_to_cpu(from->di_onlink);
	to->di_uid = be32_to_cpu(from->di_uid);
	to->di_gid = be32_to_cpu(from->di_gid);
	to->di_nlink = be32_to_cpu(from->di_nlink);
	to->di_projid = be16_to_cpu(from->di_projid);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = be16_to_cpu(from->di_flushiter);
	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
	to->di_size = be64_to_cpu(from->di_size);
	to->di_nblocks = be64_to_cpu(from->di_nblocks);
	to->di_extsize = be32_to_cpu(from->di_extsize);
	to->di_nextents = be32_to_cpu(from->di_nextents);
	to->di_anextents = be16_to_cpu(from->di_anextents);
	to->di_forkoff = from->di_forkoff;
	to->di_aformat = from->di_aformat;
	to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
	to->di_dmstate = be16_to_cpu(from->di_dmstate);
	to->di_flags = be16_to_cpu(from->di_flags);
	to->di_gen = be32_to_cpu(from->di_gen);
}
void
xfs_dinode_to_disk(
	xfs_dinode_t	*to,
	xfs_icdinode_t	*from)
{
	to->di_magic = cpu_to_be16(from->di_magic);
	to->di_mode = cpu_to_be16(from->di_mode);
	to->di_version = from->di_version;
	to->di_format = from->di_format;
	to->di_onlink = cpu_to_be16(from->di_onlink);
	to->di_uid = cpu_to_be32(from->di_uid);
	to->di_gid = cpu_to_be32(from->di_gid);
	to->di_nlink = cpu_to_be32(from->di_nlink);
	to->di_projid = cpu_to_be16(from->di_projid);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = cpu_to_be16(from->di_flushiter);
	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
	to->di_size = cpu_to_be64(from->di_size);
	to->di_nblocks = cpu_to_be64(from->di_nblocks);
	to->di_extsize = cpu_to_be32(from->di_extsize);
	to->di_nextents = cpu_to_be32(from->di_nextents);
	to->di_anextents = cpu_to_be16(from->di_anextents);
	to->di_forkoff = from->di_forkoff;
	to->di_aformat = from->di_aformat;
	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
	to->di_dmstate = cpu_to_be16(from->di_dmstate);
	to->di_flags = cpu_to_be16(from->di_flags);
	to->di_gen = cpu_to_be32(from->di_gen);
}

STATIC uint
_xfs_dic2xflags(
	__uint16_t		di_flags)
{
	uint			flags = 0;

	if (di_flags & XFS_DIFLAG_ANY) {
		if (di_flags & XFS_DIFLAG_REALTIME)
			flags |= XFS_XFLAG_REALTIME;
		if (di_flags & XFS_DIFLAG_PREALLOC)
			flags |= XFS_XFLAG_PREALLOC;
		if (di_flags & XFS_DIFLAG_IMMUTABLE)
			flags |= XFS_XFLAG_IMMUTABLE;
		if (di_flags & XFS_DIFLAG_APPEND)
			flags |= XFS_XFLAG_APPEND;
		if (di_flags & XFS_DIFLAG_SYNC)
			flags |= XFS_XFLAG_SYNC;
		if (di_flags & XFS_DIFLAG_NOATIME)
			flags |= XFS_XFLAG_NOATIME;
		if (di_flags & XFS_DIFLAG_NODUMP)
			flags |= XFS_XFLAG_NODUMP;
		if (di_flags & XFS_DIFLAG_RTINHERIT)
			flags |= XFS_XFLAG_RTINHERIT;
		if (di_flags & XFS_DIFLAG_PROJINHERIT)
			flags |= XFS_XFLAG_PROJINHERIT;
		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
			flags |= XFS_XFLAG_NOSYMLINKS;
		if (di_flags & XFS_DIFLAG_EXTSIZE)
			flags |= XFS_XFLAG_EXTSIZE;
		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= XFS_XFLAG_EXTSZINHERIT;
		if (di_flags & XFS_DIFLAG_NODEFRAG)
			flags |= XFS_XFLAG_NODEFRAG;
		if (di_flags & XFS_DIFLAG_FILESTREAM)
			flags |= XFS_XFLAG_FILESTREAM;
	}

	return flags;
}

uint
xfs_ip2xflags(
	xfs_inode_t		*ip)
{
	xfs_icdinode_t		*dic = &ip->i_d;

	return _xfs_dic2xflags(dic->di_flags) |
				(XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
	xfs_dinode_t		*dip)
{
	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}
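
/*
 * Illustrative sketch (assumption, not compiled here): these helpers
 * let callers test user-visible flags without caring whether they hold
 * an in-core or an on-disk inode, e.g.:
 *
 *	if (xfs_ip2xflags(ip) & XFS_XFLAG_IMMUTABLE)
 *		return XFS_ERROR(EPERM);
 *
 * Both variants funnel through _xfs_dic2xflags() so the DIFLAG to
 * XFLAG translation lives in exactly one place.
 */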
/*
 * Read the disk inode attributes into the in-core inode structure.
 */
int
xfs_iread(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	uint		iget_flags)
{
	xfs_buf_t	*bp;
	xfs_dinode_t	*dip;
	int		error;

	/*
	 * Fill in the location information in the in-core inode.
	 */
	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
	if (error)
		return error;

	/*
	 * Get pointers to the on-disk inode and the buffer containing it.
	 */
	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
			       XBF_LOCK, iget_flags);
	if (error)
		return error;
	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);

	/*
	 * If we got something that isn't an inode it means someone
	 * (nfs or dmi) has a stale handle.
	 */
	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
#ifdef DEBUG
		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
				"dip->di_magic (0x%x) != "
				"XFS_DINODE_MAGIC (0x%x)",
				be16_to_cpu(dip->di_magic),
				XFS_DINODE_MAGIC);
#endif /* DEBUG */
		error = XFS_ERROR(EINVAL);
		goto out_brelse;
	}

	/*
	 * If the on-disk inode is already linked to a directory
	 * entry, copy all of the inode into the in-core inode.
	 * xfs_iformat() handles copying in the inode format
	 * specific information.
	 * Otherwise, just get the truly permanent information.
	 */
	if (dip->di_mode) {
		xfs_dinode_from_disk(&ip->i_d, dip);
		error = xfs_iformat(ip, dip);
		if (error)  {
#ifdef DEBUG
			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
					"xfs_iformat() returned error %d",
					error);
#endif /* DEBUG */
			goto out_brelse;
		}
	} else {
		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
		ip->i_d.di_version = dip->di_version;
		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
		/*
		 * Make sure to pull in the mode here as well in
		 * case the inode is released without being used.
		 * This ensures that xfs_inactive() will see that
		 * the inode is already free and not try to mess
		 * with the uninitialized part of it.
		 */
		ip->i_d.di_mode = 0;
		/*
		 * Initialize the per-fork minima and maxima for a new
		 * inode here.  xfs_iformat will do it for old inodes.
		 */
		ip->i_df.if_ext_max =
			XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
	}

	/*
	 * The inode format changed when we moved the link count and
	 * made it 32 bits long.  If this is an old format inode,
	 * convert it in memory to look like a new one.  If it gets
	 * flushed to disk we will convert back before flushing or
	 * logging it.  We zero out the new projid field and the old link
	 * count field.  We'll handle clearing the pad field (the remains
	 * of the old uuid field) when we actually convert the inode to
	 * the new format.  We don't change the version number so that we
	 * can distinguish this from a real new format inode.
	 */
	if (ip->i_d.di_version == 1) {
		ip->i_d.di_nlink = ip->i_d.di_onlink;
		ip->i_d.di_onlink = 0;
		ip->i_d.di_projid = 0;
	}

	ip->i_delayed_blks = 0;
	ip->i_size = ip->i_d.di_size;

	/*
	 * Mark the buffer containing the inode as something to keep
	 * around for a while.  This helps to keep recently accessed
	 * meta-data in-core longer.
	 */
	XFS_BUF_SET_REF(bp, XFS_INO_REF);

	/*
	 * Use xfs_trans_brelse() to release the buffer containing the
	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
	 * in xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
	 * will only release the buffer if it is not dirty within the
	 * transaction.  It will be OK to release the buffer in this case,
	 * because inodes on disk are never destroyed and we will be
	 * locking the new in-core inode before putting it in the hash
	 * table where other processes can find it.  Thus we don't have
	 * to worry about the inode being changed just because we released
	 * the buffer.
	 */
 out_brelse:
	xfs_trans_brelse(tp, bp);
	return error;
}
/*
 * Read in extents from a btree-format inode.
 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 */
int
xfs_iread_extents(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	int		whichfork)
{
	int		error;
	xfs_ifork_t	*ifp;
	xfs_extnum_t	nextents;
	size_t		size;

	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
				 ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
	size = nextents * sizeof(xfs_bmbt_rec_t);
	ifp = XFS_IFORK_PTR(ip, whichfork);

	/*
	 * We know that the size is valid (it's checked in iformat_btree)
	 */
	ifp->if_lastex = NULLEXTNUM;
	ifp->if_bytes = ifp->if_real_bytes = 0;
	ifp->if_flags |= XFS_IFEXTENTS;
	xfs_iext_add(ifp, 0, nextents);
	error = xfs_bmap_read_extents(tp, ip, whichfork);
	if (error) {
		xfs_iext_destroy(ifp);
		ifp->if_flags &= ~XFS_IFEXTENTS;
		return error;
	}
	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
	return 0;
}
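
/*
 * Illustrative sketch (assumption, not compiled here): extent reads
 * are lazy, so callers that need the full extent list guard the call
 * with a check of XFS_IFEXTENTS, e.g.:
 *
 *	ifp = XFS_IFORK_PTR(ip, whichfork);
 *	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 *		error = xfs_iread_extents(tp, ip, whichfork);
 *		if (error)
 *			return error;
 *	}
 *
 * After a successful return every extent record is in-core and
 * xfs_iext_get_ext() can be used without further I/O.
 */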
/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode.  If xfs_dialloc()
 * has a free inode available, call xfs_iget()
 * to obtain the in-core version of the allocated inode.  Finally,
 * fill in the inode and log its initial contents.  In this case,
 * ialloc_context would be set to NULL and call_again set to false.
 *
 * If xfs_dialloc() does not have an available inode,
 * it will replenish its supply by doing an allocation.  Since we can
 * only do one allocation within a transaction without deadlocks, we
 * must commit the current transaction before returning the inode itself.
 * In this case, therefore, we will set call_again to true and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
int
xfs_ialloc(
	xfs_trans_t	*tp,
	xfs_inode_t	*pip,
	mode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	cred_t		*cr,
	xfs_prid_t	prid,
	int		okalloc,
	xfs_buf_t	**ialloc_context,
	boolean_t	*call_again,
	xfs_inode_t	**ipp)
{
	xfs_ino_t	ino;
	xfs_inode_t	*ip;
	uint		flags;
	int		error;
	timespec_t	tv;
	int		filestreams = 0;

	/*
	 * Call the space management code to pick
	 * the on-disk inode to be allocated.
	 */
	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
			    ialloc_context, call_again, &ino);
	if (error)
		return error;
	if (*call_again || ino == NULLFSINO) {
		*ipp = NULL;
		return 0;
	}
	ASSERT(*ialloc_context == NULL);

	/*
	 * Get the in-core inode with the lock held exclusively.
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_trans_iget(tp->t_mountp, tp, ino,
				XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	ASSERT(ip != NULL);

	ip->i_d.di_mode = (__uint16_t)mode;
	ip->i_d.di_onlink = 0;
	ip->i_d.di_nlink = nlink;
	ASSERT(ip->i_d.di_nlink == nlink);
	ip->i_d.di_uid = current_fsuid();
	ip->i_d.di_gid = current_fsgid();
	ip->i_d.di_projid = prid;
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

	/*
	 * If the superblock version is up to where we support new format
	 * inodes and this is currently an old format inode, then change
	 * the inode version number now.  This way we only do the conversion
	 * here rather than here and in the flush/logging code.
	 */
	if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
	    ip->i_d.di_version == 1) {
		ip->i_d.di_version = 2;
		/*
		 * We've already zeroed the old link count, the projid field,
		 * and the pad field.
		 */
	}

	/*
	 * Project ids won't be stored on disk if we are using a version 1
	 * inode.
	 */
	if ((prid != 0) && (ip->i_d.di_version == 1))
		xfs_bump_ino_vers2(tp, ip);

	if (pip && XFS_INHERIT_GID(pip)) {
		ip->i_d.di_gid = pip->i_d.di_gid;
		if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
			ip->i_d.di_mode |= S_ISGID;
		}
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if ((irix_sgid_inherit) &&
	    (ip->i_d.di_mode & S_ISGID) &&
	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
		ip->i_d.di_mode &= ~S_ISGID;
	}

	ip->i_d.di_size = 0;
	ip->i_size = 0;
	ip->i_d.di_nextents = 0;
	ASSERT(ip->i_d.di_nblocks == 0);

	nanotime(&tv);
	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
	ip->i_d.di_atime = ip->i_d.di_mtime;
	ip->i_d.di_ctime = ip->i_d.di_mtime;

	/*
	 * di_gen will have been taken care of in xfs_iread.
	 */
	ip->i_d.di_extsize = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;
	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
		ip->i_df.if_u2.if_rdev = rdev;
		ip->i_df.if_flags = 0;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
		/*
		 * we can't set up filestreams until after the VFS inode
		 * is set up properly.
		 */
		if (pip && xfs_inode_is_filestream(pip))
			filestreams = 1;
		/* fall through */
	case S_IFDIR:
		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
			uint	di_flags = 0;

			if ((mode & S_IFMT) == S_IFDIR) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			} else if ((mode & S_IFMT) == S_IFREG) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_REALTIME;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSIZE;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			}
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
			    xfs_inherit_noatime)
				di_flags |= XFS_DIFLAG_NOATIME;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
			    xfs_inherit_nodump)
				di_flags |= XFS_DIFLAG_NODUMP;
			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
			    xfs_inherit_sync)
				di_flags |= XFS_DIFLAG_SYNC;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
			    xfs_inherit_nosymlinks)
				di_flags |= XFS_DIFLAG_NOSYMLINKS;
			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
			    xfs_inherit_nodefrag)
				di_flags |= XFS_DIFLAG_NODEFRAG;
			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
				di_flags |= XFS_DIFLAG_FILESTREAM;
			ip->i_d.di_flags |= di_flags;
		}
		/* FALLTHROUGH */
	case S_IFLNK:
		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_flags = XFS_IFEXTENTS;
		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
		ip->i_df.if_u1.if_extents = NULL;
		break;
	default:
		ASSERT(0);
	}
	/*
	 * Attribute fork settings for new inode.
	 */
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_anextents = 0;

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup inode ops and unlock */
	xfs_setup_inode(ip);

	/* now we have set up the vfs inode we can associate the filestream */
	if (filestreams) {
		error = xfs_filestream_associate(pip, ip);
		if (error < 0)
			return -error;
		if (!error)
			xfs_iflags_set(ip, XFS_IFILESTREAM);
	}

	*ipp = ip;
	return 0;
}
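
/*
 * Illustrative sketch (assumption, not compiled here) of the two-phase
 * protocol described above xfs_ialloc().  A creating caller retries
 * when call_again comes back true, committing and restarting its
 * transaction in between while holding ialloc_context:
 *
 *	boolean_t	call_again = B_FALSE;
 *	xfs_buf_t	*ialloc_context = NULL;
 *
 *	error = xfs_ialloc(tp, dp, mode, 1, rdev, credp, prid, okalloc,
 *			   &ialloc_context, &call_again, &ip);
 *	if (!error && call_again) {
 *		... commit the transaction, reserve a new one, then ...
 *		error = xfs_ialloc(tp, dp, mode, 1, rdev, credp, prid,
 *				   okalloc, &ialloc_context, &call_again,
 *				   &ip);
 *	}
 *
 * The real sequencing (see xfs_dir_ialloc() in xfs_utils.c) also
 * rejoins the inode and the AGI freelist buffer to the new
 * transaction; this sketch only shows the retry shape.
 */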
/*
 * Check to make sure that there are no blocks allocated to the
 * file beyond the size of the file.  We don't check this for
 * files with fixed size extents or real time extents, but we
 * at least do it for regular files.
 */
#ifdef DEBUG
void
xfs_isize_check(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip,
	xfs_fsize_t	isize)
{
	xfs_fileoff_t	map_first;
	int		nimaps;
	xfs_bmbt_irec_t	imaps[2];

	if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
		return;

	if (XFS_IS_REALTIME_INODE(ip))
		return;

	if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
		return;

	nimaps = 2;
	map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	/*
	 * The filesystem could be shutting down, so bmapi may return
	 * an error.
	 */
	if (xfs_bmapi(NULL, ip, map_first,
			 (XFS_B_TO_FSB(mp,
				       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
			  map_first),
			 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
			 NULL, NULL))
		return;
	ASSERT(nimaps == 1);
	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
}
#endif	/* DEBUG */

/*
 * Calculate the last possible buffered byte in a file.  This must
 * include data that was buffered beyond the EOF by the write code.
 * This also needs to deal with overflowing the xfs_fsize_t type
 * which can happen for sizes near the limit.
 *
 * We also need to take into account any blocks beyond the EOF.  It
 * may be the case that they were buffered by a write which failed.
 * In that case the pages will still be in memory, but the inode size
 * will never have been updated.
 */
STATIC xfs_fsize_t
xfs_file_last_byte(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_fsize_t	last_byte;
	xfs_fileoff_t	last_block;
	xfs_fileoff_t	size_last_block;
	int		error;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));

	mp = ip->i_mount;
	/*
	 * Only check for blocks beyond the EOF if the extents have
	 * been read in.  This eliminates the need for the inode lock,
	 * and it also saves us from looking when it really isn't
	 * necessary.
	 */
	if (ip->i_df.if_flags & XFS_IFEXTENTS) {
		xfs_ilock(ip, XFS_ILOCK_SHARED);
		error = xfs_bmap_last_offset(NULL, ip, &last_block,
			XFS_DATA_FORK);
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		if (error) {
			last_block = 0;
		}
	} else {
		last_block = 0;
	}
	size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_size);
	last_block = XFS_FILEOFF_MAX(last_block, size_last_block);

	last_byte = XFS_FSB_TO_B(mp, last_block);
	if (last_byte < 0) {
		return XFS_MAXIOFFSET(mp);
	}
	last_byte += (1 << mp->m_writeio_log);
	if (last_byte < 0) {
		return XFS_MAXIOFFSET(mp);
	}
	return last_byte;
}
/*
 * Start the truncation of the file to new_size.  The new size
 * must be smaller than the current size.  This routine will
 * clear the buffer and page caches of file data in the removed
 * range, and xfs_itruncate_finish() will remove the underlying
 * disk blocks.
 *
 * The inode must have its I/O lock locked EXCLUSIVELY, and it
 * must NOT have the inode lock held at all.  This is because we're
 * calling into the buffer/page cache code and we can't hold the
 * inode lock when we do so.
 *
 * We need to wait for any direct I/Os in flight to complete before we
 * proceed with the truncate.  This is needed to prevent the extents
 * being read or written by the direct I/Os from being removed while the
 * I/O is in flight as there is no other method of synchronising
 * direct I/O with the truncate operation.  Also, because we hold
 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
 * started until the truncate completes and drops the lock.  Essentially,
 * the xfs_ioend_wait() call forms an I/O barrier that provides strict
 * ordering between direct I/Os and the truncate operation.
 *
 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
 * or XFS_ITRUNC_MAYBE.  The XFS_ITRUNC_MAYBE value should be used
 * in the case that the caller is locking things out of order and
 * may not be able to call xfs_itruncate_finish() with the inode lock
 * held without dropping the I/O lock.  If the caller must drop the
 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
 * must be called again with all the same restrictions as the initial
 * call.
 */
int
xfs_itruncate_start(
	xfs_inode_t	*ip,
	uint		flags,
	xfs_fsize_t	new_size)
{
	xfs_fsize_t	last_byte;
	xfs_off_t	toss_start;
	xfs_mount_t	*mp;
	int		error = 0;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT((new_size == 0) || (new_size <= ip->i_size));
	ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
	       (flags == XFS_ITRUNC_MAYBE));

	mp = ip->i_mount;

	/* wait for the completion of any pending DIOs */
	if (new_size == 0 || new_size < ip->i_size)
		xfs_ioend_wait(ip);

	/*
	 * Call toss_pages or flushinval_pages to get rid of pages
	 * overlapping the region being removed.  We have to use
	 * the less efficient flushinval_pages in the case that the
	 * caller may not be able to finish the truncate without
	 * dropping the inode's I/O lock.  Make sure
	 * to catch any pages brought in by buffers overlapping
	 * the EOF by searching out beyond the isize by our
	 * block size.  We round new_size up to a block boundary
	 * so that we don't toss things on the same block as
	 * new_size but before it.
	 *
	 * Before calling toss_pages or flushinval_pages, make sure to
	 * call remapf() over the same region if the file is mapped.
	 * This frees up mapped file references to the pages in the
	 * given range and for the flushinval_pages case it ensures
	 * that we get the latest mapped changes flushed out.
	 */
	toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	toss_start = XFS_FSB_TO_B(mp, toss_start);
	if (toss_start < 0) {
		/*
		 * The place to start tossing is beyond our maximum
		 * file size, so there is no way that the data extended
		 * out there.
		 */
		return 0;
	}
	last_byte = xfs_file_last_byte(ip);
	trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte);
	if (last_byte > toss_start) {
		if (flags & XFS_ITRUNC_DEFINITE) {
			xfs_tosspages(ip, toss_start,
					-1, FI_REMAPF_LOCKED);
		} else {
			error = xfs_flushinval_pages(ip, toss_start,
					-1, FI_REMAPF_LOCKED);
		}
	}

#ifdef DEBUG
	if (new_size == 0) {
		ASSERT(VN_CACHED(VFS_I(ip)) == 0);
	}
#endif
	return error;
}
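
/*
 * Illustrative sketch (assumption, not compiled here) of the intended
 * calling sequence for a definite truncate, per the locking rules in
 * the comments above:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *	error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, new_size);
 *	... reserve a permanent-log-res transaction tp ...
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 *	xfs_trans_ihold(tp, ip);
 *	error = xfs_itruncate_finish(&tp, ip, new_size, XFS_DATA_FORK, sync);
 *	... commit or cancel tp, then unlock both locks ...
 *
 * The page cache work happens in start() without the inode lock; the
 * block freeing happens in finish() with both locks held.
 */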
/*
 * Shrink the file to the given new_size.  The new size must be smaller than
 * the current size.  This will free up the underlying blocks in the removed
 * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
 * indicates the fork which is to be truncated.  For the attribute fork we only
 * support truncation to size 0.
 *
 * We use the sync parameter to indicate whether or not the first transaction
 * we perform might have to be synchronous.  For the attr fork, it needs to be
 * so if the unlink of the inode is not yet known to be permanent in the log.
 * This keeps us from freeing and reusing the blocks of the attribute fork
 * before the unlink of the inode becomes permanent.
 *
 * For the data fork, we normally have to run synchronously if we're being
 * called out of the inactive path or we're being called out of the create path
 * where we're truncating an existing file.  Either way, the truncate needs to
 * be sync so blocks don't reappear in the file with altered data in case of a
 * crash.  wsync filesystems can run the first case async because anything that
 * shrinks the inode has to run sync so by the time we're called here from
 * inactive, the inode size is permanently set to 0.
 *
 * Calls from the truncate path always need to be sync unless we're in a wsync
 * filesystem and the file has already been unlinked.
 *
 * The caller is responsible for correctly setting the sync parameter.  It gets
 * too hard for us to guess here which path we're being called out of just
 * based on inode state.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction.  This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_finish(
	xfs_trans_t	**tp,
	xfs_inode_t	*ip,
	xfs_fsize_t	new_size,
	int		fork,
	int		sync)
{
	xfs_fsblock_t	first_block;
	xfs_fileoff_t	first_unmap_block;
	xfs_fileoff_t	last_block;
	xfs_filblks_t	unmap_len = 0;
	xfs_mount_t	*mp;
	xfs_trans_t	*ntp;
	int		done;
	int		committed;
	xfs_bmap_free_t	free_list;
	int		error;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
	ASSERT((new_size == 0) || (new_size <= ip->i_size));
	ASSERT(*tp != NULL);
	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_transp == *tp);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);

	ntp = *tp;
	mp = (ntp)->t_mountp;
	ASSERT(! XFS_NOT_DQATTACHED(mp, ip));

	/*
	 * We only support truncating the entire attribute fork.
	 */
	if (fork == XFS_ATTR_FORK) {
		new_size = 0LL;
	}
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	trace_xfs_itruncate_finish_start(ip, new_size);
	/*
	 * The first thing we do is set the size to new_size permanently
	 * on disk.  This way we don't have to worry about anyone ever
	 * being able to look at the data being freed even in the face
	 * of a crash.  What we're getting around here is the case where
	 * we free a block, it is allocated to another file, it is written
	 * to, and then we crash.  If the new data gets written to the
	 * file but the log buffers containing the free and reallocation
	 * don't, then we'd end up with garbage in the blocks being freed.
	 * As long as we make the new_size permanent before actually
	 * freeing any blocks it doesn't matter if they get written to.
	 *
	 * The callers must signal into us whether or not the size
	 * setting here must be synchronous.  There are a few cases
	 * where it doesn't have to be synchronous.  Those cases
	 * occur if the file is unlinked and we know the unlink is
	 * permanent or if the blocks being truncated are guaranteed
	 * to be beyond the inode eof (regardless of the link count)
	 * and the eof value is permanent.  Both of these cases occur
	 * only on wsync-mounted filesystems.  In those cases, we're
	 * guaranteed that no user will ever see the data in the blocks
	 * that are being truncated so the truncate can run async.
	 * In the free beyond eof case, the file may wind up with
	 * more blocks allocated to it than it needs if we crash
	 * and that won't get fixed until the next time the file
	 * is re-opened and closed but that's ok as that shouldn't
	 * be too many blocks.
	 *
	 * However, we can't just make all wsync xactions run async
	 * because there's one call out of the create path that needs
	 * to run sync where it's truncating an existing file to size
	 * 0 whose size is > 0.
	 *
	 * It's probably possible to come up with a test in this
	 * routine that would correctly distinguish all the above
	 * cases from the values of the function parameters and the
	 * inode state but for sanity's sake, I've decided to let the
	 * layers above just tell us.  It's simpler to correctly figure
	 * out in the layer above exactly under what conditions we
	 * can run async and I think it's easier for others to read and
	 * follow the logic in case something has to be changed.
	 * cscope is your friend -- rcc.
	 *
	 * The attribute fork is much simpler.
	 *
	 * For the attribute fork we allow the caller to tell us whether
	 * the unlink of the inode that led to this call is yet permanent
	 * in the on disk log.  If it is not and we will be freeing extents
	 * in this inode then we make the first transaction synchronous
	 * to make sure that the unlink is permanent by the time we free
	 * the blocks.
	 */
	if (fork == XFS_DATA_FORK) {
		if (ip->i_d.di_nextents > 0) {
			/*
			 * If we are not changing the file size then do
			 * not update the on-disk file size - we may be
			 * called from xfs_inactive_free_eofblocks().  If we
			 * update the on-disk file size and then the system
			 * crashes before the contents of the file are
			 * flushed to disk then the files may be full of
			 * holes (ie NULL files bug).
			 */
			if (ip->i_size != new_size) {
				ip->i_d.di_size = new_size;
				ip->i_size = new_size;
				xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
			}
		}
	} else if (sync) {
		ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
		if (ip->i_d.di_anextents > 0)
			xfs_trans_set_sync(ntp);
	}
	ASSERT(fork == XFS_DATA_FORK ||
		(fork == XFS_ATTR_FORK &&
			((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
			 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.  If the first block to be removed is
	 * beyond the maximum file size (ie it is the same as last_block),
	 * then there is nothing to do.
	 */
	last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
	ASSERT(first_unmap_block <= last_block);
	done = 0;
	if (last_block == first_unmap_block) {
		done = 1;
	} else {
		unmap_len = last_block - first_unmap_block + 1;
	}
	while (!done) {
		/*
		 * Free up to XFS_ITRUNC_MAX_EXTENTS.  xfs_bunmapi()
		 * will tell us whether it freed the entire range or
		 * not.  If this is a synchronous mount (wsync),
		 * then we can tell bunmapi to keep all the
		 * transactions asynchronous since the unlink
		 * transaction that made this inode inactive has
		 * already hit the disk.  There's no danger of
		 * the freed blocks being reused, there being a
		 * crash, and the reused blocks suddenly reappearing
		 * in this file with garbage in them once recovery
		 * runs.
		 */
		xfs_bmap_init(&free_list, &first_block);
		error = xfs_bunmapi(ntp, ip,
				    first_unmap_block, unmap_len,
				    xfs_bmapi_aflag(fork) |
				      (sync ? 0 : XFS_BMAPI_ASYNC),
				    XFS_ITRUNC_MAX_EXTENTS,
				    &first_block, &free_list,
				    NULL, &done);
		if (error) {
			/*
			 * If the bunmapi call encounters an error,
			 * return to the caller where the transaction
			 * can be properly aborted.  We just need to
			 * make sure we're not holding any resources
			 * that we were not when we came in.
			 */
			xfs_bmap_cancel(&free_list);
			return error;
		}

		/*
		 * Duplicate the transaction that has the permanent
		 * reservation and commit the old transaction.
		 */
		error = xfs_bmap_finish(tp, &free_list, &committed);
		ntp = *tp;
		if (committed) {
			/* link the inode into the next xact in the chain */
			xfs_trans_ijoin(ntp, ip,
					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
			xfs_trans_ihold(ntp, ip);
		}

		if (error) {
			/*
			 * If the bmap finish call encounters an error, return
			 * to the caller where the transaction can be properly
			 * aborted.  We just need to make sure we're not
			 * holding any resources that we were not when we came
			 * in.
			 *
			 * Aborting from this point might lose some blocks in
			 * the file system, but oh well.
			 */
			xfs_bmap_cancel(&free_list);
			return error;
		}

		if (committed) {
			/*
			 * Mark the inode dirty so it will be logged and
			 * moved forward in the log as part of every commit.
			 */
			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
		}
		ntp = xfs_trans_dup(ntp);
		error = xfs_trans_commit(*tp, 0);
		*tp = ntp;

		/* link the inode into the next transaction in the chain */
		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
		xfs_trans_ihold(ntp, ip);

		if (error)
			return error;
		/*
		 * transaction commit worked ok so we can drop the extra ticket
		 * reference that we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(ntp->t_ticket);
		error = xfs_trans_reserve(ntp, 0,
					XFS_ITRUNCATE_LOG_RES(mp), 0,
					XFS_TRANS_PERM_LOG_RES,
					XFS_ITRUNCATE_LOG_COUNT);
		if (error)
			return error;
	}
	/*
	 * Only update the size in the case of the data fork, but
	 * always re-log the inode so that our permanent transaction
	 * can keep on rolling it forward in the log.
	 */
	if (fork == XFS_DATA_FORK) {
		xfs_isize_check(mp, ip, new_size);
		/*
		 * If we are not changing the file size then do
		 * not update the on-disk file size - we may be
		 * called from xfs_inactive_free_eofblocks().  If we
		 * update the on-disk file size and then the system
		 * crashes before the contents of the file are
		 * flushed to disk then the files may be full of
		 * holes (ie NULL files bug).
		 */
		if (ip->i_size != new_size) {
			ip->i_d.di_size = new_size;
			ip->i_size = new_size;
		}
	}
	xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
	ASSERT((new_size != 0) ||
	       (fork == XFS_ATTR_FORK) ||
	       (ip->i_delayed_blks == 0));
	ASSERT((new_size != 0) ||
	       (fork == XFS_ATTR_FORK) ||
	       (ip->i_d.di_nextents == 0));
	trace_xfs_itruncate_finish_end(ip, new_size);
	return 0;
}
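
/*
 * Illustrative note (a hedged paraphrase of the loop above, not extra
 * code): this is the classic XFS "transaction roll" for long-running
 * metadata changes.  Each pass frees at most XFS_ITRUNC_MAX_EXTENTS
 * extents, then carries the permanent reservation forward:
 *
 *	ntp = xfs_trans_dup(ntp);		duplicate the transaction
 *	error = xfs_trans_commit(*tp, 0);	commit the old one
 *	xfs_log_ticket_put(ntp->t_ticket);	drop the extra ticket ref
 *	error = xfs_trans_reserve(ntp, ...);	refill the log reservation
 *
 * Re-joining and re-holding the inode after every commit keeps the
 * promise made above xfs_itruncate_finish(): the caller always gets
 * the inode back locked and held in the returned transaction.
 */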
1750 */ 1751 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1752 if (error) 1753 return error; 1754 1755 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); 1756 /* both on-disk, don't endian flip twice */ 1757 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1758 offset = ip->i_imap.im_boffset + 1759 offsetof(xfs_dinode_t, di_next_unlinked); 1760 xfs_trans_inode_buf(tp, ibp); 1761 xfs_trans_log_buf(tp, ibp, offset, 1762 (offset + sizeof(xfs_agino_t) - 1)); 1763 xfs_inobp_check(mp, ibp); 1764 } 1765 1766 /* 1767 * Point the bucket head pointer at the inode being inserted. 1768 */ 1769 ASSERT(agino != 0); 1770 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); 1771 offset = offsetof(xfs_agi_t, agi_unlinked) + 1772 (sizeof(xfs_agino_t) * bucket_index); 1773 xfs_trans_log_buf(tp, agibp, offset, 1774 (offset + sizeof(xfs_agino_t) - 1)); 1775 return 0; 1776 } 1777 1778 /* 1779 * Pull the on-disk inode from the AGI unlinked list. 1780 */ 1781 STATIC int 1782 xfs_iunlink_remove( 1783 xfs_trans_t *tp, 1784 xfs_inode_t *ip) 1785 { 1786 xfs_ino_t next_ino; 1787 xfs_mount_t *mp; 1788 xfs_agi_t *agi; 1789 xfs_dinode_t *dip; 1790 xfs_buf_t *agibp; 1791 xfs_buf_t *ibp; 1792 xfs_agnumber_t agno; 1793 xfs_agino_t agino; 1794 xfs_agino_t next_agino; 1795 xfs_buf_t *last_ibp; 1796 xfs_dinode_t *last_dip = NULL; 1797 short bucket_index; 1798 int offset, last_offset = 0; 1799 int error; 1800 1801 mp = tp->t_mountp; 1802 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1803 1804 /* 1805 * Get the agi buffer first. It ensures lock ordering 1806 * on the list. 1807 */ 1808 error = xfs_read_agi(mp, tp, agno, &agibp); 1809 if (error) 1810 return error; 1811 1812 agi = XFS_BUF_TO_AGI(agibp); 1813 1814 /* 1815 * Get the index into the agi hash table for the 1816 * list this inode will go on. 1817 */ 1818 agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 1819 ASSERT(agino != 0); 1820 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 1821 ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); 1822 ASSERT(agi->agi_unlinked[bucket_index]); 1823 1824 if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { 1825 /* 1826 * We're at the head of the list. Get the inode's 1827 * on-disk buffer to see if there is anyone after us 1828 * on the list. Only modify our next pointer if it 1829 * is not already NULLAGINO. This saves us the overhead 1830 * of dealing with the buffer when there is no need to 1831 * change it. 1832 */ 1833 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1834 if (error) { 1835 cmn_err(CE_WARN, 1836 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1837 error, mp->m_fsname); 1838 return error; 1839 } 1840 next_agino = be32_to_cpu(dip->di_next_unlinked); 1841 ASSERT(next_agino != 0); 1842 if (next_agino != NULLAGINO) { 1843 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1844 offset = ip->i_imap.im_boffset + 1845 offsetof(xfs_dinode_t, di_next_unlinked); 1846 xfs_trans_inode_buf(tp, ibp); 1847 xfs_trans_log_buf(tp, ibp, offset, 1848 (offset + sizeof(xfs_agino_t) - 1)); 1849 xfs_inobp_check(mp, ibp); 1850 } else { 1851 xfs_trans_brelse(tp, ibp); 1852 } 1853 /* 1854 * Point the bucket head pointer at the next inode. 
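		 * Head removal is the inverse of the insert done in
		 * xfs_iunlink(): the bucket head simply takes on our
		 * old next pointer, so
		 *
		 *	head -> us -> next    becomes    head -> next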
1855 */ 1856 ASSERT(next_agino != 0); 1857 ASSERT(next_agino != agino); 1858 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); 1859 offset = offsetof(xfs_agi_t, agi_unlinked) + 1860 (sizeof(xfs_agino_t) * bucket_index); 1861 xfs_trans_log_buf(tp, agibp, offset, 1862 (offset + sizeof(xfs_agino_t) - 1)); 1863 } else { 1864 /* 1865 * We need to search the list for the inode being freed. 1866 */ 1867 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 1868 last_ibp = NULL; 1869 while (next_agino != agino) { 1870 /* 1871 * If the last inode wasn't the one pointing to 1872 * us, then release its buffer since we're not 1873 * going to do anything with it. 1874 */ 1875 if (last_ibp != NULL) { 1876 xfs_trans_brelse(tp, last_ibp); 1877 } 1878 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 1879 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1880 &last_ibp, &last_offset, 0); 1881 if (error) { 1882 cmn_err(CE_WARN, 1883 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1884 error, mp->m_fsname); 1885 return error; 1886 } 1887 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 1888 ASSERT(next_agino != NULLAGINO); 1889 ASSERT(next_agino != 0); 1890 } 1891 /* 1892 * Now last_ibp points to the buffer previous to us on 1893 * the unlinked list. Pull us from the list. 1894 */ 1895 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1896 if (error) { 1897 cmn_err(CE_WARN, 1898 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1899 error, mp->m_fsname); 1900 return error; 1901 } 1902 next_agino = be32_to_cpu(dip->di_next_unlinked); 1903 ASSERT(next_agino != 0); 1904 ASSERT(next_agino != agino); 1905 if (next_agino != NULLAGINO) { 1906 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1907 offset = ip->i_imap.im_boffset + 1908 offsetof(xfs_dinode_t, di_next_unlinked); 1909 xfs_trans_inode_buf(tp, ibp); 1910 xfs_trans_log_buf(tp, ibp, offset, 1911 (offset + sizeof(xfs_agino_t) - 1)); 1912 xfs_inobp_check(mp, ibp); 1913 } else { 1914 xfs_trans_brelse(tp, ibp); 1915 } 1916 /* 1917 * Point the previous inode on the list to the next inode. 
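		 * This is the mid-list case:
		 *
		 *	prev -> us -> next    becomes    prev -> next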
1918 */ 1919 last_dip->di_next_unlinked = cpu_to_be32(next_agino); 1920 ASSERT(next_agino != 0); 1921 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); 1922 xfs_trans_inode_buf(tp, last_ibp); 1923 xfs_trans_log_buf(tp, last_ibp, offset, 1924 (offset + sizeof(xfs_agino_t) - 1)); 1925 xfs_inobp_check(mp, last_ibp); 1926 } 1927 return 0; 1928 } 1929 1930 STATIC void 1931 xfs_ifree_cluster( 1932 xfs_inode_t *free_ip, 1933 xfs_trans_t *tp, 1934 xfs_ino_t inum) 1935 { 1936 xfs_mount_t *mp = free_ip->i_mount; 1937 int blks_per_cluster; 1938 int nbufs; 1939 int ninodes; 1940 int i, j; 1941 xfs_daddr_t blkno; 1942 xfs_buf_t *bp; 1943 xfs_inode_t *ip; 1944 xfs_inode_log_item_t *iip; 1945 xfs_log_item_t *lip; 1946 struct xfs_perag *pag; 1947 1948 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); 1949 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 1950 blks_per_cluster = 1; 1951 ninodes = mp->m_sb.sb_inopblock; 1952 nbufs = XFS_IALLOC_BLOCKS(mp); 1953 } else { 1954 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / 1955 mp->m_sb.sb_blocksize; 1956 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; 1957 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 1958 } 1959 1960 for (j = 0; j < nbufs; j++, inum += ninodes) { 1961 int found = 0; 1962 1963 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1964 XFS_INO_TO_AGBNO(mp, inum)); 1965 1966 /* 1967 * We obtain and lock the backing buffer first in the process 1968 * here, as we have to ensure that any dirty inode that we 1969 * can't get the flush lock on is attached to the buffer. 1970 * If we scan the in-memory inodes first, then buffer IO can 1971 * complete before we get a lock on it, and hence we may fail 1972 * to mark all the active inodes on the buffer stale. 1973 */ 1974 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 1975 mp->m_bsize * blks_per_cluster, 1976 XBF_LOCK); 1977 1978 /* 1979 * Walk the inodes already attached to the buffer and mark them 1980 * stale. These will all have the flush locks held, so an 1981 * in-memory inode walk can't lock them. 1982 */ 1983 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1984 while (lip) { 1985 if (lip->li_type == XFS_LI_INODE) { 1986 iip = (xfs_inode_log_item_t *)lip; 1987 ASSERT(iip->ili_logged == 1); 1988 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; 1989 xfs_trans_ail_copy_lsn(mp->m_ail, 1990 &iip->ili_flush_lsn, 1991 &iip->ili_item.li_lsn); 1992 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 1993 found++; 1994 } 1995 lip = lip->li_bio_list; 1996 } 1997 1998 /* 1999 * For each inode in memory attempt to add it to the inode 2000 * buffer and set it up for being staled on buffer IO 2001 * completion. This is safe as we've locked out tail pushing 2002 * and flushing by locking the buffer. 2003 * 2004 * We have already marked every inode that was part of a 2005 * transaction stale above, which means there is no point in 2006 * even trying to lock them. 
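		 *
		 * The loop below therefore uses trylocks only: the
		 * pag_ici_lock is taken for the radix tree lookup,
		 * then xfs_ilock_nowait() and xfs_iflock_nowait() are
		 * tried. Anything that cannot be locked without
		 * blocking is skipped rather than waited on, and the
		 * inode being freed itself is never re-locked.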
2007 */ 2008 for (i = 0; i < ninodes; i++) { 2009 read_lock(&pag->pag_ici_lock); 2010 ip = radix_tree_lookup(&pag->pag_ici_root, 2011 XFS_INO_TO_AGINO(mp, (inum + i))); 2012 2013 /* Inode not in memory or stale, nothing to do */ 2014 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2015 read_unlock(&pag->pag_ici_lock); 2016 continue; 2017 } 2018 2019 /* don't try to lock/unlock the current inode */ 2020 if (ip != free_ip && 2021 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2022 read_unlock(&pag->pag_ici_lock); 2023 continue; 2024 } 2025 read_unlock(&pag->pag_ici_lock); 2026 2027 if (!xfs_iflock_nowait(ip)) { 2028 if (ip != free_ip) 2029 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2030 continue; 2031 } 2032 2033 xfs_iflags_set(ip, XFS_ISTALE); 2034 if (xfs_inode_clean(ip)) { 2035 ASSERT(ip != free_ip); 2036 xfs_ifunlock(ip); 2037 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2038 continue; 2039 } 2040 2041 iip = ip->i_itemp; 2042 if (!iip) { 2043 /* inode with unlogged changes only */ 2044 ASSERT(ip != free_ip); 2045 ip->i_update_core = 0; 2046 xfs_ifunlock(ip); 2047 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2048 continue; 2049 } 2050 found++; 2051 2052 iip->ili_last_fields = iip->ili_format.ilf_fields; 2053 iip->ili_format.ilf_fields = 0; 2054 iip->ili_logged = 1; 2055 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 2056 &iip->ili_item.li_lsn); 2057 2058 xfs_buf_attach_iodone(bp, 2059 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2060 xfs_istale_done, (xfs_log_item_t *)iip); 2061 2062 if (ip != free_ip) 2063 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2064 } 2065 2066 if (found) 2067 xfs_trans_stale_inode_buf(tp, bp); 2068 xfs_trans_binval(tp, bp); 2069 } 2070 2071 xfs_perag_put(pag); 2072 } 2073 2074 /* 2075 * This is called to return an inode to the inode free list. 2076 * The inode should already be truncated to 0 length and have 2077 * no pages associated with it. This routine also assumes that 2078 * the inode is already a part of the transaction. 2079 * 2080 * The on-disk copy of the inode will have been added to the list 2081 * of unlinked inodes in the AGI. We need to remove the inode from 2082 * that list atomically with respect to freeing it here. 2083 */ 2084 int 2085 xfs_ifree( 2086 xfs_trans_t *tp, 2087 xfs_inode_t *ip, 2088 xfs_bmap_free_t *flist) 2089 { 2090 int error; 2091 int delete; 2092 xfs_ino_t first_ino; 2093 xfs_dinode_t *dip; 2094 xfs_buf_t *ibp; 2095 2096 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2097 ASSERT(ip->i_transp == tp); 2098 ASSERT(ip->i_d.di_nlink == 0); 2099 ASSERT(ip->i_d.di_nextents == 0); 2100 ASSERT(ip->i_d.di_anextents == 0); 2101 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || 2102 ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); 2103 ASSERT(ip->i_d.di_nblocks == 0); 2104 2105 /* 2106 * Pull the on-disk inode from the AGI unlinked list. 2107 */ 2108 error = xfs_iunlink_remove(tp, ip); 2109 if (error != 0) { 2110 return error; 2111 } 2112 2113 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2114 if (error != 0) { 2115 return error; 2116 } 2117 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2118 ip->i_d.di_flags = 0; 2119 ip->i_d.di_dmevmask = 0; 2120 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 2121 ip->i_df.if_ext_max = 2122 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 2123 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 2124 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 2125 /* 2126 * Bump the generation count so no one will be confused 2127 * by reincarnations of this inode. 
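	 * (The generation number also ends up in externally visible
	 * identifiers such as NFS file handles, so bumping it lets
	 * stale references to a previous incarnation be detected.)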
2128  */
2129 	ip->i_d.di_gen++;
2130
2131 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2132
2133 	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
2134 	if (error)
2135 		return error;
2136
2137 	/*
2138 	 * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
2139 	 * from picking up this inode when it is reclaimed (its incore state
2140 	 * initialized but not flushed to disk yet). The in-core di_mode is
2141 	 * already cleared and a corresponding transaction logged.
2142 	 * The hack here just synchronizes the in-core to on-disk
2143 	 * di_mode value in advance before the actual inode sync to disk.
2144 	 * This is OK because the inode is already unlinked and would never
2145 	 * change its di_mode again for this inode generation.
2146 	 * This is a temporary hack that would require a proper fix
2147 	 * in the future.
2148 	 */
2149 	dip->di_mode = 0;
2150
2151 	if (delete) {
2152 		xfs_ifree_cluster(ip, tp, first_ino);
2153 	}
2154
2155 	return 0;
2156 }
2157
2158 /*
2159  * Reallocate the space for if_broot based on the number of records
2160  * being added or deleted as indicated in rec_diff. Move the records
2161  * and pointers in if_broot to fit the new size. When shrinking, this
2162  * will eliminate holes between the records and pointers created by
2163  * the caller. When growing, this will create holes to be filled in
2164  * by the caller.
2165  *
2166  * The caller must not request to add more records than would fit in
2167  * the on-disk inode root. If the if_broot is currently NULL, then,
2168  * if we are adding records, one will be allocated. The caller must also
2169  * not request that the number of records go below zero, although
2170  * it can go to zero.
2171  *
2172  * ip -- the inode whose if_broot area is changing
2173  * rec_diff -- the change in the number of records, positive or negative,
2174  *	 requested for the if_broot array.
2175  */
2176 void
2177 xfs_iroot_realloc(
2178 	xfs_inode_t		*ip,
2179 	int			rec_diff,
2180 	int			whichfork)
2181 {
2182 	struct xfs_mount	*mp = ip->i_mount;
2183 	int			cur_max;
2184 	xfs_ifork_t		*ifp;
2185 	struct xfs_btree_block	*new_broot;
2186 	int			new_max;
2187 	size_t			new_size;
2188 	char			*np;
2189 	char			*op;
2190
2191 	/*
2192 	 * Handle the degenerate case quietly.
2193 	 */
2194 	if (rec_diff == 0) {
2195 		return;
2196 	}
2197
2198 	ifp = XFS_IFORK_PTR(ip, whichfork);
2199 	if (rec_diff > 0) {
2200 		/*
2201 		 * If there wasn't any memory allocated before, just
2202 		 * allocate it now and get out.
2203 		 */
2204 		if (ifp->if_broot_bytes == 0) {
2205 			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2206 			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
2207 			ifp->if_broot_bytes = (int)new_size;
2208 			return;
2209 		}
2210
2211 		/*
2212 		 * If there is already an existing if_broot, then we need
2213 		 * to realloc() it and shift the pointers to their new
2214 		 * location. The records don't change location because
2215 		 * they are kept butted up against the btree block header.
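		 *
		 * Roughly, the buffer goes from
		 *
		 *	[ hdr | rec0 .. recN | ptr0 .. ptrN ]
		 * to
		 *	[ hdr | rec0 .. recN | (new space) | ptr0 .. ptrN ]
		 *
		 * which is why only the pointer array needs the
		 * memmove() below.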
2216 */ 2217 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); 2218 new_max = cur_max + rec_diff; 2219 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2220 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, 2221 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2222 KM_SLEEP); 2223 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2224 ifp->if_broot_bytes); 2225 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2226 (int)new_size); 2227 ifp->if_broot_bytes = (int)new_size; 2228 ASSERT(ifp->if_broot_bytes <= 2229 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2230 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); 2231 return; 2232 } 2233 2234 /* 2235 * rec_diff is less than 0. In this case, we are shrinking the 2236 * if_broot buffer. It must already exist. If we go to zero 2237 * records, just get rid of the root and clear the status bit. 2238 */ 2239 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 2240 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); 2241 new_max = cur_max + rec_diff; 2242 ASSERT(new_max >= 0); 2243 if (new_max > 0) 2244 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2245 else 2246 new_size = 0; 2247 if (new_size > 0) { 2248 new_broot = kmem_alloc(new_size, KM_SLEEP); 2249 /* 2250 * First copy over the btree block header. 2251 */ 2252 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); 2253 } else { 2254 new_broot = NULL; 2255 ifp->if_flags &= ~XFS_IFBROOT; 2256 } 2257 2258 /* 2259 * Only copy the records and pointers if there are any. 2260 */ 2261 if (new_max > 0) { 2262 /* 2263 * First copy the records. 2264 */ 2265 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); 2266 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); 2267 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2268 2269 /* 2270 * Then copy the pointers. 2271 */ 2272 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2273 ifp->if_broot_bytes); 2274 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, 2275 (int)new_size); 2276 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2277 } 2278 kmem_free(ifp->if_broot); 2279 ifp->if_broot = new_broot; 2280 ifp->if_broot_bytes = (int)new_size; 2281 ASSERT(ifp->if_broot_bytes <= 2282 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2283 return; 2284 } 2285 2286 2287 /* 2288 * This is called when the amount of space needed for if_data 2289 * is increased or decreased. The change in size is indicated by 2290 * the number of bytes that need to be added or deleted in the 2291 * byte_diff parameter. 2292 * 2293 * If the amount of space needed has decreased below the size of the 2294 * inline buffer, then switch to using the inline buffer. Otherwise, 2295 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 2296 * to what is needed. 2297 * 2298 * ip -- the inode whose if_data area is changing 2299 * byte_diff -- the change in the number of bytes, positive or negative, 2300 * requested for the if_data array. 
2301 */ 2302 void 2303 xfs_idata_realloc( 2304 xfs_inode_t *ip, 2305 int byte_diff, 2306 int whichfork) 2307 { 2308 xfs_ifork_t *ifp; 2309 int new_size; 2310 int real_size; 2311 2312 if (byte_diff == 0) { 2313 return; 2314 } 2315 2316 ifp = XFS_IFORK_PTR(ip, whichfork); 2317 new_size = (int)ifp->if_bytes + byte_diff; 2318 ASSERT(new_size >= 0); 2319 2320 if (new_size == 0) { 2321 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2322 kmem_free(ifp->if_u1.if_data); 2323 } 2324 ifp->if_u1.if_data = NULL; 2325 real_size = 0; 2326 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { 2327 /* 2328 * If the valid extents/data can fit in if_inline_ext/data, 2329 * copy them from the malloc'd vector and free it. 2330 */ 2331 if (ifp->if_u1.if_data == NULL) { 2332 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2333 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2334 ASSERT(ifp->if_real_bytes != 0); 2335 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, 2336 new_size); 2337 kmem_free(ifp->if_u1.if_data); 2338 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 2339 } 2340 real_size = 0; 2341 } else { 2342 /* 2343 * Stuck with malloc/realloc. 2344 * For inline data, the underlying buffer must be 2345 * a multiple of 4 bytes in size so that it can be 2346 * logged and stay on word boundaries. We enforce 2347 * that here. 2348 */ 2349 real_size = roundup(new_size, 4); 2350 if (ifp->if_u1.if_data == NULL) { 2351 ASSERT(ifp->if_real_bytes == 0); 2352 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2353 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 2354 /* 2355 * Only do the realloc if the underlying size 2356 * is really changing. 2357 */ 2358 if (ifp->if_real_bytes != real_size) { 2359 ifp->if_u1.if_data = 2360 kmem_realloc(ifp->if_u1.if_data, 2361 real_size, 2362 ifp->if_real_bytes, 2363 KM_SLEEP); 2364 } 2365 } else { 2366 ASSERT(ifp->if_real_bytes == 0); 2367 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); 2368 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 2369 ifp->if_bytes); 2370 } 2371 } 2372 ifp->if_real_bytes = real_size; 2373 ifp->if_bytes = new_size; 2374 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2375 } 2376 2377 void 2378 xfs_idestroy_fork( 2379 xfs_inode_t *ip, 2380 int whichfork) 2381 { 2382 xfs_ifork_t *ifp; 2383 2384 ifp = XFS_IFORK_PTR(ip, whichfork); 2385 if (ifp->if_broot != NULL) { 2386 kmem_free(ifp->if_broot); 2387 ifp->if_broot = NULL; 2388 } 2389 2390 /* 2391 * If the format is local, then we can't have an extents 2392 * array so just look for an inline data array. If we're 2393 * not local then we may or may not have an extents list, 2394 * so check and free it up if we do. 
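	 *
	 * In short: LOCAL frees a separately allocated if_data buffer,
	 * while the extent formats tear down the extent list or the
	 * indirection array via xfs_iext_destroy().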
2395  */
2396 	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2397 		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2398 		    (ifp->if_u1.if_data != NULL)) {
2399 			ASSERT(ifp->if_real_bytes != 0);
2400 			kmem_free(ifp->if_u1.if_data);
2401 			ifp->if_u1.if_data = NULL;
2402 			ifp->if_real_bytes = 0;
2403 		}
2404 	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2405 		   ((ifp->if_flags & XFS_IFEXTIREC) ||
2406 		    ((ifp->if_u1.if_extents != NULL) &&
2407 		     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2408 		ASSERT(ifp->if_real_bytes != 0);
2409 		xfs_iext_destroy(ifp);
2410 	}
2411 	ASSERT(ifp->if_u1.if_extents == NULL ||
2412 	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2413 	ASSERT(ifp->if_real_bytes == 0);
2414 	if (whichfork == XFS_ATTR_FORK) {
2415 		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2416 		ip->i_afp = NULL;
2417 	}
2418 }
2419
2420 /*
2421  * This is called to unpin an inode. The caller must have the inode locked
2422  * in at least shared mode so that the inode cannot be subsequently pinned
2423  * once someone is waiting for it to be unpinned.
2424  */
2425 static void
2426 xfs_iunpin_nowait(
2427 	struct xfs_inode	*ip)
2428 {
2429 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2430
2431 	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2432
2433 	/* Give the log a push to start the unpinning I/O */
2434 	xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2435
2436 }
2437
2438 void
2439 xfs_iunpin_wait(
2440 	struct xfs_inode	*ip)
2441 {
2442 	if (xfs_ipincount(ip)) {
2443 		xfs_iunpin_nowait(ip);
2444 		wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2445 	}
2446 }
2447
2448 /*
2449  * xfs_iextents_copy()
2450  *
2451  * This is called to copy the REAL extents (as opposed to the delayed
2452  * allocation extents) from the inode into the given buffer. It
2453  * returns the number of bytes copied into the buffer.
2454  *
2455  * The extents are examined one at a time; delayed allocation
2456  * extents are skipped, and all others are converted to the
2457  * on-disk format as they are copied.
2458  */
2459 int
2460 xfs_iextents_copy(
2461 	xfs_inode_t		*ip,
2462 	xfs_bmbt_rec_t		*dp,
2463 	int			whichfork)
2464 {
2465 	int			copied;
2466 	int			i;
2467 	xfs_ifork_t		*ifp;
2468 	int			nrecs;
2469 	xfs_fsblock_t		start_block;
2470
2471 	ifp = XFS_IFORK_PTR(ip, whichfork);
2472 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2473 	ASSERT(ifp->if_bytes > 0);
2474
2475 	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2476 	XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2477 	ASSERT(nrecs > 0);
2478
2479 	/*
2480 	 * Copy the extents into the buffer one at a time, skipping
2481 	 * any delayed allocation extents. There must be at least
2482 	 * one real (non-delayed) extent to copy.
2483 	 *
2484 	 */
2485 	copied = 0;
2486 	for (i = 0; i < nrecs; i++) {
2487 		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2488 		start_block = xfs_bmbt_get_startblock(ep);
2489 		if (isnullstartblock(start_block)) {
2490 			/*
2491 			 * It's a delayed allocation extent, so skip it.
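			 * Delayed extents carry a "null" start block
			 * (see isnullstartblock()) because no real
			 * disk blocks have been allocated yet, so
			 * they must never be written into the
			 * on-disk fork.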
2492 */ 2493 continue; 2494 } 2495 2496 /* Translate to on disk format */ 2497 put_unaligned(cpu_to_be64(ep->l0), &dp->l0); 2498 put_unaligned(cpu_to_be64(ep->l1), &dp->l1); 2499 dp++; 2500 copied++; 2501 } 2502 ASSERT(copied != 0); 2503 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); 2504 2505 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2506 } 2507 2508 /* 2509 * Each of the following cases stores data into the same region 2510 * of the on-disk inode, so only one of them can be valid at 2511 * any given time. While it is possible to have conflicting formats 2512 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is 2513 * in EXTENTS format, this can only happen when the fork has 2514 * changed formats after being modified but before being flushed. 2515 * In these cases, the format always takes precedence, because the 2516 * format indicates the current state of the fork. 2517 */ 2518 /*ARGSUSED*/ 2519 STATIC void 2520 xfs_iflush_fork( 2521 xfs_inode_t *ip, 2522 xfs_dinode_t *dip, 2523 xfs_inode_log_item_t *iip, 2524 int whichfork, 2525 xfs_buf_t *bp) 2526 { 2527 char *cp; 2528 xfs_ifork_t *ifp; 2529 xfs_mount_t *mp; 2530 #ifdef XFS_TRANS_DEBUG 2531 int first; 2532 #endif 2533 static const short brootflag[2] = 2534 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; 2535 static const short dataflag[2] = 2536 { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; 2537 static const short extflag[2] = 2538 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2539 2540 if (!iip) 2541 return; 2542 ifp = XFS_IFORK_PTR(ip, whichfork); 2543 /* 2544 * This can happen if we gave up in iformat in an error path, 2545 * for the attribute fork. 2546 */ 2547 if (!ifp) { 2548 ASSERT(whichfork == XFS_ATTR_FORK); 2549 return; 2550 } 2551 cp = XFS_DFORK_PTR(dip, whichfork); 2552 mp = ip->i_mount; 2553 switch (XFS_IFORK_FORMAT(ip, whichfork)) { 2554 case XFS_DINODE_FMT_LOCAL: 2555 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && 2556 (ifp->if_bytes > 0)) { 2557 ASSERT(ifp->if_u1.if_data != NULL); 2558 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2559 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); 2560 } 2561 break; 2562 2563 case XFS_DINODE_FMT_EXTENTS: 2564 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2565 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2566 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) || 2567 (ifp->if_bytes == 0)); 2568 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) || 2569 (ifp->if_bytes > 0)); 2570 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2571 (ifp->if_bytes > 0)) { 2572 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2573 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2574 whichfork); 2575 } 2576 break; 2577 2578 case XFS_DINODE_FMT_BTREE: 2579 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && 2580 (ifp->if_broot_bytes > 0)) { 2581 ASSERT(ifp->if_broot != NULL); 2582 ASSERT(ifp->if_broot_bytes <= 2583 (XFS_IFORK_SIZE(ip, whichfork) + 2584 XFS_BROOT_SIZE_ADJ)); 2585 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2586 (xfs_bmdr_block_t *)cp, 2587 XFS_DFORK_SIZE(dip, mp, whichfork)); 2588 } 2589 break; 2590 2591 case XFS_DINODE_FMT_DEV: 2592 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2593 ASSERT(whichfork == XFS_DATA_FORK); 2594 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); 2595 } 2596 break; 2597 2598 case XFS_DINODE_FMT_UUID: 2599 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2600 ASSERT(whichfork == XFS_DATA_FORK); 2601 memcpy(XFS_DFORK_DPTR(dip), 2602 &ip->i_df.if_u2.if_uuid, 2603 sizeof(uuid_t)); 2604 } 2605 break; 2606 2607 default: 2608 ASSERT(0); 
2609 break; 2610 } 2611 } 2612 2613 STATIC int 2614 xfs_iflush_cluster( 2615 xfs_inode_t *ip, 2616 xfs_buf_t *bp) 2617 { 2618 xfs_mount_t *mp = ip->i_mount; 2619 struct xfs_perag *pag; 2620 unsigned long first_index, mask; 2621 unsigned long inodes_per_cluster; 2622 int ilist_size; 2623 xfs_inode_t **ilist; 2624 xfs_inode_t *iq; 2625 int nr_found; 2626 int clcount = 0; 2627 int bufwasdelwri; 2628 int i; 2629 2630 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2631 2632 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2633 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2634 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2635 if (!ilist) 2636 goto out_put; 2637 2638 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2639 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2640 read_lock(&pag->pag_ici_lock); 2641 /* really need a gang lookup range call here */ 2642 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2643 first_index, inodes_per_cluster); 2644 if (nr_found == 0) 2645 goto out_free; 2646 2647 for (i = 0; i < nr_found; i++) { 2648 iq = ilist[i]; 2649 if (iq == ip) 2650 continue; 2651 /* if the inode lies outside this cluster, we're done. */ 2652 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) 2653 break; 2654 /* 2655 * Do an un-protected check to see if the inode is dirty and 2656 * is a candidate for flushing. These checks will be repeated 2657 * later after the appropriate locks are acquired. 2658 */ 2659 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) 2660 continue; 2661 2662 /* 2663 * Try to get locks. If any are unavailable or it is pinned, 2664 * then this inode cannot be flushed and is skipped. 2665 */ 2666 2667 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) 2668 continue; 2669 if (!xfs_iflock_nowait(iq)) { 2670 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2671 continue; 2672 } 2673 if (xfs_ipincount(iq)) { 2674 xfs_ifunlock(iq); 2675 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2676 continue; 2677 } 2678 2679 /* 2680 * arriving here means that this inode can be flushed. First 2681 * re-check that it's dirty before flushing. 2682 */ 2683 if (!xfs_inode_clean(iq)) { 2684 int error; 2685 error = xfs_iflush_int(iq, bp); 2686 if (error) { 2687 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2688 goto cluster_corrupt_out; 2689 } 2690 clcount++; 2691 } else { 2692 xfs_ifunlock(iq); 2693 } 2694 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2695 } 2696 2697 if (clcount) { 2698 XFS_STATS_INC(xs_icluster_flushcnt); 2699 XFS_STATS_ADD(xs_icluster_flushinode, clcount); 2700 } 2701 2702 out_free: 2703 read_unlock(&pag->pag_ici_lock); 2704 kmem_free(ilist); 2705 out_put: 2706 xfs_perag_put(pag); 2707 return 0; 2708 2709 2710 cluster_corrupt_out: 2711 /* 2712 * Corruption detected in the clustering loop. Invalidate the 2713 * inode buffer and shut down the filesystem. 2714 */ 2715 read_unlock(&pag->pag_ici_lock); 2716 /* 2717 * Clean up the buffer. If it was B_DELWRI, just release it -- 2718 * brelse can handle it with no problems. If not, shut down the 2719 * filesystem before releasing the buffer. 2720 */ 2721 bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); 2722 if (bufwasdelwri) 2723 xfs_buf_relse(bp); 2724 2725 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2726 2727 if (!bufwasdelwri) { 2728 /* 2729 * Just like incore_relse: if we have b_iodone functions, 2730 * mark the buffer as an error and call them. Otherwise 2731 * mark it as stale and brelse. 
2732  */
2733 		if (XFS_BUF_IODONE_FUNC(bp)) {
2734 			XFS_BUF_CLR_BDSTRAT_FUNC(bp);
2735 			XFS_BUF_UNDONE(bp);
2736 			XFS_BUF_STALE(bp);
2737 			XFS_BUF_ERROR(bp,EIO);
2738 			xfs_biodone(bp);
2739 		} else {
2740 			XFS_BUF_STALE(bp);
2741 			xfs_buf_relse(bp);
2742 		}
2743 	}
2744
2745 	/*
2746 	 * Unlocks the flush lock
2747 	 */
2748 	xfs_iflush_abort(iq);
2749 	kmem_free(ilist);
2750 	xfs_perag_put(pag);
2751 	return XFS_ERROR(EFSCORRUPTED);
2752 }
2753
2754 /*
2755  * xfs_iflush() will write a modified inode's changes out to the
2756  * inode's on disk home. The caller must have the inode lock held
2757  * in at least shared mode and the inode flush completion must be
2758  * active as well. The inode lock will still be held upon return from
2759  * the call and the caller is free to unlock it.
2760  * The inode flush will be completed when the inode reaches the disk.
2761  * The flags indicate how the inode's buffer should be written out.
2762  */
2763 int
2764 xfs_iflush(
2765 	xfs_inode_t		*ip,
2766 	uint			flags)
2767 {
2768 	xfs_inode_log_item_t	*iip;
2769 	xfs_buf_t		*bp;
2770 	xfs_dinode_t		*dip;
2771 	xfs_mount_t		*mp;
2772 	int			error;
2773
2774 	XFS_STATS_INC(xs_iflush_count);
2775
2776 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2777 	ASSERT(!completion_done(&ip->i_flush));
2778 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2779 	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
2780
2781 	iip = ip->i_itemp;
2782 	mp = ip->i_mount;
2783
2784 	/*
2785 	 * We can't flush the inode until it is unpinned, so wait for it if we
2786 	 * are allowed to block. We know no one new can pin it, because we are
2787 	 * holding the inode lock shared and you need to hold it exclusively to
2788 	 * pin the inode.
2789 	 *
2790 	 * If we are not allowed to block, force the log out asynchronously so
2791 	 * that when we come back the inode will be unpinned. If other inodes
2792 	 * in the same cluster are dirty, they will probably write the inode
2793 	 * out for us if they occur after the log force completes.
2794 	 */
2795 	if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2796 		xfs_iunpin_nowait(ip);
2797 		xfs_ifunlock(ip);
2798 		return EAGAIN;
2799 	}
2800 	xfs_iunpin_wait(ip);
2801
2802 	/*
2803 	 * For stale inodes we cannot rely on the backing buffer remaining
2804 	 * stale in cache for the remaining life of the stale inode and so
2805 	 * xfs_itobp() below may give us a buffer that no longer contains
2806 	 * inodes. We have to check this after ensuring the inode is
2807 	 * unpinned so that it is safe to reclaim the stale inode after the
2808 	 * flush call.
2809 	 */
2810 	if (xfs_iflags_test(ip, XFS_ISTALE)) {
2811 		xfs_ifunlock(ip);
2812 		return 0;
2813 	}
2814
2815 	/*
2816 	 * This may have been unpinned because the filesystem is shutting
2817 	 * down forcibly. If that's the case we must not write this inode
2818 	 * to disk, because the log record didn't make it to disk!
2819 	 */
2820 	if (XFS_FORCED_SHUTDOWN(mp)) {
2821 		ip->i_update_core = 0;
2822 		if (iip)
2823 			iip->ili_format.ilf_fields = 0;
2824 		xfs_ifunlock(ip);
2825 		return XFS_ERROR(EIO);
2826 	}
2827
2828 	/*
2829 	 * Get the buffer containing the on-disk inode.
2830 	 */
2831 	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2832 				(flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
2833 	if (error || !bp) {
2834 		xfs_ifunlock(ip);
2835 		return error;
2836 	}
2837
2838 	/*
2839 	 * First flush out the inode that xfs_iflush was called with.
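	 * The overall sequence is: flush this inode into the buffer,
	 * push the log if the buffer is pinned, pull any other dirty
	 * inodes from the same cluster into the buffer, then issue
	 * the buffer write.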
2840 */ 2841 error = xfs_iflush_int(ip, bp); 2842 if (error) 2843 goto corrupt_out; 2844 2845 /* 2846 * If the buffer is pinned then push on the log now so we won't 2847 * get stuck waiting in the write for too long. 2848 */ 2849 if (XFS_BUF_ISPINNED(bp)) 2850 xfs_log_force(mp, 0); 2851 2852 /* 2853 * inode clustering: 2854 * see if other inodes can be gathered into this write 2855 */ 2856 error = xfs_iflush_cluster(ip, bp); 2857 if (error) 2858 goto cluster_corrupt_out; 2859 2860 if (flags & SYNC_WAIT) 2861 error = xfs_bwrite(mp, bp); 2862 else 2863 xfs_bdwrite(mp, bp); 2864 return error; 2865 2866 corrupt_out: 2867 xfs_buf_relse(bp); 2868 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2869 cluster_corrupt_out: 2870 /* 2871 * Unlocks the flush lock 2872 */ 2873 xfs_iflush_abort(ip); 2874 return XFS_ERROR(EFSCORRUPTED); 2875 } 2876 2877 2878 STATIC int 2879 xfs_iflush_int( 2880 xfs_inode_t *ip, 2881 xfs_buf_t *bp) 2882 { 2883 xfs_inode_log_item_t *iip; 2884 xfs_dinode_t *dip; 2885 xfs_mount_t *mp; 2886 #ifdef XFS_TRANS_DEBUG 2887 int first; 2888 #endif 2889 2890 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2891 ASSERT(!completion_done(&ip->i_flush)); 2892 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2893 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2894 2895 iip = ip->i_itemp; 2896 mp = ip->i_mount; 2897 2898 /* set *dip = inode's place in the buffer */ 2899 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2900 2901 /* 2902 * Clear i_update_core before copying out the data. 2903 * This is for coordination with our timestamp updates 2904 * that don't hold the inode lock. They will always 2905 * update the timestamps BEFORE setting i_update_core, 2906 * so if we clear i_update_core after they set it we 2907 * are guaranteed to see their updates to the timestamps. 2908 * I believe that this depends on strongly ordered memory 2909 * semantics, but we have that. We use the SYNCHRONIZE 2910 * macro to make sure that the compiler does not reorder 2911 * the i_update_core access below the data copy below. 2912 */ 2913 ip->i_update_core = 0; 2914 SYNCHRONIZE(); 2915 2916 /* 2917 * Make sure to get the latest timestamps from the Linux inode. 
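	 * Together with the i_update_core handling above, the
	 * unlocked timestamp protocol works out as:
	 *
	 *	timestamp updater		inode flush (this code)
	 *	-----------------		-----------------------
	 *	write timestamps		i_update_core = 0
	 *	i_update_core = 1		SYNCHRONIZE()
	 *					copy timestamps out
	 *
	 * so a racing update is either captured by the copy or leaves
	 * i_update_core set, marking the core dirty for a later flush.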
2918 */ 2919 xfs_synchronize_times(ip); 2920 2921 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2922 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2923 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2924 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2925 ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2926 goto corrupt_out; 2927 } 2928 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2929 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2930 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2931 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2932 ip->i_ino, ip, ip->i_d.di_magic); 2933 goto corrupt_out; 2934 } 2935 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 2936 if (XFS_TEST_ERROR( 2937 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2938 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2939 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2940 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2941 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 2942 ip->i_ino, ip); 2943 goto corrupt_out; 2944 } 2945 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 2946 if (XFS_TEST_ERROR( 2947 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2948 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2949 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2950 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2951 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2952 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 2953 ip->i_ino, ip); 2954 goto corrupt_out; 2955 } 2956 } 2957 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2958 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2959 XFS_RANDOM_IFLUSH_5)) { 2960 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2961 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 2962 ip->i_ino, 2963 ip->i_d.di_nextents + ip->i_d.di_anextents, 2964 ip->i_d.di_nblocks, 2965 ip); 2966 goto corrupt_out; 2967 } 2968 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2969 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2970 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2971 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2972 ip->i_ino, ip->i_d.di_forkoff, ip); 2973 goto corrupt_out; 2974 } 2975 /* 2976 * bump the flush iteration count, used to detect flushes which 2977 * postdate a log record during recovery. 2978 */ 2979 2980 ip->i_d.di_flushiter++; 2981 2982 /* 2983 * Copy the dirty parts of the inode into the on-disk 2984 * inode. We always copy out the core of the inode, 2985 * because if the inode is dirty at all the core must 2986 * be. 2987 */ 2988 xfs_dinode_to_disk(dip, &ip->i_d); 2989 2990 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 2991 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 2992 ip->i_d.di_flushiter = 0; 2993 2994 /* 2995 * If this is really an old format inode and the superblock version 2996 * has not been updated to support only new format inodes, then 2997 * convert back to the old inode format. If the superblock version 2998 * has been updated, then make the conversion permanent. 2999 */ 3000 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); 3001 if (ip->i_d.di_version == 1) { 3002 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 3003 /* 3004 * Convert it back. 3005 */ 3006 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3007 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); 3008 } else { 3009 /* 3010 * The superblock version has already been bumped, 3011 * so just make the conversion to the new inode 3012 * format permanent. 
3013 */ 3014 ip->i_d.di_version = 2; 3015 dip->di_version = 2; 3016 ip->i_d.di_onlink = 0; 3017 dip->di_onlink = 0; 3018 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3019 memset(&(dip->di_pad[0]), 0, 3020 sizeof(dip->di_pad)); 3021 ASSERT(ip->i_d.di_projid == 0); 3022 } 3023 } 3024 3025 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); 3026 if (XFS_IFORK_Q(ip)) 3027 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 3028 xfs_inobp_check(mp, bp); 3029 3030 /* 3031 * We've recorded everything logged in the inode, so we'd 3032 * like to clear the ilf_fields bits so we don't log and 3033 * flush things unnecessarily. However, we can't stop 3034 * logging all this information until the data we've copied 3035 * into the disk buffer is written to disk. If we did we might 3036 * overwrite the copy of the inode in the log with all the 3037 * data after re-logging only part of it, and in the face of 3038 * a crash we wouldn't have all the data we need to recover. 3039 * 3040 * What we do is move the bits to the ili_last_fields field. 3041 * When logging the inode, these bits are moved back to the 3042 * ilf_fields field. In the xfs_iflush_done() routine we 3043 * clear ili_last_fields, since we know that the information 3044 * those bits represent is permanently on disk. As long as 3045 * the flush completes before the inode is logged again, then 3046 * both ilf_fields and ili_last_fields will be cleared. 3047 * 3048 * We can play with the ilf_fields bits here, because the inode 3049 * lock must be held exclusively in order to set bits there 3050 * and the flush lock protects the ili_last_fields bits. 3051 * Set ili_logged so the flush done 3052 * routine can tell whether or not to look in the AIL. 3053 * Also, store the current LSN of the inode so that we can tell 3054 * whether the item has moved in the AIL from xfs_iflush_done(). 3055 * In order to read the lsn we need the AIL lock, because 3056 * it is a 64 bit value that cannot be read atomically. 3057 */ 3058 if (iip != NULL && iip->ili_format.ilf_fields != 0) { 3059 iip->ili_last_fields = iip->ili_format.ilf_fields; 3060 iip->ili_format.ilf_fields = 0; 3061 iip->ili_logged = 1; 3062 3063 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3064 &iip->ili_item.li_lsn); 3065 3066 /* 3067 * Attach the function xfs_iflush_done to the inode's 3068 * buffer. This will remove the inode from the AIL 3069 * and unlock the inode's flush lock when the inode is 3070 * completely written to disk. 3071 */ 3072 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) 3073 xfs_iflush_done, (xfs_log_item_t *)iip); 3074 3075 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 3076 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); 3077 } else { 3078 /* 3079 * We're flushing an inode which is not in the AIL and has 3080 * not been logged but has i_update_core set. For this 3081 * case we can use a B_DELWRI flush and immediately drop 3082 * the inode flush lock because we can avoid the whole 3083 * AIL state thing. It's OK to drop the flush lock now, 3084 * because we've already locked the buffer and to do anything 3085 * you really need both. 3086 */ 3087 if (iip != NULL) { 3088 ASSERT(iip->ili_logged == 0); 3089 ASSERT(iip->ili_last_fields == 0); 3090 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); 3091 } 3092 xfs_ifunlock(ip); 3093 } 3094 3095 return 0; 3096 3097 corrupt_out: 3098 return XFS_ERROR(EFSCORRUPTED); 3099 } 3100 3101 /* 3102 * Return a pointer to the extent record at file index idx. 
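 * The lookup below depends on how the fork currently stores its
 * extents: with an indirection array (XFS_IFEXTIREC) the index is
 * first mapped to an extent page and an offset within it; a direct
 * or inline list is plain array indexing; an empty fork returns
 * NULL.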
3103 */ 3104 xfs_bmbt_rec_host_t * 3105 xfs_iext_get_ext( 3106 xfs_ifork_t *ifp, /* inode fork pointer */ 3107 xfs_extnum_t idx) /* index of target extent */ 3108 { 3109 ASSERT(idx >= 0); 3110 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 3111 return ifp->if_u1.if_ext_irec->er_extbuf; 3112 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3113 xfs_ext_irec_t *erp; /* irec pointer */ 3114 int erp_idx = 0; /* irec index */ 3115 xfs_extnum_t page_idx = idx; /* ext index in target list */ 3116 3117 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3118 return &erp->er_extbuf[page_idx]; 3119 } else if (ifp->if_bytes) { 3120 return &ifp->if_u1.if_extents[idx]; 3121 } else { 3122 return NULL; 3123 } 3124 } 3125 3126 /* 3127 * Insert new item(s) into the extent records for incore inode 3128 * fork 'ifp'. 'count' new items are inserted at index 'idx'. 3129 */ 3130 void 3131 xfs_iext_insert( 3132 xfs_inode_t *ip, /* incore inode pointer */ 3133 xfs_extnum_t idx, /* starting index of new items */ 3134 xfs_extnum_t count, /* number of inserted items */ 3135 xfs_bmbt_irec_t *new, /* items to insert */ 3136 int state) /* type of extent conversion */ 3137 { 3138 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 3139 xfs_extnum_t i; /* extent record index */ 3140 3141 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); 3142 3143 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3144 xfs_iext_add(ifp, idx, count); 3145 for (i = idx; i < idx + count; i++, new++) 3146 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); 3147 } 3148 3149 /* 3150 * This is called when the amount of space required for incore file 3151 * extents needs to be increased. The ext_diff parameter stores the 3152 * number of new extents being added and the idx parameter contains 3153 * the extent index where the new extents will be added. If the new 3154 * extents are being appended, then we just need to (re)allocate and 3155 * initialize the space. Otherwise, if the new extents are being 3156 * inserted into the middle of the existing entries, a bit more work 3157 * is required to make room for the new extents to be inserted. The 3158 * caller is responsible for filling in the new extent entries upon 3159 * return. 3160 */ 3161 void 3162 xfs_iext_add( 3163 xfs_ifork_t *ifp, /* inode fork pointer */ 3164 xfs_extnum_t idx, /* index to begin adding exts */ 3165 int ext_diff) /* number of extents to add */ 3166 { 3167 int byte_diff; /* new bytes being added */ 3168 int new_size; /* size of extents after adding */ 3169 xfs_extnum_t nextents; /* number of extents in file */ 3170 3171 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3172 ASSERT((idx >= 0) && (idx <= nextents)); 3173 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); 3174 new_size = ifp->if_bytes + byte_diff; 3175 /* 3176 * If the new number of extents (nextents + ext_diff) 3177 * fits inside the inode, then continue to use the inline 3178 * extent buffer. 3179 */ 3180 if (nextents + ext_diff <= XFS_INLINE_EXTS) { 3181 if (idx < nextents) { 3182 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], 3183 &ifp->if_u2.if_inline_ext[idx], 3184 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 3185 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); 3186 } 3187 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3188 ifp->if_real_bytes = 0; 3189 ifp->if_lastex = nextents + ext_diff; 3190 } 3191 /* 3192 * Otherwise use a linear (direct) extent list. 
3193 * If the extents are currently inside the inode, 3194 * xfs_iext_realloc_direct will switch us from 3195 * inline to direct extent allocation mode. 3196 */ 3197 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { 3198 xfs_iext_realloc_direct(ifp, new_size); 3199 if (idx < nextents) { 3200 memmove(&ifp->if_u1.if_extents[idx + ext_diff], 3201 &ifp->if_u1.if_extents[idx], 3202 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 3203 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); 3204 } 3205 } 3206 /* Indirection array */ 3207 else { 3208 xfs_ext_irec_t *erp; 3209 int erp_idx = 0; 3210 int page_idx = idx; 3211 3212 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); 3213 if (ifp->if_flags & XFS_IFEXTIREC) { 3214 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); 3215 } else { 3216 xfs_iext_irec_init(ifp); 3217 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3218 erp = ifp->if_u1.if_ext_irec; 3219 } 3220 /* Extents fit in target extent page */ 3221 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { 3222 if (page_idx < erp->er_extcount) { 3223 memmove(&erp->er_extbuf[page_idx + ext_diff], 3224 &erp->er_extbuf[page_idx], 3225 (erp->er_extcount - page_idx) * 3226 sizeof(xfs_bmbt_rec_t)); 3227 memset(&erp->er_extbuf[page_idx], 0, byte_diff); 3228 } 3229 erp->er_extcount += ext_diff; 3230 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 3231 } 3232 /* Insert a new extent page */ 3233 else if (erp) { 3234 xfs_iext_add_indirect_multi(ifp, 3235 erp_idx, page_idx, ext_diff); 3236 } 3237 /* 3238 * If extent(s) are being appended to the last page in 3239 * the indirection array and the new extent(s) don't fit 3240 * in the page, then erp is NULL and erp_idx is set to 3241 * the next index needed in the indirection array. 3242 */ 3243 else { 3244 int count = ext_diff; 3245 3246 while (count) { 3247 erp = xfs_iext_irec_new(ifp, erp_idx); 3248 erp->er_extcount = count; 3249 count -= MIN(count, (int)XFS_LINEAR_EXTS); 3250 if (count) { 3251 erp_idx++; 3252 } 3253 } 3254 } 3255 } 3256 ifp->if_bytes = new_size; 3257 } 3258 3259 /* 3260 * This is called when incore extents are being added to the indirection 3261 * array and the new extents do not fit in the target extent list. The 3262 * erp_idx parameter contains the irec index for the target extent list 3263 * in the indirection array, and the idx parameter contains the extent 3264 * index within the list. The number of extents being added is stored 3265 * in the count parameter. 
3266  *
3267  *    |-------|   |-------|
3268  *    |       |   |       |    idx - number of extents before idx
3269  *    |  idx  |   | count |
3270  *    |       |   |       |    count - number of extents being inserted at idx
3271  *    |-------|   |-------|
3272  *    | count |   | nex2  |    nex2 - number of extents after idx + count
3273  *    |-------|   |-------|
3274  */
3275 void
3276 xfs_iext_add_indirect_multi(
3277 	xfs_ifork_t	*ifp,			/* inode fork pointer */
3278 	int		erp_idx,		/* target extent irec index */
3279 	xfs_extnum_t	idx,			/* index within target list */
3280 	int		count)			/* new extents being added */
3281 {
3282 	int		byte_diff;		/* new bytes being added */
3283 	xfs_ext_irec_t	*erp;			/* pointer to irec entry */
3284 	xfs_extnum_t	ext_diff;		/* number of extents to add */
3285 	xfs_extnum_t	ext_cnt;		/* new extents still needed */
3286 	xfs_extnum_t	nex2;			/* extents after idx + count */
3287 	xfs_bmbt_rec_t	*nex2_ep = NULL;	/* temp list for nex2 extents */
3288 	int		nlists;			/* number of irec's (lists) */
3289
3290 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3291 	erp = &ifp->if_u1.if_ext_irec[erp_idx];
3292 	nex2 = erp->er_extcount - idx;
3293 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3294
3295 	/*
3296 	 * Save the second part of the target extent list
3297 	 * (all extents at and after idx). */
3298 	if (nex2) {
3299 		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3300 		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3301 		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3302 		erp->er_extcount -= nex2;
3303 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3304 		memset(&erp->er_extbuf[idx], 0, byte_diff);
3305 	}
3306
3307 	/*
3308 	 * Add the new extents to the end of the target
3309 	 * list, then allocate new irec record(s) and
3310 	 * extent buffer(s) as needed to store the rest
3311 	 * of the new extents.
3312 	 */
3313 	ext_cnt = count;
3314 	ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3315 	if (ext_diff) {
3316 		erp->er_extcount += ext_diff;
3317 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3318 		ext_cnt -= ext_diff;
3319 	}
3320 	while (ext_cnt) {
3321 		erp_idx++;
3322 		erp = xfs_iext_irec_new(ifp, erp_idx);
3323 		ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3324 		erp->er_extcount = ext_diff;
3325 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3326 		ext_cnt -= ext_diff;
3327 	}
3328
3329 	/* Add nex2 extents back to indirection array */
3330 	if (nex2) {
3331 		xfs_extnum_t	ext_avail;
3332 		int		i;
3333
3334 		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3335 		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3336 		i = 0;
3337 		/*
3338 		 * If nex2 extents fit in the current page, append
3339 		 * nex2_ep after the new extents.
3340 		 */
3341 		if (nex2 <= ext_avail) {
3342 			i = erp->er_extcount;
3343 		}
3344 		/*
3345 		 * Otherwise, check if space is available in the
3346 		 * next page.
3347 		 */
3348 		else if ((erp_idx < nlists - 1) &&
3349 			 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3350 			  ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3351 			erp_idx++;
3352 			erp++;
3353 			/* Create a hole for nex2 extents */
3354 			memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3355 				erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3356 		}
3357 		/*
3358 		 * Final choice, create a new extent page for
3359 		 * nex2 extents.
3360 		 */
3361 		else {
3362 			erp_idx++;
3363 			erp = xfs_iext_irec_new(ifp, erp_idx);
3364 		}
3365 		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
3366 		kmem_free(nex2_ep);
3367 		erp->er_extcount += nex2;
3368 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
3369 	}
3370 }
3371
3372 /*
3373  * This is called when the amount of space required for incore file
3374  * extents needs to be decreased.
The ext_diff parameter stores the 3375 * number of extents to be removed and the idx parameter contains 3376 * the extent index where the extents will be removed from. 3377 * 3378 * If the amount of space needed has decreased below the linear 3379 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous 3380 * extent array. Otherwise, use kmem_realloc() to adjust the 3381 * size to what is needed. 3382 */ 3383 void 3384 xfs_iext_remove( 3385 xfs_inode_t *ip, /* incore inode pointer */ 3386 xfs_extnum_t idx, /* index to begin removing exts */ 3387 int ext_diff, /* number of extents to remove */ 3388 int state) /* type of extent conversion */ 3389 { 3390 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 3391 xfs_extnum_t nextents; /* number of extents in file */ 3392 int new_size; /* size of extents after removal */ 3393 3394 trace_xfs_iext_remove(ip, idx, state, _RET_IP_); 3395 3396 ASSERT(ext_diff > 0); 3397 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3398 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); 3399 3400 if (new_size == 0) { 3401 xfs_iext_destroy(ifp); 3402 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3403 xfs_iext_remove_indirect(ifp, idx, ext_diff); 3404 } else if (ifp->if_real_bytes) { 3405 xfs_iext_remove_direct(ifp, idx, ext_diff); 3406 } else { 3407 xfs_iext_remove_inline(ifp, idx, ext_diff); 3408 } 3409 ifp->if_bytes = new_size; 3410 } 3411 3412 /* 3413 * This removes ext_diff extents from the inline buffer, beginning 3414 * at extent index idx. 3415 */ 3416 void 3417 xfs_iext_remove_inline( 3418 xfs_ifork_t *ifp, /* inode fork pointer */ 3419 xfs_extnum_t idx, /* index to begin removing exts */ 3420 int ext_diff) /* number of extents to remove */ 3421 { 3422 int nextents; /* number of extents in file */ 3423 3424 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3425 ASSERT(idx < XFS_INLINE_EXTS); 3426 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3427 ASSERT(((nextents - ext_diff) > 0) && 3428 (nextents - ext_diff) < XFS_INLINE_EXTS); 3429 3430 if (idx + ext_diff < nextents) { 3431 memmove(&ifp->if_u2.if_inline_ext[idx], 3432 &ifp->if_u2.if_inline_ext[idx + ext_diff], 3433 (nextents - (idx + ext_diff)) * 3434 sizeof(xfs_bmbt_rec_t)); 3435 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], 3436 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3437 } else { 3438 memset(&ifp->if_u2.if_inline_ext[idx], 0, 3439 ext_diff * sizeof(xfs_bmbt_rec_t)); 3440 } 3441 } 3442 3443 /* 3444 * This removes ext_diff extents from a linear (direct) extent list, 3445 * beginning at extent index idx. If the extents are being removed 3446 * from the end of the list (ie. truncate) then we just need to re- 3447 * allocate the list to remove the extra space. Otherwise, if the 3448 * extents are being removed from the middle of the existing extent 3449 * entries, then we first need to move the extent records beginning 3450 * at idx + ext_diff up in the list to overwrite the records being 3451 * removed, then remove the extra space via kmem_realloc. 
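 *
 * For example, removing two extents at index 1 from
 * [e0 e1 e2 e3 e4] moves e3 and e4 down over e1 and e2, zeroes the
 * two stale slots at the end, and then shrinks the allocation so
 * the list holds [e0 e3 e4].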
3452 */ 3453 void 3454 xfs_iext_remove_direct( 3455 xfs_ifork_t *ifp, /* inode fork pointer */ 3456 xfs_extnum_t idx, /* index to begin removing exts */ 3457 int ext_diff) /* number of extents to remove */ 3458 { 3459 xfs_extnum_t nextents; /* number of extents in file */ 3460 int new_size; /* size of extents after removal */ 3461 3462 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3463 new_size = ifp->if_bytes - 3464 (ext_diff * sizeof(xfs_bmbt_rec_t)); 3465 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3466 3467 if (new_size == 0) { 3468 xfs_iext_destroy(ifp); 3469 return; 3470 } 3471 /* Move extents up in the list (if needed) */ 3472 if (idx + ext_diff < nextents) { 3473 memmove(&ifp->if_u1.if_extents[idx], 3474 &ifp->if_u1.if_extents[idx + ext_diff], 3475 (nextents - (idx + ext_diff)) * 3476 sizeof(xfs_bmbt_rec_t)); 3477 } 3478 memset(&ifp->if_u1.if_extents[nextents - ext_diff], 3479 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3480 /* 3481 * Reallocate the direct extent list. If the extents 3482 * will fit inside the inode then xfs_iext_realloc_direct 3483 * will switch from direct to inline extent allocation 3484 * mode for us. 3485 */ 3486 xfs_iext_realloc_direct(ifp, new_size); 3487 ifp->if_bytes = new_size; 3488 } 3489 3490 /* 3491 * This is called when incore extents are being removed from the 3492 * indirection array and the extents being removed span multiple extent 3493 * buffers. The idx parameter contains the file extent index where we 3494 * want to begin removing extents, and the count parameter contains 3495 * how many extents need to be removed. 3496 * 3497 * |-------| |-------| 3498 * | nex1 | | | nex1 - number of extents before idx 3499 * |-------| | count | 3500 * | | | | count - number of extents being removed at idx 3501 * | count | |-------| 3502 * | | | nex2 | nex2 - number of extents after idx + count 3503 * |-------| |-------| 3504 */ 3505 void 3506 xfs_iext_remove_indirect( 3507 xfs_ifork_t *ifp, /* inode fork pointer */ 3508 xfs_extnum_t idx, /* index to begin removing extents */ 3509 int count) /* number of extents to remove */ 3510 { 3511 xfs_ext_irec_t *erp; /* indirection array pointer */ 3512 int erp_idx = 0; /* indirection array index */ 3513 xfs_extnum_t ext_cnt; /* extents left to remove */ 3514 xfs_extnum_t ext_diff; /* extents to remove in current list */ 3515 xfs_extnum_t nex1; /* number of extents before idx */ 3516 xfs_extnum_t nex2; /* extents after idx + count */ 3517 int nlists; /* entries in indirection array */ 3518 int page_idx = idx; /* index in target extent list */ 3519 3520 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3521 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3522 ASSERT(erp != NULL); 3523 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3524 nex1 = page_idx; 3525 ext_cnt = count; 3526 while (ext_cnt) { 3527 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); 3528 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); 3529 /* 3530 * Check for deletion of entire list; 3531 * xfs_iext_irec_remove() updates extent offsets. 
3532 */ 3533 if (ext_diff == erp->er_extcount) { 3534 xfs_iext_irec_remove(ifp, erp_idx); 3535 ext_cnt -= ext_diff; 3536 nex1 = 0; 3537 if (ext_cnt) { 3538 ASSERT(erp_idx < ifp->if_real_bytes / 3539 XFS_IEXT_BUFSZ); 3540 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3541 nex1 = 0; 3542 continue; 3543 } else { 3544 break; 3545 } 3546 } 3547 /* Move extents up (if needed) */ 3548 if (nex2) { 3549 memmove(&erp->er_extbuf[nex1], 3550 &erp->er_extbuf[nex1 + ext_diff], 3551 nex2 * sizeof(xfs_bmbt_rec_t)); 3552 } 3553 /* Zero out rest of page */ 3554 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - 3555 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); 3556 /* Update remaining counters */ 3557 erp->er_extcount -= ext_diff; 3558 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); 3559 ext_cnt -= ext_diff; 3560 nex1 = 0; 3561 erp_idx++; 3562 erp++; 3563 } 3564 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); 3565 xfs_iext_irec_compact(ifp); 3566 } 3567 3568 /* 3569 * Create, destroy, or resize a linear (direct) block of extents. 3570 */ 3571 void 3572 xfs_iext_realloc_direct( 3573 xfs_ifork_t *ifp, /* inode fork pointer */ 3574 int new_size) /* new size of extents */ 3575 { 3576 int rnew_size; /* real new size of extents */ 3577 3578 rnew_size = new_size; 3579 3580 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || 3581 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && 3582 (new_size != ifp->if_real_bytes))); 3583 3584 /* Free extent records */ 3585 if (new_size == 0) { 3586 xfs_iext_destroy(ifp); 3587 } 3588 /* Resize direct extent list and zero any new bytes */ 3589 else if (ifp->if_real_bytes) { 3590 /* Check if extents will fit inside the inode */ 3591 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { 3592 xfs_iext_direct_to_inline(ifp, new_size / 3593 (uint)sizeof(xfs_bmbt_rec_t)); 3594 ifp->if_bytes = new_size; 3595 return; 3596 } 3597 if (!is_power_of_2(new_size)){ 3598 rnew_size = roundup_pow_of_two(new_size); 3599 } 3600 if (rnew_size != ifp->if_real_bytes) { 3601 ifp->if_u1.if_extents = 3602 kmem_realloc(ifp->if_u1.if_extents, 3603 rnew_size, 3604 ifp->if_real_bytes, KM_NOFS); 3605 } 3606 if (rnew_size > ifp->if_real_bytes) { 3607 memset(&ifp->if_u1.if_extents[ifp->if_bytes / 3608 (uint)sizeof(xfs_bmbt_rec_t)], 0, 3609 rnew_size - ifp->if_real_bytes); 3610 } 3611 } 3612 /* 3613 * Switch from the inline extent buffer to a direct 3614 * extent list. Be sure to include the inline extent 3615 * bytes in new_size. 3616 */ 3617 else { 3618 new_size += ifp->if_bytes; 3619 if (!is_power_of_2(new_size)) { 3620 rnew_size = roundup_pow_of_two(new_size); 3621 } 3622 xfs_iext_inline_to_direct(ifp, rnew_size); 3623 } 3624 ifp->if_real_bytes = rnew_size; 3625 ifp->if_bytes = new_size; 3626 } 3627 3628 /* 3629 * Switch from linear (direct) extent records to inline buffer. 3630 */ 3631 void 3632 xfs_iext_direct_to_inline( 3633 xfs_ifork_t *ifp, /* inode fork pointer */ 3634 xfs_extnum_t nextents) /* number of extents in file */ 3635 { 3636 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3637 ASSERT(nextents <= XFS_INLINE_EXTS); 3638 /* 3639 * The inline buffer was zeroed when we switched 3640 * from inline to direct extent allocation mode, 3641 * so we don't need to clear it here. 3642 */ 3643 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, 3644 nextents * sizeof(xfs_bmbt_rec_t)); 3645 kmem_free(ifp->if_u1.if_extents); 3646 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3647 ifp->if_real_bytes = 0; 3648 } 3649 3650 /* 3651 * Switch from inline buffer to linear (direct) extent records. 
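 * (Editorial example of the caller's sizing, assuming the usual 16 byte
 * xfs_bmbt_rec_t: growing a fork to five records needs 5 * 16 = 80
 * bytes, which xfs_iext_realloc_direct() above rounds up to a 128 byte
 * allocation before calling down here.)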
3652 * new_size should already be rounded up to the next power of 2 3653 * by the caller (when appropriate), so use new_size as it is. 3654 * However, since new_size may be rounded up, we can't update 3655 * if_bytes here. It is the caller's responsibility to update 3656 * if_bytes upon return. 3657 */ 3658 void 3659 xfs_iext_inline_to_direct( 3660 xfs_ifork_t *ifp, /* inode fork pointer */ 3661 int new_size) /* number of extents in file */ 3662 { 3663 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); 3664 memset(ifp->if_u1.if_extents, 0, new_size); 3665 if (ifp->if_bytes) { 3666 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 3667 ifp->if_bytes); 3668 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3669 sizeof(xfs_bmbt_rec_t)); 3670 } 3671 ifp->if_real_bytes = new_size; 3672 } 3673 3674 /* 3675 * Resize an extent indirection array to new_size bytes. 3676 */ 3677 STATIC void 3678 xfs_iext_realloc_indirect( 3679 xfs_ifork_t *ifp, /* inode fork pointer */ 3680 int new_size) /* new indirection array size */ 3681 { 3682 int nlists; /* number of irec's (ex lists) */ 3683 int size; /* current indirection array size */ 3684 3685 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3686 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3687 size = nlists * sizeof(xfs_ext_irec_t); 3688 ASSERT(ifp->if_real_bytes); 3689 ASSERT((new_size >= 0) && (new_size != size)); 3690 if (new_size == 0) { 3691 xfs_iext_destroy(ifp); 3692 } else { 3693 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) 3694 kmem_realloc(ifp->if_u1.if_ext_irec, 3695 new_size, size, KM_NOFS); 3696 } 3697 } 3698 3699 /* 3700 * Switch from indirection array to linear (direct) extent allocations. 3701 */ 3702 STATIC void 3703 xfs_iext_indirect_to_direct( 3704 xfs_ifork_t *ifp) /* inode fork pointer */ 3705 { 3706 xfs_bmbt_rec_host_t *ep; /* extent record pointer */ 3707 xfs_extnum_t nextents; /* number of extents in file */ 3708 int size; /* size of file extents */ 3709 3710 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3711 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3712 ASSERT(nextents <= XFS_LINEAR_EXTS); 3713 size = nextents * sizeof(xfs_bmbt_rec_t); 3714 3715 xfs_iext_irec_compact_pages(ifp); 3716 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 3717 3718 ep = ifp->if_u1.if_ext_irec->er_extbuf; 3719 kmem_free(ifp->if_u1.if_ext_irec); 3720 ifp->if_flags &= ~XFS_IFEXTIREC; 3721 ifp->if_u1.if_extents = ep; 3722 ifp->if_bytes = size; 3723 if (nextents < XFS_LINEAR_EXTS) { 3724 xfs_iext_realloc_direct(ifp, size); 3725 } 3726 } 3727 3728 /* 3729 * Free incore file extents. 3730 */ 3731 void 3732 xfs_iext_destroy( 3733 xfs_ifork_t *ifp) /* inode fork pointer */ 3734 { 3735 if (ifp->if_flags & XFS_IFEXTIREC) { 3736 int erp_idx; 3737 int nlists; 3738 3739 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3740 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { 3741 xfs_iext_irec_remove(ifp, erp_idx); 3742 } 3743 ifp->if_flags &= ~XFS_IFEXTIREC; 3744 } else if (ifp->if_real_bytes) { 3745 kmem_free(ifp->if_u1.if_extents); 3746 } else if (ifp->if_bytes) { 3747 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3748 sizeof(xfs_bmbt_rec_t)); 3749 } 3750 ifp->if_u1.if_extents = NULL; 3751 ifp->if_real_bytes = 0; 3752 ifp->if_bytes = 0; 3753 } 3754 3755 /* 3756 * Return a pointer to the extent record for file system block bno. 
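 * (Editorial summary of the miss case below: if no record contains bno,
 * the routine returns the first record that starts beyond bno, or NULL,
 * with *idxp set to the extent count, when bno lies past the end of the
 * last extent.)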
3757 */ 3758 xfs_bmbt_rec_host_t * /* pointer to found extent record */ 3759 xfs_iext_bno_to_ext( 3760 xfs_ifork_t *ifp, /* inode fork pointer */ 3761 xfs_fileoff_t bno, /* block number to search for */ 3762 xfs_extnum_t *idxp) /* index of target extent */ 3763 { 3764 xfs_bmbt_rec_host_t *base; /* pointer to first extent */ 3765 xfs_filblks_t blockcount = 0; /* number of blocks in extent */ 3766 xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ 3767 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3768 int high; /* upper boundary in search */ 3769 xfs_extnum_t idx = 0; /* index of target extent */ 3770 int low; /* lower boundary in search */ 3771 xfs_extnum_t nextents; /* number of file extents */ 3772 xfs_fileoff_t startoff = 0; /* start offset of extent */ 3773 3774 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3775 if (nextents == 0) { 3776 *idxp = 0; 3777 return NULL; 3778 } 3779 low = 0; 3780 if (ifp->if_flags & XFS_IFEXTIREC) { 3781 /* Find target extent list */ 3782 int erp_idx = 0; 3783 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); 3784 base = erp->er_extbuf; 3785 high = erp->er_extcount - 1; 3786 } else { 3787 base = ifp->if_u1.if_extents; 3788 high = nextents - 1; 3789 } 3790 /* Binary search extent records */ 3791 while (low <= high) { 3792 idx = (low + high) >> 1; 3793 ep = base + idx; 3794 startoff = xfs_bmbt_get_startoff(ep); 3795 blockcount = xfs_bmbt_get_blockcount(ep); 3796 if (bno < startoff) { 3797 high = idx - 1; 3798 } else if (bno >= startoff + blockcount) { 3799 low = idx + 1; 3800 } else { 3801 /* Convert back to file-based extent index */ 3802 if (ifp->if_flags & XFS_IFEXTIREC) { 3803 idx += erp->er_extoff; 3804 } 3805 *idxp = idx; 3806 return ep; 3807 } 3808 } 3809 /* Convert back to file-based extent index */ 3810 if (ifp->if_flags & XFS_IFEXTIREC) { 3811 idx += erp->er_extoff; 3812 } 3813 if (bno >= startoff + blockcount) { 3814 if (++idx == nextents) { 3815 ep = NULL; 3816 } else { 3817 ep = xfs_iext_get_ext(ifp, idx); 3818 } 3819 } 3820 *idxp = idx; 3821 return ep; 3822 } 3823 3824 /* 3825 * Return a pointer to the indirection array entry containing the 3826 * extent record for filesystem block bno. Store the index of the 3827 * target irec in *erp_idxp. 3828 */ 3829 xfs_ext_irec_t * /* pointer to found extent record */ 3830 xfs_iext_bno_to_irec( 3831 xfs_ifork_t *ifp, /* inode fork pointer */ 3832 xfs_fileoff_t bno, /* block number to search for */ 3833 int *erp_idxp) /* irec index of target ext list */ 3834 { 3835 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3836 xfs_ext_irec_t *erp_next; /* next indirection array entry */ 3837 int erp_idx; /* indirection array index */ 3838 int nlists; /* number of extent irec's (lists) */ 3839 int high; /* binary search upper limit */ 3840 int low; /* binary search lower limit */ 3841 3842 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3843 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3844 erp_idx = 0; 3845 low = 0; 3846 high = nlists - 1; 3847 while (low <= high) { 3848 erp_idx = (low + high) >> 1; 3849 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3850 erp_next = erp_idx < nlists - 1 ? 
erp + 1 : NULL; 3851 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { 3852 high = erp_idx - 1; 3853 } else if (erp_next && bno >= 3854 xfs_bmbt_get_startoff(erp_next->er_extbuf)) { 3855 low = erp_idx + 1; 3856 } else { 3857 break; 3858 } 3859 } 3860 *erp_idxp = erp_idx; 3861 return erp; 3862 } 3863 3864 /* 3865 * Return a pointer to the indirection array entry containing the 3866 * extent record at file extent index *idxp. Store the index of the 3867 * target irec in *erp_idxp and store the page index of the target 3868 * extent record in *idxp. 3869 */ 3870 xfs_ext_irec_t * 3871 xfs_iext_idx_to_irec( 3872 xfs_ifork_t *ifp, /* inode fork pointer */ 3873 xfs_extnum_t *idxp, /* extent index (file -> page) */ 3874 int *erp_idxp, /* pointer to target irec */ 3875 int realloc) /* new bytes were just added */ 3876 { 3877 xfs_ext_irec_t *prev; /* pointer to previous irec */ 3878 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ 3879 int erp_idx; /* indirection array index */ 3880 int nlists; /* number of irec's (ex lists) */ 3881 int high; /* binary search upper limit */ 3882 int low; /* binary search lower limit */ 3883 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 3884 3885 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3886 ASSERT(page_idx >= 0 && page_idx <= 3887 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 3888 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3889 erp_idx = 0; 3890 low = 0; 3891 high = nlists - 1; 3892 3893 /* Binary search extent irec's */ 3894 while (low <= high) { 3895 erp_idx = (low + high) >> 1; 3896 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3897 prev = erp_idx > 0 ? erp - 1 : NULL; 3898 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && 3899 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { 3900 high = erp_idx - 1; 3901 } else if (page_idx > erp->er_extoff + erp->er_extcount || 3902 (page_idx == erp->er_extoff + erp->er_extcount && 3903 !realloc)) { 3904 low = erp_idx + 1; 3905 } else if (page_idx == erp->er_extoff + erp->er_extcount && 3906 erp->er_extcount == XFS_LINEAR_EXTS) { 3907 ASSERT(realloc); 3908 page_idx = 0; 3909 erp_idx++; 3910 erp = erp_idx < nlists ? erp + 1 : NULL; 3911 break; 3912 } else { 3913 page_idx -= erp->er_extoff; 3914 break; 3915 } 3916 } 3917 *idxp = page_idx; 3918 *erp_idxp = erp_idx; 3919 return(erp); 3920 } 3921 3922 /* 3923 * Allocate and initialize an indirection array once the space needed 3924 * for incore extents increases above XFS_IEXT_BUFSZ. 
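 * (Editorial summary: on return there is exactly one indirection entry;
 * its er_extbuf points at a full XFS_IEXT_BUFSZ buffer holding the old
 * records, er_extcount is the old extent count and er_extoff is 0.)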
3925 */ 3926 void 3927 xfs_iext_irec_init( 3928 xfs_ifork_t *ifp) /* inode fork pointer */ 3929 { 3930 xfs_ext_irec_t *erp; /* indirection array pointer */ 3931 xfs_extnum_t nextents; /* number of extents in file */ 3932 3933 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3934 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3935 ASSERT(nextents <= XFS_LINEAR_EXTS); 3936 3937 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); 3938 3939 if (nextents == 0) { 3940 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); 3941 } else if (!ifp->if_real_bytes) { 3942 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); 3943 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { 3944 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); 3945 } 3946 erp->er_extbuf = ifp->if_u1.if_extents; 3947 erp->er_extcount = nextents; 3948 erp->er_extoff = 0; 3949 3950 ifp->if_flags |= XFS_IFEXTIREC; 3951 ifp->if_real_bytes = XFS_IEXT_BUFSZ; 3952 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); 3953 ifp->if_u1.if_ext_irec = erp; 3954 3955 return; 3956 } 3957 3958 /* 3959 * Allocate and initialize a new entry in the indirection array. 3960 */ 3961 xfs_ext_irec_t * 3962 xfs_iext_irec_new( 3963 xfs_ifork_t *ifp, /* inode fork pointer */ 3964 int erp_idx) /* index for new irec */ 3965 { 3966 xfs_ext_irec_t *erp; /* indirection array pointer */ 3967 int i; /* loop counter */ 3968 int nlists; /* number of irec's (ex lists) */ 3969 3970 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3971 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3972 3973 /* Resize indirection array */ 3974 xfs_iext_realloc_indirect(ifp, ++nlists * 3975 sizeof(xfs_ext_irec_t)); 3976 /* 3977 * Move records down in the array so the 3978 * new page can use erp_idx. 3979 */ 3980 erp = ifp->if_u1.if_ext_irec; 3981 for (i = nlists - 1; i > erp_idx; i--) { 3982 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); 3983 } 3984 ASSERT(i == erp_idx); 3985 3986 /* Initialize new extent record */ 3987 erp = ifp->if_u1.if_ext_irec; 3988 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); 3989 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 3990 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); 3991 erp[erp_idx].er_extcount = 0; 3992 erp[erp_idx].er_extoff = erp_idx > 0 ? 3993 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; 3994 return (&erp[erp_idx]); 3995 } 3996 3997 /* 3998 * Remove a record from the indirection array. 3999 */ 4000 void 4001 xfs_iext_irec_remove( 4002 xfs_ifork_t *ifp, /* inode fork pointer */ 4003 int erp_idx) /* irec index to remove */ 4004 { 4005 xfs_ext_irec_t *erp; /* indirection array pointer */ 4006 int i; /* loop counter */ 4007 int nlists; /* number of irec's (ex lists) */ 4008 4009 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 4010 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 4011 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 4012 if (erp->er_extbuf) { 4013 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, 4014 -erp->er_extcount); 4015 kmem_free(erp->er_extbuf); 4016 } 4017 /* Compact extent records */ 4018 erp = ifp->if_u1.if_ext_irec; 4019 for (i = erp_idx; i < nlists - 1; i++) { 4020 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); 4021 } 4022 /* 4023 * Manually free the last extent record from the indirection 4024 * array. A call to xfs_iext_realloc_indirect() with a size 4025 * of zero would result in a call to xfs_iext_destroy() which 4026 * would in turn call this function again, creating a nasty 4027 * infinite loop. 
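 * (Editorial note: hence the code below shrinks the array with
 * xfs_iext_realloc_indirect() only while at least one list remains and
 * frees it with kmem_free() directly once --nlists reaches zero.)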
4028 */
4029 	if (--nlists) {
4030 		xfs_iext_realloc_indirect(ifp,
4031 			nlists * sizeof(xfs_ext_irec_t));
4032 	} else {
4033 		kmem_free(ifp->if_u1.if_ext_irec);
4034 	}
4035 	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
4036 }
4037 
4038 /*
4039 * This is called to clean up large amounts of unused memory allocated
4040 * by the indirection array. Before compacting anything though, verify
4041 * that the indirection array is still needed and switch back to the
4042 * linear extent list (or even the inline buffer) if possible. The
4043 * compaction policy is as follows:
4044 *
4045 * Full Compaction: Extents fit into a single page (or inline buffer)
4046 * Partial Compaction: Extents occupy less than 50% of allocated space
4047 * No Compaction: Extents occupy at least 50% of allocated space
4048 */
4049 void
4050 xfs_iext_irec_compact(
4051 	xfs_ifork_t	*ifp)		/* inode fork pointer */
4052 {
4053 	xfs_extnum_t	nextents;	/* number of extents in file */
4054 	int		nlists;		/* number of irec's (ex lists) */
4055 
4056 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4057 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4058 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4059 
4060 	if (nextents == 0) {
4061 		xfs_iext_destroy(ifp);
4062 	} else if (nextents <= XFS_INLINE_EXTS) {
4063 		xfs_iext_indirect_to_direct(ifp);
4064 		xfs_iext_direct_to_inline(ifp, nextents);
4065 	} else if (nextents <= XFS_LINEAR_EXTS) {
4066 		xfs_iext_indirect_to_direct(ifp);
4067 	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
4068 		xfs_iext_irec_compact_pages(ifp);
4069 	}
4070 }
4071 
4072 /*
4073 * Combine extents from neighboring extent pages.
4074 */
4075 void
4076 xfs_iext_irec_compact_pages(
4077 	xfs_ifork_t	*ifp)		/* inode fork pointer */
4078 {
4079 	xfs_ext_irec_t	*erp, *erp_next; /* pointers to irec entries */
4080 	int		erp_idx = 0;	/* indirection array index */
4081 	int		nlists;		/* number of irec's (ex lists) */
4082 
4083 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4084 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4085 	while (erp_idx < nlists - 1) {
4086 		erp = &ifp->if_u1.if_ext_irec[erp_idx];
4087 		erp_next = erp + 1;
4088 		if (erp_next->er_extcount <=
4089 		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
4090 			memcpy(&erp->er_extbuf[erp->er_extcount],
4091 				erp_next->er_extbuf, erp_next->er_extcount *
4092 				sizeof(xfs_bmbt_rec_t));
4093 			erp->er_extcount += erp_next->er_extcount;
4094 			/*
4095 			 * Free page before removing extent record
4096 			 * so er_extoffs don't get modified in
4097 			 * xfs_iext_irec_remove.
4098 			 */
4099 			kmem_free(erp_next->er_extbuf);
4100 			erp_next->er_extbuf = NULL;
4101 			xfs_iext_irec_remove(ifp, erp_idx + 1);
4102 			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4103 		} else {
4104 			erp_idx++;
4105 		}
4106 	}
4107 }
4108 
4109 /*
4110 * This is called to update the er_extoff field in the indirection
4111 * array when extents have been added or removed from one of the
4112 * extent lists. erp_idx contains the irec index to begin updating
4113 * at and ext_diff contains the number of extents that were added
4114 * or removed.
4115 */
4116 void
4117 xfs_iext_irec_update_extoffs(
4118 	xfs_ifork_t	*ifp,		/* inode fork pointer */
4119 	int		erp_idx,	/* irec index to update */
4120 	int		ext_diff)	/* number of new extents */
4121 {
4122 	int		i;		/* loop counter */
4123 	int		nlists;		/* number of irec's (ex lists) */
4124 
4125 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4126 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4127 	for (i = erp_idx; i < nlists; i++) {
4128 		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
4129 	}
4130 }
4131 
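/*
 * Editorial usage sketch, not part of the original source and kept
 * inside "#if 0" so it is never compiled. It shows how a caller can
 * walk the incore extent list through the helpers above without caring
 * whether the records live in the inline buffer, the direct list, or
 * the indirection array. The function name xfs_iext_print_extents is
 * hypothetical; xfs_iext_get_ext(), xfs_bmbt_get_all() and cmn_err()
 * are the existing interfaces used elsewhere in this file.
 */
#if 0
STATIC void
xfs_iext_print_extents(
	xfs_inode_t	*ip)		/* incore inode pointer */
{
	xfs_ifork_t	*ifp = &ip->i_df;	/* data fork only */
	xfs_bmbt_irec_t	irec;		/* unpacked extent record */
	xfs_extnum_t	idx;		/* extent index */
	xfs_extnum_t	nextents;	/* number of extents in fork */

	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	for (idx = 0; idx < nextents; idx++) {
		/* xfs_iext_get_ext() hides the underlying layout */
		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &irec);
		cmn_err(CE_DEBUG, "extent %d: startoff %lld blockcount %lld",
			(int)idx,
			(long long)irec.br_startoff,
			(long long)irec.br_blockcount);
	}
}
#endif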