/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/log2.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"

kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define	XFS_ITRUNC_MAX_EXTENTS	2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);

/*
 * helper function to extract extent size hint from inode
 */
xfs_extlen_t
xfs_get_extsz_hint(
	struct xfs_inode	*ip)
{
	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
		return ip->i_d.di_extsize;
	if (XFS_IS_REALTIME_INODE(ip))
		return ip->i_mount->m_sb.sb_rextsize;
	return 0;
}

#ifdef DEBUG
/*
 * Make sure that the extents in the given memory buffer
 * are valid.
 */
STATIC void
xfs_validate_extents(
	xfs_ifork_t		*ifp,
	int			nrecs,
	xfs_exntfmt_t		fmt)
{
	xfs_bmbt_irec_t		irec;
	xfs_bmbt_rec_host_t	rec;
	int			i;

	for (i = 0; i < nrecs; i++) {
		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
		rec.l0 = get_unaligned(&ep->l0);
		rec.l1 = get_unaligned(&ep->l1);
		xfs_bmbt_get_all(&rec, &irec);
		if (fmt == XFS_EXTFMT_NOSTATE)
			ASSERT(irec.br_state == XFS_EXT_NORM);
	}
}
#else /* DEBUG */
#define xfs_validate_extents(ifp, nrecs, fmt)
#endif /* DEBUG */

/*
 * Check that none of the inodes in the buffer have a next
 * unlinked field of 0.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
	xfs_mount_t	*mp,
	xfs_buf_t	*bp)
{
	int		i;
	int		j;
	xfs_dinode_t	*dip;

	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

	for (i = 0; i < j; i++) {
		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					i * mp->m_sb.sb_inodesize);
		if (!dip->di_next_unlinked) {
			xfs_alert(mp,
	"Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
				bp);
			ASSERT(dip->di_next_unlinked);
		}
	}
}
#endif

/*
 * Find the buffer associated with the given inode map.
 * We do basic validation checks on the buffer once it has been
 * retrieved from disk.
 */
STATIC int
xfs_imap_to_bp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	struct xfs_imap	*imap,
	xfs_buf_t	**bpp,
	uint		buf_flags,
	uint		iget_flags)
{
	int		error;
	int		i;
	int		ni;
	xfs_buf_t	*bp;

	buf_flags |= XBF_UNMAPPED;
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
				   (int)imap->im_len, buf_flags, &bp);
	if (error) {
		if (error != EAGAIN) {
			xfs_warn(mp,
				"%s: xfs_trans_read_buf() returned error %d.",
				__func__, error);
		} else {
			ASSERT(buf_flags & XBF_TRYLOCK);
		}
		return error;
	}

	/*
	 * Validate the magic number and version of every inode in the buffer
	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
	 */
#ifdef DEBUG
	ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
#else	/* usual case */
	ni = 1;
#endif

	for (i = 0; i < ni; i++) {
		int		di_ok;
		xfs_dinode_t	*dip;

		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					(i << mp->m_sb.sb_inodelog));
		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
			    XFS_DINODE_GOOD_VERSION(dip->di_version);
		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
						XFS_ERRTAG_ITOBP_INOTOBP,
						XFS_RANDOM_ITOBP_INOTOBP))) {
			if (iget_flags & XFS_IGET_UNTRUSTED) {
				xfs_trans_brelse(tp, bp);
				return XFS_ERROR(EINVAL);
			}
			XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
						XFS_ERRLEVEL_HIGH, mp, dip);
#ifdef DEBUG
			xfs_emerg(mp,
				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
				(unsigned long long)imap->im_blkno, i,
				be16_to_cpu(dip->di_magic));
			ASSERT(0);
#endif
			xfs_trans_brelse(tp, bp);
			return XFS_ERROR(EFSCORRUPTED);
		}
	}

	xfs_inobp_check(mp, bp);
	*bpp = bp;
	return 0;
}
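
/*
 * Sketch of a hypothetical non-blocking caller (not from this file): with
 * XBF_TRYLOCK in buf_flags, a contended buffer lock surfaces as EAGAIN
 * instead of sleeping on the lock, so the caller backs off and retries.
 */
#if 0
	struct xfs_buf	*bp;
	int		error;

	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &bp, XBF_TRYLOCK, 0);
	if (error == EAGAIN)
		return error;	/* couldn't get the lock; try again later */
	if (error)
		return error;	/* real read or validation failure */
#endif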

/*
 * This routine is called to map an inode number within a file
 * system to the buffer containing the on-disk version of the
 * inode.  It returns a pointer to the buffer containing the
 * on-disk inode in the bpp parameter, and in the dip parameter
 * it returns a pointer to the on-disk inode within that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
 *
 * Use xfs_imap() to determine the size and location of the
 * buffer to read from disk.
 */
int
xfs_inotobp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	xfs_dinode_t	**dipp,
	xfs_buf_t	**bpp,
	int		*offset,
	uint		imap_flags)
{
	struct xfs_imap	imap;
	xfs_buf_t	*bp;
	int		error;

	imap.im_blkno = 0;
	error = xfs_imap(mp, tp, ino, &imap, imap_flags);
	if (error)
		return error;

	error = xfs_imap_to_bp(mp, tp, &imap, &bp, 0, imap_flags);
	if (error)
		return error;

	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
	*bpp = bp;
	*offset = imap.im_boffset;
	return 0;
}


/*
 * This routine is called to map an inode to the buffer containing
 * the on-disk version of the inode.  It returns a pointer to the
 * buffer containing the on-disk inode in the bpp parameter, and in
 * the dip parameter it returns a pointer to the on-disk inode within
 * that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and
 * dipp are undefined.
 *
 * The inode is expected to have already been mapped to its buffer and
 * read in once, so we can use the mapping information stored in the
 * inode rather than calling xfs_imap().  This allows us to avoid the
 * overhead of looking at the inode btree for small block file systems
 * (see xfs_imap()).
 */
int
xfs_itobp(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	xfs_dinode_t	**dipp,
	xfs_buf_t	**bpp,
	uint		buf_flags)
{
	xfs_buf_t	*bp;
	int		error;

	ASSERT(ip->i_imap.im_blkno != 0);

	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
	if (error)
		return error;

	if (!bp) {
		ASSERT(buf_flags & XBF_TRYLOCK);
		ASSERT(tp == NULL);
		*bpp = NULL;
		return EAGAIN;
	}

	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
	*bpp = bp;
	return 0;
}
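
/*
 * Typical usage pattern (a sketch; compare xfs_iunlink() below): the dinode
 * pointer returned in dipp aliases the buffer returned in bpp, so after
 * modifying the on-disk inode the caller logs the changed byte range of
 * that same buffer.
 */
#if 0
	xfs_dinode_t	*dip;
	xfs_buf_t	*bp;
	int		offset;
	int		error;

	error = xfs_itobp(mp, tp, ip, &dip, &bp, 0);
	if (error)
		return error;
	dip->di_next_unlinked = cpu_to_be32(NULLAGINO);	/* modify in place */
	offset = ip->i_imap.im_boffset +
		offsetof(xfs_dinode_t, di_next_unlinked);
	xfs_trans_log_buf(tp, bp, offset,
			  offset + sizeof(xfs_agino_t) - 1);
#endif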

/*
 * Move inode type and inode format specific information from the
 * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 * this means set if_rdev to the proper value.  For files, directories,
 * and symlinks this means to bring in the in-line data or extent
 * pointers.  For a file in B-tree format, only the root is immediately
 * brought in-core.  The rest will be in-lined in if_extents when it
 * is first referenced (see xfs_iread_extents()).
 */
STATIC int
xfs_iformat(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip)
{
	xfs_attr_shortform_t	*atp;
	int			size;
	int			error = 0;
	xfs_fsize_t		di_size;

	if (unlikely(be32_to_cpu(dip->di_nextents) +
		     be16_to_cpu(dip->di_anextents) >
		     be64_to_cpu(dip->di_nblocks))) {
		xfs_warn(ip->i_mount,
			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
			(unsigned long long)ip->i_ino,
			(int)(be32_to_cpu(dip->di_nextents) +
			      be16_to_cpu(dip->di_anextents)),
			(unsigned long long)
				be64_to_cpu(dip->di_nblocks));
		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
		xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
			(unsigned long long)ip->i_ino,
			dip->di_forkoff);
		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
		     !ip->i_mount->m_rtdev_targp)) {
		xfs_warn(ip->i_mount,
			"corrupt dinode %Lu, has realtime flag set.",
			ip->i_ino);
		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	switch (ip->i_d.di_mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
					     ip->i_mount, dip);
			return XFS_ERROR(EFSCORRUPTED);
		}
		ip->i_d.di_size = 0;
		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
		break;

	case S_IFREG:
	case S_IFLNK:
	case S_IFDIR:
		switch (dip->di_format) {
		case XFS_DINODE_FMT_LOCAL:
			/*
			 * no local regular files yet
			 */
			if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
				xfs_warn(ip->i_mount,
			"corrupt inode %Lu (local format for regular file).",
					(unsigned long long) ip->i_ino);
				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			di_size = be64_to_cpu(dip->di_size);
			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
				xfs_warn(ip->i_mount,
			"corrupt inode %Lu (bad size %Ld for local inode).",
					(unsigned long long) ip->i_ino,
					(long long) di_size);
				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			size = (int)di_size;
			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
			break;
		case XFS_DINODE_FMT_EXTENTS:
			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
			break;
		case XFS_DINODE_FMT_BTREE:
			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
			break;
		default:
			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
					 ip->i_mount);
			return XFS_ERROR(EFSCORRUPTED);
		}
		break;

	default:
		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	if (error) {
		return error;
	}
	if (!XFS_DFORK_Q(dip))
		return 0;

	ASSERT(ip->i_afp == NULL);
	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);

	switch (dip->di_aformat) {
	case XFS_DINODE_FMT_LOCAL:
		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
		size = be16_to_cpu(atp->hdr.totsize);

		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
			xfs_warn(ip->i_mount,
				"corrupt inode %Lu (bad attr fork size %Ld).",
				(unsigned long long) ip->i_ino,
				(long long) size);
			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
					     XFS_ERRLEVEL_LOW,
					     ip->i_mount, dip);
			return XFS_ERROR(EFSCORRUPTED);
		}

		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
		break;
	case XFS_DINODE_FMT_BTREE:
		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
		break;
	default:
		error = XFS_ERROR(EFSCORRUPTED);
		break;
	}
	if (error) {
		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
		ip->i_afp = NULL;
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
	}
	return error;
}

/*
 * The file is in-lined in the on-disk inode.
 * If it fits into if_inline_data, then copy
 * it there, otherwise allocate a buffer for it
 * and copy the data there.  Either way, set
 * if_data to point at the data.
 * If we allocate a buffer for the data, make
 * sure that its size is a multiple of 4 and
 * record the real size in if_real_bytes.
 */
STATIC int
xfs_iformat_local(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork,
	int		size)
{
	xfs_ifork_t	*ifp;
	int		real_size;

	/*
	 * If the size is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_warn(ip->i_mount,
	"corrupt inode %Lu (bad size %d for local fork, size = %d).",
			(unsigned long long) ip->i_ino, size,
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}
	ifp = XFS_IFORK_PTR(ip, whichfork);
	real_size = 0;
	if (size == 0)
		ifp->if_u1.if_data = NULL;
	else if (size <= sizeof(ifp->if_u2.if_inline_data))
		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
	else {
		real_size = roundup(size, 4);
		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
	}
	ifp->if_bytes = size;
	ifp->if_real_bytes = real_size;
	if (size)
		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFINLINE;
	return 0;
}
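
/*
 * Worked example (illustrative numbers, not from this file): an 11-byte
 * symlink target stored in local format that does not fit in
 * if_inline_data ends up with if_bytes == 11 but a roundup(11, 4) == 12
 * byte heap buffer recorded in if_real_bytes, so the logged region always
 * ends on a word boundary; targets of up to sizeof(if_inline_data) bytes
 * avoid the allocation entirely.
 */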

/*
 * The file consists of a set of extents all
 * of which fit into the on-disk inode.
 * If there are few enough extents to fit into
 * the if_inline_ext, then copy them there.
 * Otherwise allocate a buffer for them and copy
 * them into it.  Either way, set if_extents
 * to point at the extents.
 */
STATIC int
xfs_iformat_extents(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork)
{
	xfs_bmbt_rec_t	*dp;
	xfs_ifork_t	*ifp;
	int		nex;
	int		size;
	int		i;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
	size = nex * (uint)sizeof(xfs_bmbt_rec_t);

	/*
	 * If the number of extents is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
			(unsigned long long) ip->i_ino, nex);
		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_real_bytes = 0;
	if (nex == 0)
		ifp->if_u1.if_extents = NULL;
	else if (nex <= XFS_INLINE_EXTS)
		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
	else
		xfs_iext_add(ifp, 0, nex);

	ifp->if_bytes = size;
	if (size) {
		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
		for (i = 0; i < nex; i++, dp++) {
			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
			ep->l0 = get_unaligned_be64(&dp->l0);
			ep->l1 = get_unaligned_be64(&dp->l1);
		}
		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
		if (whichfork != XFS_DATA_FORK ||
			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
				if (unlikely(xfs_check_nostate_extents(
				    ifp, 0, nex))) {
					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
							 XFS_ERRLEVEL_LOW,
							 ip->i_mount);
					return XFS_ERROR(EFSCORRUPTED);
				}
	}
	ifp->if_flags |= XFS_IFEXTENTS;
	return 0;
}

/*
 * The file has too many extents to fit into
 * the inode, so they are in B-tree format.
 * Allocate a buffer for the root of the B-tree
 * and copy the root into it.  The if_extents
 * field will remain NULL until all of the
 * extents are read in (when they are needed).
 */
STATIC int
xfs_iformat_btree(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip,
	int			whichfork)
{
	xfs_bmdr_block_t	*dfp;
	xfs_ifork_t		*ifp;
	/* REFERENCED */
	int			nrecs;
	int			size;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
	size = XFS_BMAP_BROOT_SPACE(dfp);
	nrecs = be16_to_cpu(dfp->bb_numrecs);

	/*
	 * blow out if -- fork has fewer extents than can fit in
	 * fork (fork shouldn't be a btree format), root btree
	 * block has more records than can fit into the fork,
	 * or the number of extents is greater than the number of
	 * blocks.
	 */
	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
			XFS_IFORK_MAXEXT(ip, whichfork) ||
		     XFS_BMDR_SPACE_CALC(nrecs) >
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
		xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
			(unsigned long long) ip->i_ino);
		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_broot_bytes = size;
	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
	ASSERT(ifp->if_broot != NULL);
	/*
	 * Copy and convert from the on-disk structure
	 * to the in-memory structure.
	 */
	xfs_bmdr_to_bmbt(ip->i_mount, dfp,
			 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
			 ifp->if_broot, size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFBROOT;

	return 0;
}

STATIC void
xfs_dinode_from_disk(
	xfs_icdinode_t	*to,
	xfs_dinode_t	*from)
{
	to->di_magic = be16_to_cpu(from->di_magic);
	to->di_mode = be16_to_cpu(from->di_mode);
	to->di_version = from->di_version;
	to->di_format = from->di_format;
	to->di_onlink = be16_to_cpu(from->di_onlink);
	to->di_uid = be32_to_cpu(from->di_uid);
	to->di_gid = be32_to_cpu(from->di_gid);
	to->di_nlink = be32_to_cpu(from->di_nlink);
	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = be16_to_cpu(from->di_flushiter);
	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
	to->di_size = be64_to_cpu(from->di_size);
	to->di_nblocks = be64_to_cpu(from->di_nblocks);
	to->di_extsize = be32_to_cpu(from->di_extsize);
	to->di_nextents = be32_to_cpu(from->di_nextents);
	to->di_anextents = be16_to_cpu(from->di_anextents);
	to->di_forkoff = from->di_forkoff;
	to->di_aformat = from->di_aformat;
	to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
	to->di_dmstate = be16_to_cpu(from->di_dmstate);
	to->di_flags = be16_to_cpu(from->di_flags);
	to->di_gen = be32_to_cpu(from->di_gen);
}

void
xfs_dinode_to_disk(
	xfs_dinode_t	*to,
	xfs_icdinode_t	*from)
{
	to->di_magic = cpu_to_be16(from->di_magic);
	to->di_mode = cpu_to_be16(from->di_mode);
	to->di_version = from->di_version;
	to->di_format = from->di_format;
	to->di_onlink = cpu_to_be16(from->di_onlink);
	to->di_uid = cpu_to_be32(from->di_uid);
	to->di_gid = cpu_to_be32(from->di_gid);
	to->di_nlink = cpu_to_be32(from->di_nlink);
	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = cpu_to_be16(from->di_flushiter);
	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
	to->di_size = cpu_to_be64(from->di_size);
	to->di_nblocks = cpu_to_be64(from->di_nblocks);
	to->di_extsize = cpu_to_be32(from->di_extsize);
	to->di_nextents = cpu_to_be32(from->di_nextents);
	to->di_anextents = cpu_to_be16(from->di_anextents);
	to->di_forkoff = from->di_forkoff;
	to->di_aformat = from->di_aformat;
	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
	to->di_dmstate = cpu_to_be16(from->di_dmstate);
	to->di_flags = cpu_to_be16(from->di_flags);
	to->di_gen = cpu_to_be32(from->di_gen);
}
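
/*
 * xfs_dinode_from_disk() and xfs_dinode_to_disk() must remain exact
 * inverses over the inode core, or a read-modify-flush cycle would
 * silently corrupt fields.  A hypothetical debug-only round-trip check
 * (a sketch, not existing code) could assert that:
 */
#if 0
	xfs_icdinode_t	icd;
	xfs_dinode_t	tmp;

	xfs_dinode_from_disk(&icd, dip);
	xfs_dinode_to_disk(&tmp, &icd);
	/* the converted core fields end where di_next_unlinked begins */
	ASSERT(memcmp(&tmp, dip,
		      offsetof(xfs_dinode_t, di_next_unlinked)) == 0);
#endif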

STATIC uint
_xfs_dic2xflags(
	__uint16_t		di_flags)
{
	uint			flags = 0;

	if (di_flags & XFS_DIFLAG_ANY) {
		if (di_flags & XFS_DIFLAG_REALTIME)
			flags |= XFS_XFLAG_REALTIME;
		if (di_flags & XFS_DIFLAG_PREALLOC)
			flags |= XFS_XFLAG_PREALLOC;
		if (di_flags & XFS_DIFLAG_IMMUTABLE)
			flags |= XFS_XFLAG_IMMUTABLE;
		if (di_flags & XFS_DIFLAG_APPEND)
			flags |= XFS_XFLAG_APPEND;
		if (di_flags & XFS_DIFLAG_SYNC)
			flags |= XFS_XFLAG_SYNC;
		if (di_flags & XFS_DIFLAG_NOATIME)
			flags |= XFS_XFLAG_NOATIME;
		if (di_flags & XFS_DIFLAG_NODUMP)
			flags |= XFS_XFLAG_NODUMP;
		if (di_flags & XFS_DIFLAG_RTINHERIT)
			flags |= XFS_XFLAG_RTINHERIT;
		if (di_flags & XFS_DIFLAG_PROJINHERIT)
			flags |= XFS_XFLAG_PROJINHERIT;
		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
			flags |= XFS_XFLAG_NOSYMLINKS;
		if (di_flags & XFS_DIFLAG_EXTSIZE)
			flags |= XFS_XFLAG_EXTSIZE;
		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= XFS_XFLAG_EXTSZINHERIT;
		if (di_flags & XFS_DIFLAG_NODEFRAG)
			flags |= XFS_XFLAG_NODEFRAG;
		if (di_flags & XFS_DIFLAG_FILESTREAM)
			flags |= XFS_XFLAG_FILESTREAM;
	}

	return flags;
}

uint
xfs_ip2xflags(
	xfs_inode_t		*ip)
{
	xfs_icdinode_t		*dic = &ip->i_d;

	return _xfs_dic2xflags(dic->di_flags) |
				(XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
	xfs_dinode_t		*dip)
{
	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}

/*
 * Read the disk inode attributes into the in-core inode structure.
 */
int
xfs_iread(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	uint		iget_flags)
{
	xfs_buf_t	*bp;
	xfs_dinode_t	*dip;
	int		error;

	/*
	 * Fill in the location information in the in-core inode.
	 */
	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
	if (error)
		return error;

	/*
	 * Get pointers to the on-disk inode and the buffer containing it.
	 */
	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, 0, iget_flags);
	if (error)
		return error;
	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);

	/*
	 * If we got something that isn't an inode it means someone
	 * (nfs or dmi) has a stale handle.
	 */
	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
#ifdef DEBUG
		xfs_alert(mp,
			"%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
			__func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
#endif /* DEBUG */
		error = XFS_ERROR(EINVAL);
		goto out_brelse;
	}

	/*
	 * If the on-disk inode is already linked to a directory
	 * entry, copy all of the inode into the in-core inode.
	 * xfs_iformat() handles copying in the inode format
	 * specific information.
	 * Otherwise, just get the truly permanent information.
	 */
	if (dip->di_mode) {
		xfs_dinode_from_disk(&ip->i_d, dip);
		error = xfs_iformat(ip, dip);
		if (error) {
#ifdef DEBUG
			xfs_alert(mp, "%s: xfs_iformat() returned error %d",
				__func__, error);
#endif /* DEBUG */
			goto out_brelse;
		}
	} else {
		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
		ip->i_d.di_version = dip->di_version;
		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
		/*
		 * Make sure to pull in the mode here as well in
		 * case the inode is released without being used.
		 * This ensures that xfs_inactive() will see that
		 * the inode is already free and not try to mess
		 * with the uninitialized part of it.
		 */
		ip->i_d.di_mode = 0;
	}

	/*
	 * The inode format changed when we moved the link count and
	 * made it 32 bits long.  If this is an old format inode,
	 * convert it in memory to look like a new one.  If it gets
	 * flushed to disk we will convert back before flushing or
	 * logging it.  We zero out the new projid field and the old link
	 * count field.  We'll handle clearing the pad field (the remains
	 * of the old uuid field) when we actually convert the inode to
	 * the new format. We don't change the version number so that we
	 * can distinguish this from a real new format inode.
	 */
	if (ip->i_d.di_version == 1) {
		ip->i_d.di_nlink = ip->i_d.di_onlink;
		ip->i_d.di_onlink = 0;
		xfs_set_projid(ip, 0);
	}

	ip->i_delayed_blks = 0;

	/*
	 * Mark the buffer containing the inode as something to keep
	 * around for a while.  This helps to keep recently accessed
	 * meta-data in-core longer.
	 */
	xfs_buf_set_ref(bp, XFS_INO_REF);

	/*
	 * Use xfs_trans_brelse() to release the buffer containing the
	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
	 * in xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
	 * will only release the buffer if it is not dirty within the
	 * transaction.  It will be OK to release the buffer in this case,
	 * because inodes on disk are never destroyed and we will be
	 * locking the new in-core inode before putting it in the hash
	 * table where other processes can find it.  Thus we don't have
	 * to worry about the inode being changed just because we released
	 * the buffer.
	 */
out_brelse:
	xfs_trans_brelse(tp, bp);
	return error;
}

/*
 * Read in extents from a btree-format inode.
 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 */
int
xfs_iread_extents(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	int		whichfork)
{
	int		error;
	xfs_ifork_t	*ifp;
	xfs_extnum_t	nextents;

	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
				 ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
	ifp = XFS_IFORK_PTR(ip, whichfork);

	/*
	 * We know that the size is valid (it's checked in iformat_btree)
	 */
	ifp->if_bytes = ifp->if_real_bytes = 0;
	ifp->if_flags |= XFS_IFEXTENTS;
	xfs_iext_add(ifp, 0, nextents);
	error = xfs_bmap_read_extents(tp, ip, whichfork);
	if (error) {
		xfs_iext_destroy(ifp);
		ifp->if_flags &= ~XFS_IFEXTENTS;
		return error;
	}
	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
	return 0;
}
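
/*
 * Callers fault the extent list in lazily, on first use; a sketch of the
 * usual idiom in the bmap code:
 */
#if 0
	xfs_ifork_t	*ifp = XFS_IFORK_PTR(ip, whichfork);

	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
		error = xfs_iread_extents(tp, ip, whichfork);
		if (error)
			return error;
	}
#endif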

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget()
 * to obtain the in-core version of the allocated inode.  Finally,
 * fill in the inode and log its initial contents.  In this case,
 * ialloc_context would be set to NULL and call_again set to false.
 *
 * If xfs_dialloc() does not have an available inode,
 * it will replenish its supply by doing an allocation. Since we can
 * only do one allocation within a transaction without deadlocks, we
 * must commit the current transaction before returning the inode itself.
 * In this case, therefore, we will set call_again to true and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
int
xfs_ialloc(
	xfs_trans_t	*tp,
	xfs_inode_t	*pip,
	umode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	prid_t		prid,
	int		okalloc,
	xfs_buf_t	**ialloc_context,
	boolean_t	*call_again,
	xfs_inode_t	**ipp)
{
	xfs_ino_t	ino;
	xfs_inode_t	*ip;
	uint		flags;
	int		error;
	timespec_t	tv;
	int		filestreams = 0;

	/*
	 * Call the space management code to pick
	 * the on-disk inode to be allocated.
	 */
	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
			    ialloc_context, call_again, &ino);
	if (error)
		return error;
	if (*call_again || ino == NULLFSINO) {
		*ipp = NULL;
		return 0;
	}
	ASSERT(*ialloc_context == NULL);

	/*
	 * Get the in-core inode with the lock held exclusively.
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
			 XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	ASSERT(ip != NULL);

	ip->i_d.di_mode = mode;
	ip->i_d.di_onlink = 0;
	ip->i_d.di_nlink = nlink;
	ASSERT(ip->i_d.di_nlink == nlink);
	ip->i_d.di_uid = current_fsuid();
	ip->i_d.di_gid = current_fsgid();
	xfs_set_projid(ip, prid);
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

	/*
	 * If the superblock version is up to where we support new format
	 * inodes and this is currently an old format inode, then change
	 * the inode version number now.  This way we only do the conversion
	 * here rather than here and in the flush/logging code.
	 */
	if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
	    ip->i_d.di_version == 1) {
		ip->i_d.di_version = 2;
		/*
		 * We've already zeroed the old link count, the projid field,
		 * and the pad field.
		 */
	}

	/*
	 * Project ids won't be stored on disk if we are using a version 1
	 * inode, so bump the version to 2 when a nonzero projid is requested.
	 */
	if ((prid != 0) && (ip->i_d.di_version == 1))
		xfs_bump_ino_vers2(tp, ip);

	if (pip && XFS_INHERIT_GID(pip)) {
		ip->i_d.di_gid = pip->i_d.di_gid;
		if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
			ip->i_d.di_mode |= S_ISGID;
		}
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if ((irix_sgid_inherit) &&
	    (ip->i_d.di_mode & S_ISGID) &&
	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
		ip->i_d.di_mode &= ~S_ISGID;
	}

	ip->i_d.di_size = 0;
	ip->i_d.di_nextents = 0;
	ASSERT(ip->i_d.di_nblocks == 0);

	nanotime(&tv);
	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
	ip->i_d.di_atime = ip->i_d.di_mtime;
	ip->i_d.di_ctime = ip->i_d.di_mtime;

	/*
	 * di_gen will have been taken care of in xfs_iread.
	 */
	ip->i_d.di_extsize = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;
	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
		ip->i_df.if_u2.if_rdev = rdev;
		ip->i_df.if_flags = 0;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
		/*
		 * we can't set up filestreams until after the VFS inode
		 * is set up properly.
		 */
		if (pip && xfs_inode_is_filestream(pip))
			filestreams = 1;
		/* fall through */
	case S_IFDIR:
		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
			uint	di_flags = 0;

			if (S_ISDIR(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			} else if (S_ISREG(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_REALTIME;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSIZE;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			}
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
			    xfs_inherit_noatime)
				di_flags |= XFS_DIFLAG_NOATIME;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
			    xfs_inherit_nodump)
				di_flags |= XFS_DIFLAG_NODUMP;
			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
			    xfs_inherit_sync)
				di_flags |= XFS_DIFLAG_SYNC;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
			    xfs_inherit_nosymlinks)
				di_flags |= XFS_DIFLAG_NOSYMLINKS;
			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
			    xfs_inherit_nodefrag)
				di_flags |= XFS_DIFLAG_NODEFRAG;
			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
				di_flags |= XFS_DIFLAG_FILESTREAM;
			ip->i_d.di_flags |= di_flags;
		}
		/* FALLTHROUGH */
	case S_IFLNK:
		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_flags = XFS_IFEXTENTS;
		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
		ip->i_df.if_u1.if_extents = NULL;
		break;
	default:
		ASSERT(0);
	}
	/*
	 * Attribute fork settings for new inode.
	 */
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_anextents = 0;

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup inode ops and unlock */
	xfs_setup_inode(ip);

	/* now we have set up the vfs inode we can associate the filestream */
	if (filestreams) {
		error = xfs_filestream_associate(pip, ip);
		if (error < 0)
			return -error;
		if (!error)
			xfs_iflags_set(ip, XFS_IFILESTREAM);
	}

	*ipp = ip;
	return 0;
}
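
/*
 * Sketch of the two-phase caller protocol described above (compare
 * xfs_dir_ialloc() in xfs_utils.c); the transaction-rolling details are
 * elided:
 */
#if 0
	xfs_buf_t	*ialloc_context = NULL;
	boolean_t	call_again = B_FALSE;
	xfs_inode_t	*ip;
	int		error;

	error = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
			   &ialloc_context, &call_again, &ip);
	if (!error && call_again) {
		/*
		 * xfs_dialloc() had to allocate new inode chunks.  Commit
		 * the current transaction and start a fresh one while
		 * ialloc_context keeps the AGI freelist buffer held and
		 * locked across the commit, then retry the allocation.
		 */

		/* ...commit tp, reserve a new tp, rejoin dp... */

		error = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
				   &ialloc_context, &call_again, &ip);
	}
#endif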

/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fsize_t		new_size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp = *tpp;
	struct xfs_trans	*ntp;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	xfs_fileoff_t		first_unmap_block;
	xfs_fileoff_t		last_block;
	xfs_filblks_t		unmap_len;
	int			committed;
	int			error = 0;
	int			done = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
	ASSERT(new_size <= XFS_ISIZE(ip));
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	trace_xfs_itruncate_extents_start(ip, new_size);

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.  If the first block to be removed is
	 * beyond the maximum file size (ie it is the same as last_block),
	 * then there is nothing to do.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
	if (first_unmap_block == last_block)
		return 0;

	ASSERT(first_unmap_block < last_block);
	unmap_len = last_block - first_unmap_block + 1;
	while (!done) {
		xfs_bmap_init(&free_list, &first_block);
		error = xfs_bunmapi(tp, ip,
				    first_unmap_block, unmap_len,
				    xfs_bmapi_aflag(whichfork),
				    XFS_ITRUNC_MAX_EXTENTS,
				    &first_block, &free_list,
				    &done);
		if (error)
			goto out_bmap_cancel;

		/*
		 * Duplicate the transaction that has the permanent
		 * reservation and commit the old transaction.
		 */
		error = xfs_bmap_finish(&tp, &free_list, &committed);
		if (committed)
			xfs_trans_ijoin(tp, ip, 0);
		if (error)
			goto out_bmap_cancel;

		if (committed) {
			/*
			 * Mark the inode dirty so it will be logged and
			 * moved forward in the log as part of every commit.
			 */
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		}

		ntp = xfs_trans_dup(tp);
		error = xfs_trans_commit(tp, 0);
		tp = ntp;

		xfs_trans_ijoin(tp, ip, 0);

		if (error)
			goto out;

		/*
		 * Transaction commit worked ok so we can drop the extra ticket
		 * reference that we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(tp->t_ticket);
		error = xfs_trans_reserve(tp, 0,
					XFS_ITRUNCATE_LOG_RES(mp), 0,
					XFS_TRANS_PERM_LOG_RES,
					XFS_ITRUNCATE_LOG_COUNT);
		if (error)
			goto out;
	}

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_extents_end(ip, new_size);

out:
	*tpp = tp;
	return error;
out_bmap_cancel:
	/*
	 * If the bunmapi call encounters an error, return to the caller where
	 * the transaction can be properly aborted.  We just need to make sure
	 * we're not holding any resources that we were not when we came in.
	 */
	xfs_bmap_cancel(&free_list);
	goto out;
}
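
/*
 * A hypothetical caller sketch of the contract above: whether or not an
 * error comes back, *tpp is valid and the inode is still locked and joined,
 * so the caller either commits or cancels that transaction.
 */
#if 0
	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
	if (error)
		goto out_trans_cancel;	/* cancel tp; ip is still joined */

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
#endif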

/*
 * This is called when the inode's link count goes to 0.
 * We place the on-disk inode on a list in the AGI.  It
 * will be pulled from this list when the inode is freed.
 */
int
xfs_iunlink(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_agi_t	*agi;
	xfs_dinode_t	*dip;
	xfs_buf_t	*agibp;
	xfs_buf_t	*ibp;
	xfs_agino_t	agino;
	short		bucket_index;
	int		offset;
	int		error;

	ASSERT(ip->i_d.di_nlink == 0);
	ASSERT(ip->i_d.di_mode != 0);

	mp = tp->t_mountp;

	/*
	 * Get the agi buffer first.  It ensures lock ordering
	 * on the list.
	 */
	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
	if (error)
		return error;
	agi = XFS_BUF_TO_AGI(agibp);

	/*
	 * Get the index into the agi hash table for the
	 * list this inode will go on.
	 */
	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	ASSERT(agino != 0);
	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
	ASSERT(agi->agi_unlinked[bucket_index]);
	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);

	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
		/*
		 * There is already another inode in the bucket we need
		 * to add ourselves to.  Add us at the front of the list.
		 * Here we put the head pointer into our next pointer,
		 * and then we fall through to point the head at us.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
		if (error)
			return error;

		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
		offset = ip->i_imap.im_boffset +
			offsetof(xfs_dinode_t, di_next_unlinked);
		xfs_trans_inode_buf(tp, ibp);
		xfs_trans_log_buf(tp, ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, ibp);
	}

	/*
	 * Point the bucket head pointer at the inode being inserted.
	 */
	ASSERT(agino != 0);
	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
	offset = offsetof(xfs_agi_t, agi_unlinked) +
		(sizeof(xfs_agino_t) * bucket_index);
	xfs_trans_log_buf(tp, agibp, offset,
			  (offset + sizeof(xfs_agino_t) - 1));
	return 0;
}
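
/*
 * The on-disk result is a singly linked list per AGI hash bucket, with new
 * entries pushed at the head:
 *
 *	agi_unlinked[bucket] -> ino C -> ino B -> ino A -> NULLAGINO
 *
 * where each arrow after the bucket head is the di_next_unlinked field of
 * the previous on-disk inode.  xfs_iunlink_remove() below walks this chain
 * to unlink an arbitrary entry.
 */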

/*
 * Pull the on-disk inode from the AGI unlinked list.
 */
STATIC int
xfs_iunlink_remove(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_ino_t	next_ino;
	xfs_mount_t	*mp;
	xfs_agi_t	*agi;
	xfs_dinode_t	*dip;
	xfs_buf_t	*agibp;
	xfs_buf_t	*ibp;
	xfs_agnumber_t	agno;
	xfs_agino_t	agino;
	xfs_agino_t	next_agino;
	xfs_buf_t	*last_ibp;
	xfs_dinode_t	*last_dip = NULL;
	short		bucket_index;
	int		offset, last_offset = 0;
	int		error;

	mp = tp->t_mountp;
	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);

	/*
	 * Get the agi buffer first.  It ensures lock ordering
	 * on the list.
	 */
	error = xfs_read_agi(mp, tp, agno, &agibp);
	if (error)
		return error;

	agi = XFS_BUF_TO_AGI(agibp);

	/*
	 * Get the index into the agi hash table for the
	 * list this inode will go on.
	 */
	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	ASSERT(agino != 0);
	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
	ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
	ASSERT(agi->agi_unlinked[bucket_index]);

	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
		/*
		 * We're at the head of the list.  Get the inode's
		 * on-disk buffer to see if there is anyone after us
		 * on the list.  Only modify our next pointer if it
		 * is not already NULLAGINO.  This saves us the overhead
		 * of dealing with the buffer when there is no need to
		 * change it.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
		if (error) {
			xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
				__func__, error);
			return error;
		}
		next_agino = be32_to_cpu(dip->di_next_unlinked);
		ASSERT(next_agino != 0);
		if (next_agino != NULLAGINO) {
			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
			offset = ip->i_imap.im_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the bucket head pointer at the next inode.
		 */
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
		offset = offsetof(xfs_agi_t, agi_unlinked) +
			(sizeof(xfs_agino_t) * bucket_index);
		xfs_trans_log_buf(tp, agibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
	} else {
		/*
		 * We need to search the list for the inode being freed.
		 */
		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
		last_ibp = NULL;
		while (next_agino != agino) {
			/*
			 * If the last inode wasn't the one pointing to
			 * us, then release its buffer since we're not
			 * going to do anything with it.
			 */
			if (last_ibp != NULL) {
				xfs_trans_brelse(tp, last_ibp);
			}
			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
			error = xfs_inotobp(mp, tp, next_ino, &last_dip,
					    &last_ibp, &last_offset, 0);
			if (error) {
				xfs_warn(mp,
					"%s: xfs_inotobp() returned error %d.",
					__func__, error);
				return error;
			}
			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
			ASSERT(next_agino != NULLAGINO);
			ASSERT(next_agino != 0);
		}
		/*
		 * Now last_ibp points to the buffer previous to us on
		 * the unlinked list.  Pull us from the list.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
		if (error) {
			xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
				__func__, error);
			return error;
		}
		next_agino = be32_to_cpu(dip->di_next_unlinked);
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		if (next_agino != NULLAGINO) {
			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
			offset = ip->i_imap.im_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the previous inode on the list to the next inode.
		 */
		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
		ASSERT(next_agino != 0);
		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
		xfs_trans_inode_buf(tp, last_ibp);
		xfs_trans_log_buf(tp, last_ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, last_ibp);
	}
	return 0;
}

/*
 * A big issue when freeing the inode cluster is that we _cannot_ skip any
 * inodes that are in memory - they all must be marked stale and attached to
 * the cluster buffer.
 */
STATIC int
xfs_ifree_cluster(
	xfs_inode_t	*free_ip,
	xfs_trans_t	*tp,
	xfs_ino_t	inum)
{
	xfs_mount_t		*mp = free_ip->i_mount;
	int			blks_per_cluster;
	int			nbufs;
	int			ninodes;
	int			i, j;
	xfs_daddr_t		blkno;
	xfs_buf_t		*bp;
	xfs_inode_t		*ip;
	xfs_inode_log_item_t	*iip;
	xfs_log_item_t		*lip;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
		blks_per_cluster = 1;
		ninodes = mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp);
	} else {
		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
					mp->m_sb.sb_blocksize;
		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
	}

	for (j = 0; j < nbufs; j++, inum += ninodes) {
		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
					 XFS_INO_TO_AGBNO(mp, inum));

		/*
		 * We obtain and lock the backing buffer first in the process
		 * here, as we have to ensure that any dirty inode that we
		 * can't get the flush lock on is attached to the buffer.
		 * If we scan the in-memory inodes first, then buffer IO can
		 * complete before we get a lock on it, and hence we may fail
		 * to mark all the active inodes on the buffer stale.
		 */
		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
					mp->m_bsize * blks_per_cluster, 0);

		if (!bp)
			return ENOMEM;
		/*
		 * Walk the inodes already attached to the buffer and mark them
		 * stale.  These will all have the flush locks held, so an
		 * in-memory inode walk can't lock them.  By marking them all
		 * stale first, we will not attempt to lock them in the loop
		 * below as the XFS_ISTALE flag will be set.
		 */
		lip = bp->b_fspriv;
		while (lip) {
			if (lip->li_type == XFS_LI_INODE) {
				iip = (xfs_inode_log_item_t *)lip;
				ASSERT(iip->ili_logged == 1);
				lip->li_cb = xfs_istale_done;
				xfs_trans_ail_copy_lsn(mp->m_ail,
							&iip->ili_flush_lsn,
							&iip->ili_item.li_lsn);
				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
			}
			lip = lip->li_bio_list;
		}


		/*
		 * For each inode in memory attempt to add it to the inode
		 * buffer and set it up for being staled on buffer IO
		 * completion.  This is safe as we've locked out tail pushing
		 * and flushing by locking the buffer.
		 *
		 * We have already marked every inode that was part of a
		 * transaction stale above, which means there is no point in
		 * even trying to lock them.
		 */
		for (i = 0; i < ninodes; i++) {
retry:
			rcu_read_lock();
			ip = radix_tree_lookup(&pag->pag_ici_root,
					XFS_INO_TO_AGINO(mp, (inum + i)));

			/* Inode not in memory, nothing to do */
			if (!ip) {
				rcu_read_unlock();
				continue;
			}

			/*
			 * because this is an RCU protected lookup, we could
			 * find a recently freed or even reallocated inode
			 * during the lookup. We need to check under the
			 * i_flags_lock for a valid inode here. Skip it if it
			 * is not valid, the wrong inode or stale.
			 */
			spin_lock(&ip->i_flags_lock);
			if (ip->i_ino != inum + i ||
			    __xfs_iflags_test(ip, XFS_ISTALE)) {
				spin_unlock(&ip->i_flags_lock);
				rcu_read_unlock();
				continue;
			}
			spin_unlock(&ip->i_flags_lock);

			/*
			 * Don't try to lock/unlock the current inode, but we
			 * _cannot_ skip the other inodes that we did not find
			 * in the list attached to the buffer and are not
			 * already marked stale. If we can't lock it, back off
			 * and retry.
			 */
			if (ip != free_ip &&
			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
				rcu_read_unlock();
				delay(1);
				goto retry;
			}
			rcu_read_unlock();

			xfs_iflock(ip);
			xfs_iflags_set(ip, XFS_ISTALE);

			/*
			 * we don't need to attach clean inodes or those only
			 * with unlogged changes (which we throw away, anyway).
			 */
			iip = ip->i_itemp;
			if (!iip || xfs_inode_clean(ip)) {
				ASSERT(ip != free_ip);
				xfs_ifunlock(ip);
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				continue;
			}

			iip->ili_last_fields = iip->ili_fields;
			iip->ili_fields = 0;
			iip->ili_logged = 1;
			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
						&iip->ili_item.li_lsn);

			xfs_buf_attach_iodone(bp, xfs_istale_done,
						  &iip->ili_item);

			if (ip != free_ip)
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

		xfs_trans_stale_inode_buf(tp, bp);
		xfs_trans_binval(tp, bp);
	}

	xfs_perag_put(pag);
	return 0;
}

/*
 * This is called to return an inode to the inode free list.
 * The inode should already be truncated to 0 length and have
 * no pages associated with it.  This routine also assumes that
 * the inode is already a part of the transaction.
 *
 * The on-disk copy of the inode will have been added to the list
 * of unlinked inodes in the AGI. We need to remove the inode from
 * that list atomically with respect to freeing it here.
 */
int
xfs_ifree(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	xfs_bmap_free_t	*flist)
{
	int		error;
	int		delete;
	xfs_ino_t	first_ino;
	xfs_dinode_t	*dip;
	xfs_buf_t	*ibp;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(ip->i_d.di_nlink == 0);
	ASSERT(ip->i_d.di_nextents == 0);
	ASSERT(ip->i_d.di_anextents == 0);
	ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
	ASSERT(ip->i_d.di_nblocks == 0);

	/*
	 * Pull the on-disk inode from the AGI unlinked list.
	 */
	error = xfs_iunlink_remove(tp, ip);
	if (error != 0) {
		return error;
	}

	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
	if (error != 0) {
		return error;
	}
	ip->i_d.di_mode = 0;		/* mark incore inode as free */
	ip->i_d.di_flags = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	/*
	 * Bump the generation count so no one will be confused
	 * by reincarnations of this inode.
	 */
	ip->i_d.di_gen++;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0);
	if (error)
		return error;

	/*
	 * Clear the on-disk di_mode.  This is to prevent xfs_bulkstat
	 * from picking up this inode when it is reclaimed (its incore state
	 * initialized but not flushed to disk yet).  The in-core di_mode is
	 * already cleared and a corresponding transaction logged.
	 * The hack here just synchronizes the in-core to on-disk
	 * di_mode value in advance before the actual inode sync to disk.
	 * This is OK because the inode is already unlinked and would never
	 * change its di_mode again for this inode generation.
	 * This is a temporary hack that would require a proper fix
	 * in the future.
	 */
	dip->di_mode = 0;

	if (delete) {
		error = xfs_ifree_cluster(ip, tp, first_ino);
	}

	return error;
}

/*
 * Reallocate the space for if_broot based on the number of records
 * being added or deleted as indicated in rec_diff.  Move the records
 * and pointers in if_broot to fit the new size.  When shrinking this
 * will eliminate holes between the records and pointers created by
 * the caller.  When growing this will create holes to be filled in
 * by the caller.
 *
 * The caller must not request to add more records than would fit in
 * the on-disk inode root.  If the if_broot is currently NULL, then
 * if we are adding records, one will be allocated.  The caller must also
 * not request that the number of records go below zero, although
 * it can go to zero.
 *
 * ip -- the inode whose if_broot area is changing
 * rec_diff -- the change in the number of records, positive or negative,
 *	 requested for the if_broot array.
 */
void
xfs_iroot_realloc(
	xfs_inode_t		*ip,
	int			rec_diff,
	int			whichfork)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			cur_max;
	xfs_ifork_t		*ifp;
	struct xfs_btree_block	*new_broot;
	int			new_max;
	size_t			new_size;
	char			*np;
	char			*op;

	/*
	 * Handle the degenerate case quietly.
	 */
	if (rec_diff == 0) {
		return;
	}

	ifp = XFS_IFORK_PTR(ip, whichfork);
	if (rec_diff > 0) {
		/*
		 * If there wasn't any memory allocated before, just
		 * allocate it now and get out.
		 */
		if (ifp->if_broot_bytes == 0) {
			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
			ifp->if_broot_bytes = (int)new_size;
			return;
		}

		/*
		 * If there is already an existing if_broot, then we need
		 * to realloc() it and shift the pointers to their new
		 * location.  The records don't change location because
		 * they are kept butted up against the btree block header.
		 */
		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
		new_max = cur_max + rec_diff;
		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
				KM_SLEEP | KM_NOFS);
		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     ifp->if_broot_bytes);
		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     (int)new_size);
		ifp->if_broot_bytes = (int)new_size;
		ASSERT(ifp->if_broot_bytes <=
			XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
		return;
	}

	/*
	 * rec_diff is less than 0.  In this case, we are shrinking the
	 * if_broot buffer.  It must already exist.  If we go to zero
	 * records, just get rid of the root and clear the status bit.
	 */
	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
	new_max = cur_max + rec_diff;
	ASSERT(new_max >= 0);
	if (new_max > 0)
		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
	else
		new_size = 0;
	if (new_size > 0) {
		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
		/*
		 * First copy over the btree block header.
		 */
		memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
	} else {
		new_broot = NULL;
		ifp->if_flags &= ~XFS_IFBROOT;
	}

	/*
	 * Only copy the records and pointers if there are any.
	 */
	if (new_max > 0) {
		/*
		 * First copy the records.
		 */
		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));

		/*
		 * Then copy the pointers.
		 */
		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     ifp->if_broot_bytes);
		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
						     (int)new_size);
		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
	}
	kmem_free(ifp->if_broot);
	ifp->if_broot = new_broot;
	ifp->if_broot_bytes = (int)new_size;
	ASSERT(ifp->if_broot_bytes <=
		XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
	return;
}
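
/*
 * Layout sketch for the growing case above: the record array stays butted
 * up against the block header, so only the pointer array has to move:
 *
 *	before:	[ header | rec0 .. recN | ptr0 .. ptrN ]
 *	after:	[ header | rec0 .. recN | <hole> | ptr0 .. ptrN ]
 *
 * The memmove() above shifts the pointers from op to np; the gap opened
 * between the records and the relocated pointers is for the caller to
 * fill with the new records and pointers.
 */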
1857 */ 1858 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 1859 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); 1860 new_max = cur_max + rec_diff; 1861 ASSERT(new_max >= 0); 1862 if (new_max > 0) 1863 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 1864 else 1865 new_size = 0; 1866 if (new_size > 0) { 1867 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); 1868 /* 1869 * First copy over the btree block header. 1870 */ 1871 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN); 1872 } else { 1873 new_broot = NULL; 1874 ifp->if_flags &= ~XFS_IFBROOT; 1875 } 1876 1877 /* 1878 * Only copy the records and pointers if there are any. 1879 */ 1880 if (new_max > 0) { 1881 /* 1882 * First copy the records. 1883 */ 1884 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); 1885 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); 1886 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 1887 1888 /* 1889 * Then copy the pointers. 1890 */ 1891 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 1892 ifp->if_broot_bytes); 1893 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, 1894 (int)new_size); 1895 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 1896 } 1897 kmem_free(ifp->if_broot); 1898 ifp->if_broot = new_broot; 1899 ifp->if_broot_bytes = (int)new_size; 1900 ASSERT(ifp->if_broot_bytes <= 1901 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 1902 return; 1903 } 1904 1905 1906 /* 1907 * This is called when the amount of space needed for if_data 1908 * is increased or decreased. The change in size is indicated by 1909 * the number of bytes that need to be added or deleted in the 1910 * byte_diff parameter. 1911 * 1912 * If the amount of space needed has decreased below the size of the 1913 * inline buffer, then switch to using the inline buffer. Otherwise, 1914 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer 1915 * to what is needed. 1916 * 1917 * ip -- the inode whose if_data area is changing 1918 * byte_diff -- the change in the number of bytes, positive or negative, 1919 * requested for the if_data array. 1920 */ 1921 void 1922 xfs_idata_realloc( 1923 xfs_inode_t *ip, 1924 int byte_diff, 1925 int whichfork) 1926 { 1927 xfs_ifork_t *ifp; 1928 int new_size; 1929 int real_size; 1930 1931 if (byte_diff == 0) { 1932 return; 1933 } 1934 1935 ifp = XFS_IFORK_PTR(ip, whichfork); 1936 new_size = (int)ifp->if_bytes + byte_diff; 1937 ASSERT(new_size >= 0); 1938 1939 if (new_size == 0) { 1940 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 1941 kmem_free(ifp->if_u1.if_data); 1942 } 1943 ifp->if_u1.if_data = NULL; 1944 real_size = 0; 1945 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { 1946 /* 1947 * If the valid extents/data can fit in if_inline_ext/data, 1948 * copy them from the malloc'd vector and free it. 1949 */ 1950 if (ifp->if_u1.if_data == NULL) { 1951 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 1952 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 1953 ASSERT(ifp->if_real_bytes != 0); 1954 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, 1955 new_size); 1956 kmem_free(ifp->if_u1.if_data); 1957 ifp->if_u1.if_data = ifp->if_u2.if_inline_data; 1958 } 1959 real_size = 0; 1960 } else { 1961 /* 1962 * Stuck with malloc/realloc. 1963 * For inline data, the underlying buffer must be 1964 * a multiple of 4 bytes in size so that it can be 1965 * logged and stay on word boundaries. We enforce 1966 * that here. 
1967 */ 1968 real_size = roundup(new_size, 4); 1969 if (ifp->if_u1.if_data == NULL) { 1970 ASSERT(ifp->if_real_bytes == 0); 1971 ifp->if_u1.if_data = kmem_alloc(real_size, 1972 KM_SLEEP | KM_NOFS); 1973 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { 1974 /* 1975 * Only do the realloc if the underlying size 1976 * is really changing. 1977 */ 1978 if (ifp->if_real_bytes != real_size) { 1979 ifp->if_u1.if_data = 1980 kmem_realloc(ifp->if_u1.if_data, 1981 real_size, 1982 ifp->if_real_bytes, 1983 KM_SLEEP | KM_NOFS); 1984 } 1985 } else { 1986 ASSERT(ifp->if_real_bytes == 0); 1987 ifp->if_u1.if_data = kmem_alloc(real_size, 1988 KM_SLEEP | KM_NOFS); 1989 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, 1990 ifp->if_bytes); 1991 } 1992 } 1993 ifp->if_real_bytes = real_size; 1994 ifp->if_bytes = new_size; 1995 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 1996 } 1997 1998 void 1999 xfs_idestroy_fork( 2000 xfs_inode_t *ip, 2001 int whichfork) 2002 { 2003 xfs_ifork_t *ifp; 2004 2005 ifp = XFS_IFORK_PTR(ip, whichfork); 2006 if (ifp->if_broot != NULL) { 2007 kmem_free(ifp->if_broot); 2008 ifp->if_broot = NULL; 2009 } 2010 2011 /* 2012 * If the format is local, then we can't have an extents 2013 * array so just look for an inline data array. If we're 2014 * not local then we may or may not have an extents list, 2015 * so check and free it up if we do. 2016 */ 2017 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 2018 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && 2019 (ifp->if_u1.if_data != NULL)) { 2020 ASSERT(ifp->if_real_bytes != 0); 2021 kmem_free(ifp->if_u1.if_data); 2022 ifp->if_u1.if_data = NULL; 2023 ifp->if_real_bytes = 0; 2024 } 2025 } else if ((ifp->if_flags & XFS_IFEXTENTS) && 2026 ((ifp->if_flags & XFS_IFEXTIREC) || 2027 ((ifp->if_u1.if_extents != NULL) && 2028 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { 2029 ASSERT(ifp->if_real_bytes != 0); 2030 xfs_iext_destroy(ifp); 2031 } 2032 ASSERT(ifp->if_u1.if_extents == NULL || 2033 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); 2034 ASSERT(ifp->if_real_bytes == 0); 2035 if (whichfork == XFS_ATTR_FORK) { 2036 kmem_zone_free(xfs_ifork_zone, ip->i_afp); 2037 ip->i_afp = NULL; 2038 } 2039 } 2040 2041 /* 2042 * This is called to unpin an inode. The caller must have the inode locked 2043 * in at least shared mode so that the buffer cannot be subsequently pinned 2044 * once someone is waiting for it to be unpinned. 
2045 */ 2046 static void 2047 xfs_iunpin( 2048 struct xfs_inode *ip) 2049 { 2050 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2051 2052 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2053 2054 /* Give the log a push to start the unpinning I/O */ 2055 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); 2056 2057 } 2058 2059 static void 2060 __xfs_iunpin_wait( 2061 struct xfs_inode *ip) 2062 { 2063 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); 2064 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); 2065 2066 xfs_iunpin(ip); 2067 2068 do { 2069 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 2070 if (xfs_ipincount(ip)) 2071 io_schedule(); 2072 } while (xfs_ipincount(ip)); 2073 finish_wait(wq, &wait.wait); 2074 } 2075 2076 void 2077 xfs_iunpin_wait( 2078 struct xfs_inode *ip) 2079 { 2080 if (xfs_ipincount(ip)) 2081 __xfs_iunpin_wait(ip); 2082 } 2083 2084 /* 2085 * xfs_iextents_copy() 2086 * 2087 * This is called to copy the REAL extents (as opposed to the delayed 2088 * allocation extents) from the inode into the given buffer. It 2089 * returns the number of bytes copied into the buffer. 2090 * 2091 * Delayed allocation extents are never written to disk, so we 2092 * examine each extent in turn and skip those which are delayed. 2093 * Only the real extents end up in the buffer. 2094 */ 2095 int 2096 xfs_iextents_copy( 2097 xfs_inode_t *ip, 2098 xfs_bmbt_rec_t *dp, 2099 int whichfork) 2100 { 2101 int copied; 2102 int i; 2103 xfs_ifork_t *ifp; 2104 int nrecs; 2105 xfs_fsblock_t start_block; 2106 2107 ifp = XFS_IFORK_PTR(ip, whichfork); 2108 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2109 ASSERT(ifp->if_bytes > 0); 2110 2111 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2112 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); 2113 ASSERT(nrecs > 0); 2114 2115 /* 2116 * Copy the extents one at a time, skipping the delayed 2117 * allocation extents. There must be at least one 2118 * real (non-delayed) extent, so the resulting buffer 2119 * is never empty. 2120 */ 2121 copied = 0; 2122 for (i = 0; i < nrecs; i++) { 2123 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 2124 start_block = xfs_bmbt_get_startblock(ep); 2125 if (isnullstartblock(start_block)) { 2126 /* 2127 * It's a delayed allocation extent, so skip it. 2128 */ 2129 continue; 2130 } 2131 2132 /* Translate to on disk format */ 2133 put_unaligned(cpu_to_be64(ep->l0), &dp->l0); 2134 put_unaligned(cpu_to_be64(ep->l1), &dp->l1); 2135 dp++; 2136 copied++; 2137 } 2138 ASSERT(copied != 0); 2139 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); 2140 2141 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2142 } 2143 2144 /* 2145 * Each of the following cases stores data into the same region 2146 * of the on-disk inode, so only one of them can be valid at 2147 * any given time. While it is possible to have conflicting formats 2148 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is 2149 * in EXTENTS format, this can only happen when the fork has 2150 * changed formats after being modified but before being flushed. 2151 * In these cases, the format always takes precedence, because the 2152 * format indicates the current state of the fork.
2153 */ 2154 /*ARGSUSED*/ 2155 STATIC void 2156 xfs_iflush_fork( 2157 xfs_inode_t *ip, 2158 xfs_dinode_t *dip, 2159 xfs_inode_log_item_t *iip, 2160 int whichfork, 2161 xfs_buf_t *bp) 2162 { 2163 char *cp; 2164 xfs_ifork_t *ifp; 2165 xfs_mount_t *mp; 2166 #ifdef XFS_TRANS_DEBUG 2167 int first; 2168 #endif 2169 static const short brootflag[2] = 2170 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; 2171 static const short dataflag[2] = 2172 { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; 2173 static const short extflag[2] = 2174 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2175 2176 if (!iip) 2177 return; 2178 ifp = XFS_IFORK_PTR(ip, whichfork); 2179 /* 2180 * This can happen if we gave up in iformat in an error path, 2181 * for the attribute fork. 2182 */ 2183 if (!ifp) { 2184 ASSERT(whichfork == XFS_ATTR_FORK); 2185 return; 2186 } 2187 cp = XFS_DFORK_PTR(dip, whichfork); 2188 mp = ip->i_mount; 2189 switch (XFS_IFORK_FORMAT(ip, whichfork)) { 2190 case XFS_DINODE_FMT_LOCAL: 2191 if ((iip->ili_fields & dataflag[whichfork]) && 2192 (ifp->if_bytes > 0)) { 2193 ASSERT(ifp->if_u1.if_data != NULL); 2194 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2195 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); 2196 } 2197 break; 2198 2199 case XFS_DINODE_FMT_EXTENTS: 2200 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2201 !(iip->ili_fields & extflag[whichfork])); 2202 if ((iip->ili_fields & extflag[whichfork]) && 2203 (ifp->if_bytes > 0)) { 2204 ASSERT(xfs_iext_get_ext(ifp, 0)); 2205 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2206 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2207 whichfork); 2208 } 2209 break; 2210 2211 case XFS_DINODE_FMT_BTREE: 2212 if ((iip->ili_fields & brootflag[whichfork]) && 2213 (ifp->if_broot_bytes > 0)) { 2214 ASSERT(ifp->if_broot != NULL); 2215 ASSERT(ifp->if_broot_bytes <= 2216 (XFS_IFORK_SIZE(ip, whichfork) + 2217 XFS_BROOT_SIZE_ADJ)); 2218 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2219 (xfs_bmdr_block_t *)cp, 2220 XFS_DFORK_SIZE(dip, mp, whichfork)); 2221 } 2222 break; 2223 2224 case XFS_DINODE_FMT_DEV: 2225 if (iip->ili_fields & XFS_ILOG_DEV) { 2226 ASSERT(whichfork == XFS_DATA_FORK); 2227 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); 2228 } 2229 break; 2230 2231 case XFS_DINODE_FMT_UUID: 2232 if (iip->ili_fields & XFS_ILOG_UUID) { 2233 ASSERT(whichfork == XFS_DATA_FORK); 2234 memcpy(XFS_DFORK_DPTR(dip), 2235 &ip->i_df.if_u2.if_uuid, 2236 sizeof(uuid_t)); 2237 } 2238 break; 2239 2240 default: 2241 ASSERT(0); 2242 break; 2243 } 2244 } 2245 2246 STATIC int 2247 xfs_iflush_cluster( 2248 xfs_inode_t *ip, 2249 xfs_buf_t *bp) 2250 { 2251 xfs_mount_t *mp = ip->i_mount; 2252 struct xfs_perag *pag; 2253 unsigned long first_index, mask; 2254 unsigned long inodes_per_cluster; 2255 int ilist_size; 2256 xfs_inode_t **ilist; 2257 xfs_inode_t *iq; 2258 int nr_found; 2259 int clcount = 0; 2260 int bufwasdelwri; 2261 int i; 2262 2263 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2264 2265 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2266 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2267 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2268 if (!ilist) 2269 goto out_put; 2270 2271 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2272 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2273 rcu_read_lock(); 2274 /* really need a gang lookup range call here */ 2275 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, 2276 first_index, inodes_per_cluster); 2277 if (nr_found == 0) 2278 goto out_free; 2279 
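/*
 * Worked example (illustrative; the numbers below assume the common
 * case of an 8k inode cluster with 256 byte inodes, i.e. a
 * sb_inodelog of 8, and are not taken from the code above):
 *
 *	inodes_per_cluster = 8192 >> 8 = 32
 *	mask = ~(32 - 1) = ~0x1f
 *	first_index = agino & mask
 *
 * An inode with agino 70 therefore yields first_index 64, and the
 * gang lookup above returns up to 32 cached inodes with agino >= 64,
 * i.e. the inodes that can share this cluster buffer. Each candidate
 * must still be revalidated under its i_flags_lock in the loop below,
 * because the lookup is only RCU protected.
 */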
2280 for (i = 0; i < nr_found; i++) { 2281 iq = ilist[i]; 2282 if (iq == ip) 2283 continue; 2284 2285 /* 2286 * Because this is an RCU protected lookup, we could find a 2287 * recently freed or even reallocated inode during the lookup. 2288 * We need to check under the i_flags_lock for a valid inode 2289 * here. Skip it if it is not valid or the wrong inode. 2290 */ 2291 spin_lock(&iq->i_flags_lock); 2292 if (!iq->i_ino || 2293 (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { 2294 spin_unlock(&iq->i_flags_lock); 2295 continue; 2296 } 2297 spin_unlock(&iq->i_flags_lock); 2298 2299 /* 2300 * Do an un-protected check to see if the inode is dirty and 2301 * is a candidate for flushing. These checks will be repeated 2302 * later after the appropriate locks are acquired. 2303 */ 2304 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) 2305 continue; 2306 2307 /* 2308 * Try to get locks. If any are unavailable or it is pinned, 2309 * then this inode cannot be flushed and is skipped. 2310 */ 2311 2312 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) 2313 continue; 2314 if (!xfs_iflock_nowait(iq)) { 2315 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2316 continue; 2317 } 2318 if (xfs_ipincount(iq)) { 2319 xfs_ifunlock(iq); 2320 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2321 continue; 2322 } 2323 2324 /* 2325 * Arriving here means that this inode can be flushed. First 2326 * re-check that it's dirty before flushing. 2327 */ 2328 if (!xfs_inode_clean(iq)) { 2329 int error; 2330 error = xfs_iflush_int(iq, bp); 2331 if (error) { 2332 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2333 goto cluster_corrupt_out; 2334 } 2335 clcount++; 2336 } else { 2337 xfs_ifunlock(iq); 2338 } 2339 xfs_iunlock(iq, XFS_ILOCK_SHARED); 2340 } 2341 2342 if (clcount) { 2343 XFS_STATS_INC(xs_icluster_flushcnt); 2344 XFS_STATS_ADD(xs_icluster_flushinode, clcount); 2345 } 2346 2347 out_free: 2348 rcu_read_unlock(); 2349 kmem_free(ilist); 2350 out_put: 2351 xfs_perag_put(pag); 2352 return 0; 2353 2354 2355 cluster_corrupt_out: 2356 /* 2357 * Corruption detected in the clustering loop. Invalidate the 2358 * inode buffer and shut down the filesystem. 2359 */ 2360 rcu_read_unlock(); 2361 /* 2362 * Clean up the buffer. If it was delwri, just release it -- 2363 * brelse can handle it with no problems. If not, shut down the 2364 * filesystem before releasing the buffer. 2365 */ 2366 bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); 2367 if (bufwasdelwri) 2368 xfs_buf_relse(bp); 2369 2370 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2371 2372 if (!bufwasdelwri) { 2373 /* 2374 * Just like incore_relse: if we have b_iodone functions, 2375 * mark the buffer as an error and call them. Otherwise 2376 * mark it as stale and brelse. 2377 */ 2378 if (bp->b_iodone) { 2379 XFS_BUF_UNDONE(bp); 2380 xfs_buf_stale(bp); 2381 xfs_buf_ioerror(bp, EIO); 2382 xfs_buf_ioend(bp, 0); 2383 } else { 2384 xfs_buf_stale(bp); 2385 xfs_buf_relse(bp); 2386 } 2387 } 2388 2389 /* 2390 * Unlocks the flush lock 2391 */ 2392 xfs_iflush_abort(iq, false); 2393 kmem_free(ilist); 2394 xfs_perag_put(pag); 2395 return XFS_ERROR(EFSCORRUPTED); 2396 } 2397 2398 /* 2399 * Flush dirty inode metadata into the backing buffer. 2400 * 2401 * The caller must have the inode lock and the inode flush lock held. The 2402 * inode lock will still be held upon return to the caller, and the inode 2403 * flush lock will be released after the inode has reached the disk. 2404 * 2405 * The caller must write out the buffer returned in *bpp and release it.
2406 */ 2407 int 2408 xfs_iflush( 2409 struct xfs_inode *ip, 2410 struct xfs_buf **bpp) 2411 { 2412 struct xfs_mount *mp = ip->i_mount; 2413 struct xfs_buf *bp; 2414 struct xfs_dinode *dip; 2415 int error; 2416 2417 XFS_STATS_INC(xs_iflush_count); 2418 2419 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2420 ASSERT(xfs_isiflocked(ip)); 2421 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2422 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 2423 2424 *bpp = NULL; 2425 2426 xfs_iunpin_wait(ip); 2427 2428 /* 2429 * For stale inodes we cannot rely on the backing buffer remaining 2430 * stale in cache for the remaining life of the stale inode and so 2431 * xfs_itobp() below may give us a buffer that no longer contains 2432 * inodes below. We have to check this after ensuring the inode is 2433 * unpinned so that it is safe to reclaim the stale inode after the 2434 * flush call. 2435 */ 2436 if (xfs_iflags_test(ip, XFS_ISTALE)) { 2437 xfs_ifunlock(ip); 2438 return 0; 2439 } 2440 2441 /* 2442 * This may have been unpinned because the filesystem is shutting 2443 * down forcibly. If that's the case we must not write this inode 2444 * to disk, because the log record didn't make it to disk. 2445 * 2446 * We also have to remove the log item from the AIL in this case, 2447 * as we wait for an empty AIL as part of the unmount process. 2448 */ 2449 if (XFS_FORCED_SHUTDOWN(mp)) { 2450 error = XFS_ERROR(EIO); 2451 goto abort_out; 2452 } 2453 2454 /* 2455 * Get the buffer containing the on-disk inode. 2456 */ 2457 error = xfs_itobp(mp, NULL, ip, &dip, &bp, XBF_TRYLOCK); 2458 if (error || !bp) { 2459 xfs_ifunlock(ip); 2460 return error; 2461 } 2462 2463 /* 2464 * First flush out the inode that xfs_iflush was called with. 2465 */ 2466 error = xfs_iflush_int(ip, bp); 2467 if (error) 2468 goto corrupt_out; 2469 2470 /* 2471 * If the buffer is pinned then push on the log now so we won't 2472 * get stuck waiting in the write for too long. 
2473 */ 2474 if (xfs_buf_ispinned(bp)) 2475 xfs_log_force(mp, 0); 2476 2477 /* 2478 * inode clustering: 2479 * see if other inodes can be gathered into this write 2480 */ 2481 error = xfs_iflush_cluster(ip, bp); 2482 if (error) 2483 goto cluster_corrupt_out; 2484 2485 *bpp = bp; 2486 return 0; 2487 2488 corrupt_out: 2489 xfs_buf_relse(bp); 2490 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 2491 cluster_corrupt_out: 2492 error = XFS_ERROR(EFSCORRUPTED); 2493 abort_out: 2494 /* 2495 * Unlocks the flush lock 2496 */ 2497 xfs_iflush_abort(ip, false); 2498 return error; 2499 } 2500 2501 2502 STATIC int 2503 xfs_iflush_int( 2504 xfs_inode_t *ip, 2505 xfs_buf_t *bp) 2506 { 2507 xfs_inode_log_item_t *iip; 2508 xfs_dinode_t *dip; 2509 xfs_mount_t *mp; 2510 #ifdef XFS_TRANS_DEBUG 2511 int first; 2512 #endif 2513 2514 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2515 ASSERT(xfs_isiflocked(ip)); 2516 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2517 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 2518 2519 iip = ip->i_itemp; 2520 mp = ip->i_mount; 2521 2522 /* set *dip = inode's place in the buffer */ 2523 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); 2524 2525 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), 2526 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2527 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2528 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2529 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2530 goto corrupt_out; 2531 } 2532 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2533 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2534 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2535 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2536 __func__, ip->i_ino, ip, ip->i_d.di_magic); 2537 goto corrupt_out; 2538 } 2539 if (S_ISREG(ip->i_d.di_mode)) { 2540 if (XFS_TEST_ERROR( 2541 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2542 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2543 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2544 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2545 "%s: Bad regular inode %Lu, ptr 0x%p", 2546 __func__, ip->i_ino, ip); 2547 goto corrupt_out; 2548 } 2549 } else if (S_ISDIR(ip->i_d.di_mode)) { 2550 if (XFS_TEST_ERROR( 2551 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2552 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2553 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2554 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2555 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2556 "%s: Bad directory inode %Lu, ptr 0x%p", 2557 __func__, ip->i_ino, ip); 2558 goto corrupt_out; 2559 } 2560 } 2561 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2562 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2563 XFS_RANDOM_IFLUSH_5)) { 2564 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2565 "%s: detected corrupt incore inode %Lu, " 2566 "total extents = %d, nblocks = %Ld, ptr 0x%p", 2567 __func__, ip->i_ino, 2568 ip->i_d.di_nextents + ip->i_d.di_anextents, 2569 ip->i_d.di_nblocks, ip); 2570 goto corrupt_out; 2571 } 2572 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2573 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2574 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 2575 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2576 __func__, ip->i_ino, ip->i_d.di_forkoff, ip); 2577 goto corrupt_out; 2578 } 2579 /* 2580 * bump the flush iteration count, used to detect flushes which 2581 * postdate a log record during recovery. 
2582 */ 2583 2584 ip->i_d.di_flushiter++; 2585 2586 /* 2587 * Copy the dirty parts of the inode into the on-disk 2588 * inode. We always copy out the core of the inode, 2589 * because if the inode is dirty at all the core must 2590 * be. 2591 */ 2592 xfs_dinode_to_disk(dip, &ip->i_d); 2593 2594 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 2595 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 2596 ip->i_d.di_flushiter = 0; 2597 2598 /* 2599 * If this is really an old format inode and the superblock version 2600 * has not been updated to support only new format inodes, then 2601 * convert back to the old inode format. If the superblock version 2602 * has been updated, then make the conversion permanent. 2603 */ 2604 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); 2605 if (ip->i_d.di_version == 1) { 2606 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 2607 /* 2608 * Convert it back. 2609 */ 2610 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 2611 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink); 2612 } else { 2613 /* 2614 * The superblock version has already been bumped, 2615 * so just make the conversion to the new inode 2616 * format permanent. 2617 */ 2618 ip->i_d.di_version = 2; 2619 dip->di_version = 2; 2620 ip->i_d.di_onlink = 0; 2621 dip->di_onlink = 0; 2622 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 2623 memset(&(dip->di_pad[0]), 0, 2624 sizeof(dip->di_pad)); 2625 ASSERT(xfs_get_projid(ip) == 0); 2626 } 2627 } 2628 2629 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp); 2630 if (XFS_IFORK_Q(ip)) 2631 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); 2632 xfs_inobp_check(mp, bp); 2633 2634 /* 2635 * We've recorded everything logged in the inode, so we'd like to clear 2636 * the ili_fields bits so we don't log and flush things unnecessarily. 2637 * However, we can't stop logging all this information until the data 2638 * we've copied into the disk buffer is written to disk. If we did we 2639 * might overwrite the copy of the inode in the log with all the data 2640 * after re-logging only part of it, and in the face of a crash we 2641 * wouldn't have all the data we need to recover. 2642 * 2643 * What we do is move the bits to the ili_last_fields field. When 2644 * logging the inode, these bits are moved back to the ili_fields field. 2645 * In the xfs_iflush_done() routine we clear ili_last_fields, since we 2646 * know that the information those bits represent is permanently on 2647 * disk. As long as the flush completes before the inode is logged 2648 * again, then both ili_fields and ili_last_fields will be cleared. 2649 * 2650 * We can play with the ili_fields bits here, because the inode lock 2651 * must be held exclusively in order to set bits there and the flush 2652 * lock protects the ili_last_fields bits. Set ili_logged so the flush 2653 * done routine can tell whether or not to look in the AIL. Also, store 2654 * the current LSN of the inode so that we can tell whether the item has 2655 * moved in the AIL from xfs_iflush_done(). In order to read the lsn we 2656 * need the AIL lock, because it is a 64 bit value that cannot be read 2657 * atomically. 2658 */ 2659 if (iip != NULL && iip->ili_fields != 0) { 2660 iip->ili_last_fields = iip->ili_fields; 2661 iip->ili_fields = 0; 2662 iip->ili_logged = 1; 2663 2664 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 2665 &iip->ili_item.li_lsn); 2666 2667 /* 2668 * Attach the function xfs_iflush_done to the inode's 2669 * buffer. 
This will remove the inode from the AIL 2670 * and unlock the inode's flush lock when the inode is 2671 * completely written to disk. 2672 */ 2673 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); 2674 2675 ASSERT(bp->b_fspriv != NULL); 2676 ASSERT(bp->b_iodone != NULL); 2677 } else { 2678 /* 2679 * We're flushing an inode which is not in the AIL and has 2680 * not been logged. For this case we can immediately drop 2681 * the inode flush lock because we can avoid the whole 2682 * AIL state thing. It's OK to drop the flush lock now, 2683 * because we've already locked the buffer and to do anything 2684 * you really need both. 2685 */ 2686 if (iip != NULL) { 2687 ASSERT(iip->ili_logged == 0); 2688 ASSERT(iip->ili_last_fields == 0); 2689 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); 2690 } 2691 xfs_ifunlock(ip); 2692 } 2693 2694 return 0; 2695 2696 corrupt_out: 2697 return XFS_ERROR(EFSCORRUPTED); 2698 } 2699 2700 /* 2701 * Return a pointer to the extent record at file index idx. 2702 */ 2703 xfs_bmbt_rec_host_t * 2704 xfs_iext_get_ext( 2705 xfs_ifork_t *ifp, /* inode fork pointer */ 2706 xfs_extnum_t idx) /* index of target extent */ 2707 { 2708 ASSERT(idx >= 0); 2709 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); 2710 2711 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 2712 return ifp->if_u1.if_ext_irec->er_extbuf; 2713 } else if (ifp->if_flags & XFS_IFEXTIREC) { 2714 xfs_ext_irec_t *erp; /* irec pointer */ 2715 int erp_idx = 0; /* irec index */ 2716 xfs_extnum_t page_idx = idx; /* ext index in target list */ 2717 2718 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 2719 return &erp->er_extbuf[page_idx]; 2720 } else if (ifp->if_bytes) { 2721 return &ifp->if_u1.if_extents[idx]; 2722 } else { 2723 return NULL; 2724 } 2725 } 2726 2727 /* 2728 * Insert new item(s) into the extent records for incore inode 2729 * fork 'ifp'. 'count' new items are inserted at index 'idx'. 2730 */ 2731 void 2732 xfs_iext_insert( 2733 xfs_inode_t *ip, /* incore inode pointer */ 2734 xfs_extnum_t idx, /* starting index of new items */ 2735 xfs_extnum_t count, /* number of inserted items */ 2736 xfs_bmbt_irec_t *new, /* items to insert */ 2737 int state) /* type of extent conversion */ 2738 { 2739 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 2740 xfs_extnum_t i; /* extent record index */ 2741 2742 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); 2743 2744 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 2745 xfs_iext_add(ifp, idx, count); 2746 for (i = idx; i < idx + count; i++, new++) 2747 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); 2748 } 2749 2750 /* 2751 * This is called when the amount of space required for incore file 2752 * extents needs to be increased. The ext_diff parameter stores the 2753 * number of new extents being added and the idx parameter contains 2754 * the extent index where the new extents will be added. If the new 2755 * extents are being appended, then we just need to (re)allocate and 2756 * initialize the space. Otherwise, if the new extents are being 2757 * inserted into the middle of the existing entries, a bit more work 2758 * is required to make room for the new extents to be inserted. The 2759 * caller is responsible for filling in the new extent entries upon 2760 * return. 
2761 */ 2762 void 2763 xfs_iext_add( 2764 xfs_ifork_t *ifp, /* inode fork pointer */ 2765 xfs_extnum_t idx, /* index to begin adding exts */ 2766 int ext_diff) /* number of extents to add */ 2767 { 2768 int byte_diff; /* new bytes being added */ 2769 int new_size; /* size of extents after adding */ 2770 xfs_extnum_t nextents; /* number of extents in file */ 2771 2772 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2773 ASSERT((idx >= 0) && (idx <= nextents)); 2774 byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); 2775 new_size = ifp->if_bytes + byte_diff; 2776 /* 2777 * If the new number of extents (nextents + ext_diff) 2778 * fits inside the inode, then continue to use the inline 2779 * extent buffer. 2780 */ 2781 if (nextents + ext_diff <= XFS_INLINE_EXTS) { 2782 if (idx < nextents) { 2783 memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], 2784 &ifp->if_u2.if_inline_ext[idx], 2785 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 2786 memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); 2787 } 2788 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 2789 ifp->if_real_bytes = 0; 2790 } 2791 /* 2792 * Otherwise use a linear (direct) extent list. 2793 * If the extents are currently inside the inode, 2794 * xfs_iext_realloc_direct will switch us from 2795 * inline to direct extent allocation mode. 2796 */ 2797 else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { 2798 xfs_iext_realloc_direct(ifp, new_size); 2799 if (idx < nextents) { 2800 memmove(&ifp->if_u1.if_extents[idx + ext_diff], 2801 &ifp->if_u1.if_extents[idx], 2802 (nextents - idx) * sizeof(xfs_bmbt_rec_t)); 2803 memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); 2804 } 2805 } 2806 /* Indirection array */ 2807 else { 2808 xfs_ext_irec_t *erp; 2809 int erp_idx = 0; 2810 int page_idx = idx; 2811 2812 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); 2813 if (ifp->if_flags & XFS_IFEXTIREC) { 2814 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); 2815 } else { 2816 xfs_iext_irec_init(ifp); 2817 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 2818 erp = ifp->if_u1.if_ext_irec; 2819 } 2820 /* Extents fit in target extent page */ 2821 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { 2822 if (page_idx < erp->er_extcount) { 2823 memmove(&erp->er_extbuf[page_idx + ext_diff], 2824 &erp->er_extbuf[page_idx], 2825 (erp->er_extcount - page_idx) * 2826 sizeof(xfs_bmbt_rec_t)); 2827 memset(&erp->er_extbuf[page_idx], 0, byte_diff); 2828 } 2829 erp->er_extcount += ext_diff; 2830 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 2831 } 2832 /* Insert a new extent page */ 2833 else if (erp) { 2834 xfs_iext_add_indirect_multi(ifp, 2835 erp_idx, page_idx, ext_diff); 2836 } 2837 /* 2838 * If extent(s) are being appended to the last page in 2839 * the indirection array and the new extent(s) don't fit 2840 * in the page, then erp is NULL and erp_idx is set to 2841 * the next index needed in the indirection array. 2842 */ 2843 else { 2844 int count = ext_diff; 2845 2846 while (count) { 2847 erp = xfs_iext_irec_new(ifp, erp_idx); 2848 erp->er_extcount = count; 2849 count -= MIN(count, (int)XFS_LINEAR_EXTS); 2850 if (count) { 2851 erp_idx++; 2852 } 2853 } 2854 } 2855 } 2856 ifp->if_bytes = new_size; 2857 } 2858 2859 /* 2860 * This is called when incore extents are being added to the indirection 2861 * array and the new extents do not fit in the target extent list. The 2862 * erp_idx parameter contains the irec index for the target extent list 2863 * in the indirection array, and the idx parameter contains the extent 2864 * index within the list. 
The number of extents being added is stored 2865 * in the count parameter. 2866 * 2867 * |-------| |-------| 2868 * | | | | idx - number of extents before idx 2869 * | idx | | count | 2870 * | | | | count - number of extents being inserted at idx 2871 * |-------| |-------| 2872 * | count | | nex2 | nex2 - number of extents after idx + count 2873 * |-------| |-------| 2874 */ 2875 void 2876 xfs_iext_add_indirect_multi( 2877 xfs_ifork_t *ifp, /* inode fork pointer */ 2878 int erp_idx, /* target extent irec index */ 2879 xfs_extnum_t idx, /* index within target list */ 2880 int count) /* new extents being added */ 2881 { 2882 int byte_diff; /* new bytes being added */ 2883 xfs_ext_irec_t *erp; /* pointer to irec entry */ 2884 xfs_extnum_t ext_diff; /* number of extents to add */ 2885 xfs_extnum_t ext_cnt; /* new extents still needed */ 2886 xfs_extnum_t nex2; /* extents after idx + count */ 2887 xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ 2888 int nlists; /* number of irec's (lists) */ 2889 2890 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 2891 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 2892 nex2 = erp->er_extcount - idx; 2893 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 2894 2895 /* 2896 * Save second part of target extent list 2897 * (all extents at or after idx). */ 2898 if (nex2) { 2899 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 2900 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); 2901 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); 2902 erp->er_extcount -= nex2; 2903 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); 2904 memset(&erp->er_extbuf[idx], 0, byte_diff); 2905 } 2906 2907 /* 2908 * Add the new extents to the end of the target 2909 * list, then allocate new irec record(s) and 2910 * extent buffer(s) as needed to store the rest 2911 * of the new extents. 2912 */ 2913 ext_cnt = count; 2914 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); 2915 if (ext_diff) { 2916 erp->er_extcount += ext_diff; 2917 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 2918 ext_cnt -= ext_diff; 2919 } 2920 while (ext_cnt) { 2921 erp_idx++; 2922 erp = xfs_iext_irec_new(ifp, erp_idx); 2923 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); 2924 erp->er_extcount = ext_diff; 2925 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); 2926 ext_cnt -= ext_diff; 2927 } 2928 2929 /* Add nex2 extents back to indirection array */ 2930 if (nex2) { 2931 xfs_extnum_t ext_avail; 2932 int i; 2933 2934 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); 2935 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; 2936 i = 0; 2937 /* 2938 * If nex2 extents fit in the current page, append 2939 * nex2_ep after the new extents. 2940 */ 2941 if (nex2 <= ext_avail) { 2942 i = erp->er_extcount; 2943 } 2944 /* 2945 * Otherwise, check if space is available in the 2946 * next page. 2947 */ 2948 else if ((erp_idx < nlists - 1) && 2949 (nex2 <= (ext_avail = XFS_LINEAR_EXTS - 2950 ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { 2951 erp_idx++; 2952 erp++; 2953 /* Create a hole for nex2 extents */ 2954 memmove(&erp->er_extbuf[nex2], erp->er_extbuf, 2955 erp->er_extcount * sizeof(xfs_bmbt_rec_t)); 2956 } 2957 /* 2958 * Final choice, create a new extent page for 2959 * nex2 extents.
2960 */ 2961 else { 2962 erp_idx++; 2963 erp = xfs_iext_irec_new(ifp, erp_idx); 2964 } 2965 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); 2966 kmem_free(nex2_ep); 2967 erp->er_extcount += nex2; 2968 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); 2969 } 2970 } 2971 2972 /* 2973 * This is called when the amount of space required for incore file 2974 * extents needs to be decreased. The ext_diff parameter stores the 2975 * number of extents to be removed and the idx parameter contains 2976 * the extent index where the extents will be removed from. 2977 * 2978 * If the amount of space needed has decreased below the linear 2979 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous 2980 * extent array. Otherwise, use kmem_realloc() to adjust the 2981 * size to what is needed. 2982 */ 2983 void 2984 xfs_iext_remove( 2985 xfs_inode_t *ip, /* incore inode pointer */ 2986 xfs_extnum_t idx, /* index to begin removing exts */ 2987 int ext_diff, /* number of extents to remove */ 2988 int state) /* type of extent conversion */ 2989 { 2990 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; 2991 xfs_extnum_t nextents; /* number of extents in file */ 2992 int new_size; /* size of extents after removal */ 2993 2994 trace_xfs_iext_remove(ip, idx, state, _RET_IP_); 2995 2996 ASSERT(ext_diff > 0); 2997 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2998 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); 2999 3000 if (new_size == 0) { 3001 xfs_iext_destroy(ifp); 3002 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3003 xfs_iext_remove_indirect(ifp, idx, ext_diff); 3004 } else if (ifp->if_real_bytes) { 3005 xfs_iext_remove_direct(ifp, idx, ext_diff); 3006 } else { 3007 xfs_iext_remove_inline(ifp, idx, ext_diff); 3008 } 3009 ifp->if_bytes = new_size; 3010 } 3011 3012 /* 3013 * This removes ext_diff extents from the inline buffer, beginning 3014 * at extent index idx. 3015 */ 3016 void 3017 xfs_iext_remove_inline( 3018 xfs_ifork_t *ifp, /* inode fork pointer */ 3019 xfs_extnum_t idx, /* index to begin removing exts */ 3020 int ext_diff) /* number of extents to remove */ 3021 { 3022 int nextents; /* number of extents in file */ 3023 3024 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3025 ASSERT(idx < XFS_INLINE_EXTS); 3026 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3027 ASSERT(((nextents - ext_diff) > 0) && 3028 (nextents - ext_diff) < XFS_INLINE_EXTS); 3029 3030 if (idx + ext_diff < nextents) { 3031 memmove(&ifp->if_u2.if_inline_ext[idx], 3032 &ifp->if_u2.if_inline_ext[idx + ext_diff], 3033 (nextents - (idx + ext_diff)) * 3034 sizeof(xfs_bmbt_rec_t)); 3035 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], 3036 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3037 } else { 3038 memset(&ifp->if_u2.if_inline_ext[idx], 0, 3039 ext_diff * sizeof(xfs_bmbt_rec_t)); 3040 } 3041 } 3042 3043 /* 3044 * This removes ext_diff extents from a linear (direct) extent list, 3045 * beginning at extent index idx. If the extents are being removed 3046 * from the end of the list (ie. truncate) then we just need to re- 3047 * allocate the list to remove the extra space. Otherwise, if the 3048 * extents are being removed from the middle of the existing extent 3049 * entries, then we first need to move the extent records beginning 3050 * at idx + ext_diff up in the list to overwrite the records being 3051 * removed, then remove the extra space via kmem_realloc. 
3052 */ 3053 void 3054 xfs_iext_remove_direct( 3055 xfs_ifork_t *ifp, /* inode fork pointer */ 3056 xfs_extnum_t idx, /* index to begin removing exts */ 3057 int ext_diff) /* number of extents to remove */ 3058 { 3059 xfs_extnum_t nextents; /* number of extents in file */ 3060 int new_size; /* size of extents after removal */ 3061 3062 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3063 new_size = ifp->if_bytes - 3064 (ext_diff * sizeof(xfs_bmbt_rec_t)); 3065 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3066 3067 if (new_size == 0) { 3068 xfs_iext_destroy(ifp); 3069 return; 3070 } 3071 /* Move extents up in the list (if needed) */ 3072 if (idx + ext_diff < nextents) { 3073 memmove(&ifp->if_u1.if_extents[idx], 3074 &ifp->if_u1.if_extents[idx + ext_diff], 3075 (nextents - (idx + ext_diff)) * 3076 sizeof(xfs_bmbt_rec_t)); 3077 } 3078 memset(&ifp->if_u1.if_extents[nextents - ext_diff], 3079 0, ext_diff * sizeof(xfs_bmbt_rec_t)); 3080 /* 3081 * Reallocate the direct extent list. If the extents 3082 * will fit inside the inode then xfs_iext_realloc_direct 3083 * will switch from direct to inline extent allocation 3084 * mode for us. 3085 */ 3086 xfs_iext_realloc_direct(ifp, new_size); 3087 ifp->if_bytes = new_size; 3088 } 3089 3090 /* 3091 * This is called when incore extents are being removed from the 3092 * indirection array and the extents being removed span multiple extent 3093 * buffers. The idx parameter contains the file extent index where we 3094 * want to begin removing extents, and the count parameter contains 3095 * how many extents need to be removed. 3096 * 3097 * |-------| |-------| 3098 * | nex1 | | | nex1 - number of extents before idx 3099 * |-------| | count | 3100 * | | | | count - number of extents being removed at idx 3101 * | count | |-------| 3102 * | | | nex2 | nex2 - number of extents after idx + count 3103 * |-------| |-------| 3104 */ 3105 void 3106 xfs_iext_remove_indirect( 3107 xfs_ifork_t *ifp, /* inode fork pointer */ 3108 xfs_extnum_t idx, /* index to begin removing extents */ 3109 int count) /* number of extents to remove */ 3110 { 3111 xfs_ext_irec_t *erp; /* indirection array pointer */ 3112 int erp_idx = 0; /* indirection array index */ 3113 xfs_extnum_t ext_cnt; /* extents left to remove */ 3114 xfs_extnum_t ext_diff; /* extents to remove in current list */ 3115 xfs_extnum_t nex1; /* number of extents before idx */ 3116 xfs_extnum_t nex2; /* extents after idx + count */ 3117 int page_idx = idx; /* index in target extent list */ 3118 3119 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3120 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); 3121 ASSERT(erp != NULL); 3122 nex1 = page_idx; 3123 ext_cnt = count; 3124 while (ext_cnt) { 3125 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); 3126 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); 3127 /* 3128 * Check for deletion of entire list; 3129 * xfs_iext_irec_remove() updates extent offsets. 
3130 */ 3131 if (ext_diff == erp->er_extcount) { 3132 xfs_iext_irec_remove(ifp, erp_idx); 3133 ext_cnt -= ext_diff; 3134 nex1 = 0; 3135 if (ext_cnt) { 3136 ASSERT(erp_idx < ifp->if_real_bytes / 3137 XFS_IEXT_BUFSZ); 3138 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3139 nex1 = 0; 3140 continue; 3141 } else { 3142 break; 3143 } 3144 } 3145 /* Move extents up (if needed) */ 3146 if (nex2) { 3147 memmove(&erp->er_extbuf[nex1], 3148 &erp->er_extbuf[nex1 + ext_diff], 3149 nex2 * sizeof(xfs_bmbt_rec_t)); 3150 } 3151 /* Zero out rest of page */ 3152 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - 3153 ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); 3154 /* Update remaining counters */ 3155 erp->er_extcount -= ext_diff; 3156 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); 3157 ext_cnt -= ext_diff; 3158 nex1 = 0; 3159 erp_idx++; 3160 erp++; 3161 } 3162 ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); 3163 xfs_iext_irec_compact(ifp); 3164 } 3165 3166 /* 3167 * Create, destroy, or resize a linear (direct) block of extents. 3168 */ 3169 void 3170 xfs_iext_realloc_direct( 3171 xfs_ifork_t *ifp, /* inode fork pointer */ 3172 int new_size) /* new size of extents */ 3173 { 3174 int rnew_size; /* real new size of extents */ 3175 3176 rnew_size = new_size; 3177 3178 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || 3179 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && 3180 (new_size != ifp->if_real_bytes))); 3181 3182 /* Free extent records */ 3183 if (new_size == 0) { 3184 xfs_iext_destroy(ifp); 3185 } 3186 /* Resize direct extent list and zero any new bytes */ 3187 else if (ifp->if_real_bytes) { 3188 /* Check if extents will fit inside the inode */ 3189 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { 3190 xfs_iext_direct_to_inline(ifp, new_size / 3191 (uint)sizeof(xfs_bmbt_rec_t)); 3192 ifp->if_bytes = new_size; 3193 return; 3194 } 3195 if (!is_power_of_2(new_size)){ 3196 rnew_size = roundup_pow_of_two(new_size); 3197 } 3198 if (rnew_size != ifp->if_real_bytes) { 3199 ifp->if_u1.if_extents = 3200 kmem_realloc(ifp->if_u1.if_extents, 3201 rnew_size, 3202 ifp->if_real_bytes, KM_NOFS); 3203 } 3204 if (rnew_size > ifp->if_real_bytes) { 3205 memset(&ifp->if_u1.if_extents[ifp->if_bytes / 3206 (uint)sizeof(xfs_bmbt_rec_t)], 0, 3207 rnew_size - ifp->if_real_bytes); 3208 } 3209 } 3210 /* 3211 * Switch from the inline extent buffer to a direct 3212 * extent list. Be sure to include the inline extent 3213 * bytes in new_size. 3214 */ 3215 else { 3216 new_size += ifp->if_bytes; 3217 if (!is_power_of_2(new_size)) { 3218 rnew_size = roundup_pow_of_two(new_size); 3219 } 3220 xfs_iext_inline_to_direct(ifp, rnew_size); 3221 } 3222 ifp->if_real_bytes = rnew_size; 3223 ifp->if_bytes = new_size; 3224 } 3225 3226 /* 3227 * Switch from linear (direct) extent records to inline buffer. 3228 */ 3229 void 3230 xfs_iext_direct_to_inline( 3231 xfs_ifork_t *ifp, /* inode fork pointer */ 3232 xfs_extnum_t nextents) /* number of extents in file */ 3233 { 3234 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3235 ASSERT(nextents <= XFS_INLINE_EXTS); 3236 /* 3237 * The inline buffer was zeroed when we switched 3238 * from inline to direct extent allocation mode, 3239 * so we don't need to clear it here. 3240 */ 3241 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, 3242 nextents * sizeof(xfs_bmbt_rec_t)); 3243 kmem_free(ifp->if_u1.if_extents); 3244 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3245 ifp->if_real_bytes = 0; 3246 } 3247 3248 /* 3249 * Switch from inline buffer to linear (direct) extent records. 
3250 * new_size should already be rounded up to the next power of 2 3251 * by the caller (when appropriate), so use new_size as it is. 3252 * However, since new_size may be rounded up, we can't update 3253 * if_bytes here. It is the caller's responsibility to update 3254 * if_bytes upon return. 3255 */ 3256 void 3257 xfs_iext_inline_to_direct( 3258 xfs_ifork_t *ifp, /* inode fork pointer */ 3259 int new_size) /* number of extents in file */ 3260 { 3261 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); 3262 memset(ifp->if_u1.if_extents, 0, new_size); 3263 if (ifp->if_bytes) { 3264 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, 3265 ifp->if_bytes); 3266 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3267 sizeof(xfs_bmbt_rec_t)); 3268 } 3269 ifp->if_real_bytes = new_size; 3270 } 3271 3272 /* 3273 * Resize an extent indirection array to new_size bytes. 3274 */ 3275 STATIC void 3276 xfs_iext_realloc_indirect( 3277 xfs_ifork_t *ifp, /* inode fork pointer */ 3278 int new_size) /* new indirection array size */ 3279 { 3280 int nlists; /* number of irec's (ex lists) */ 3281 int size; /* current indirection array size */ 3282 3283 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3284 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3285 size = nlists * sizeof(xfs_ext_irec_t); 3286 ASSERT(ifp->if_real_bytes); 3287 ASSERT((new_size >= 0) && (new_size != size)); 3288 if (new_size == 0) { 3289 xfs_iext_destroy(ifp); 3290 } else { 3291 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) 3292 kmem_realloc(ifp->if_u1.if_ext_irec, 3293 new_size, size, KM_NOFS); 3294 } 3295 } 3296 3297 /* 3298 * Switch from indirection array to linear (direct) extent allocations. 3299 */ 3300 STATIC void 3301 xfs_iext_indirect_to_direct( 3302 xfs_ifork_t *ifp) /* inode fork pointer */ 3303 { 3304 xfs_bmbt_rec_host_t *ep; /* extent record pointer */ 3305 xfs_extnum_t nextents; /* number of extents in file */ 3306 int size; /* size of file extents */ 3307 3308 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3309 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3310 ASSERT(nextents <= XFS_LINEAR_EXTS); 3311 size = nextents * sizeof(xfs_bmbt_rec_t); 3312 3313 xfs_iext_irec_compact_pages(ifp); 3314 ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); 3315 3316 ep = ifp->if_u1.if_ext_irec->er_extbuf; 3317 kmem_free(ifp->if_u1.if_ext_irec); 3318 ifp->if_flags &= ~XFS_IFEXTIREC; 3319 ifp->if_u1.if_extents = ep; 3320 ifp->if_bytes = size; 3321 if (nextents < XFS_LINEAR_EXTS) { 3322 xfs_iext_realloc_direct(ifp, size); 3323 } 3324 } 3325 3326 /* 3327 * Free incore file extents. 3328 */ 3329 void 3330 xfs_iext_destroy( 3331 xfs_ifork_t *ifp) /* inode fork pointer */ 3332 { 3333 if (ifp->if_flags & XFS_IFEXTIREC) { 3334 int erp_idx; 3335 int nlists; 3336 3337 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3338 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { 3339 xfs_iext_irec_remove(ifp, erp_idx); 3340 } 3341 ifp->if_flags &= ~XFS_IFEXTIREC; 3342 } else if (ifp->if_real_bytes) { 3343 kmem_free(ifp->if_u1.if_extents); 3344 } else if (ifp->if_bytes) { 3345 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * 3346 sizeof(xfs_bmbt_rec_t)); 3347 } 3348 ifp->if_u1.if_extents = NULL; 3349 ifp->if_real_bytes = 0; 3350 ifp->if_bytes = 0; 3351 } 3352 3353 /* 3354 * Return a pointer to the extent record for file system block bno. 
3355 */ 3356 xfs_bmbt_rec_host_t * /* pointer to found extent record */ 3357 xfs_iext_bno_to_ext( 3358 xfs_ifork_t *ifp, /* inode fork pointer */ 3359 xfs_fileoff_t bno, /* block number to search for */ 3360 xfs_extnum_t *idxp) /* index of target extent */ 3361 { 3362 xfs_bmbt_rec_host_t *base; /* pointer to first extent */ 3363 xfs_filblks_t blockcount = 0; /* number of blocks in extent */ 3364 xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ 3365 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3366 int high; /* upper boundary in search */ 3367 xfs_extnum_t idx = 0; /* index of target extent */ 3368 int low; /* lower boundary in search */ 3369 xfs_extnum_t nextents; /* number of file extents */ 3370 xfs_fileoff_t startoff = 0; /* start offset of extent */ 3371 3372 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3373 if (nextents == 0) { 3374 *idxp = 0; 3375 return NULL; 3376 } 3377 low = 0; 3378 if (ifp->if_flags & XFS_IFEXTIREC) { 3379 /* Find target extent list */ 3380 int erp_idx = 0; 3381 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); 3382 base = erp->er_extbuf; 3383 high = erp->er_extcount - 1; 3384 } else { 3385 base = ifp->if_u1.if_extents; 3386 high = nextents - 1; 3387 } 3388 /* Binary search extent records */ 3389 while (low <= high) { 3390 idx = (low + high) >> 1; 3391 ep = base + idx; 3392 startoff = xfs_bmbt_get_startoff(ep); 3393 blockcount = xfs_bmbt_get_blockcount(ep); 3394 if (bno < startoff) { 3395 high = idx - 1; 3396 } else if (bno >= startoff + blockcount) { 3397 low = idx + 1; 3398 } else { 3399 /* Convert back to file-based extent index */ 3400 if (ifp->if_flags & XFS_IFEXTIREC) { 3401 idx += erp->er_extoff; 3402 } 3403 *idxp = idx; 3404 return ep; 3405 } 3406 } 3407 /* Convert back to file-based extent index */ 3408 if (ifp->if_flags & XFS_IFEXTIREC) { 3409 idx += erp->er_extoff; 3410 } 3411 if (bno >= startoff + blockcount) { 3412 if (++idx == nextents) { 3413 ep = NULL; 3414 } else { 3415 ep = xfs_iext_get_ext(ifp, idx); 3416 } 3417 } 3418 *idxp = idx; 3419 return ep; 3420 } 3421 3422 /* 3423 * Return a pointer to the indirection array entry containing the 3424 * extent record for filesystem block bno. Store the index of the 3425 * target irec in *erp_idxp. 3426 */ 3427 xfs_ext_irec_t * /* pointer to found extent record */ 3428 xfs_iext_bno_to_irec( 3429 xfs_ifork_t *ifp, /* inode fork pointer */ 3430 xfs_fileoff_t bno, /* block number to search for */ 3431 int *erp_idxp) /* irec index of target ext list */ 3432 { 3433 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ 3434 xfs_ext_irec_t *erp_next; /* next indirection array entry */ 3435 int erp_idx; /* indirection array index */ 3436 int nlists; /* number of extent irec's (lists) */ 3437 int high; /* binary search upper limit */ 3438 int low; /* binary search lower limit */ 3439 3440 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3441 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3442 erp_idx = 0; 3443 low = 0; 3444 high = nlists - 1; 3445 while (low <= high) { 3446 erp_idx = (low + high) >> 1; 3447 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3448 erp_next = erp_idx < nlists - 1 ? 
erp + 1 : NULL; 3449 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { 3450 high = erp_idx - 1; 3451 } else if (erp_next && bno >= 3452 xfs_bmbt_get_startoff(erp_next->er_extbuf)) { 3453 low = erp_idx + 1; 3454 } else { 3455 break; 3456 } 3457 } 3458 *erp_idxp = erp_idx; 3459 return erp; 3460 } 3461 3462 /* 3463 * Return a pointer to the indirection array entry containing the 3464 * extent record at file extent index *idxp. Store the index of the 3465 * target irec in *erp_idxp and store the page index of the target 3466 * extent record in *idxp. 3467 */ 3468 xfs_ext_irec_t * 3469 xfs_iext_idx_to_irec( 3470 xfs_ifork_t *ifp, /* inode fork pointer */ 3471 xfs_extnum_t *idxp, /* extent index (file -> page) */ 3472 int *erp_idxp, /* pointer to target irec */ 3473 int realloc) /* new bytes were just added */ 3474 { 3475 xfs_ext_irec_t *prev; /* pointer to previous irec */ 3476 xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ 3477 int erp_idx; /* indirection array index */ 3478 int nlists; /* number of irec's (ex lists) */ 3479 int high; /* binary search upper limit */ 3480 int low; /* binary search lower limit */ 3481 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 3482 3483 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3484 ASSERT(page_idx >= 0); 3485 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); 3486 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); 3487 3488 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3489 erp_idx = 0; 3490 low = 0; 3491 high = nlists - 1; 3492 3493 /* Binary search extent irec's */ 3494 while (low <= high) { 3495 erp_idx = (low + high) >> 1; 3496 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3497 prev = erp_idx > 0 ? erp - 1 : NULL; 3498 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && 3499 realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { 3500 high = erp_idx - 1; 3501 } else if (page_idx > erp->er_extoff + erp->er_extcount || 3502 (page_idx == erp->er_extoff + erp->er_extcount && 3503 !realloc)) { 3504 low = erp_idx + 1; 3505 } else if (page_idx == erp->er_extoff + erp->er_extcount && 3506 erp->er_extcount == XFS_LINEAR_EXTS) { 3507 ASSERT(realloc); 3508 page_idx = 0; 3509 erp_idx++; 3510 erp = erp_idx < nlists ? erp + 1 : NULL; 3511 break; 3512 } else { 3513 page_idx -= erp->er_extoff; 3514 break; 3515 } 3516 } 3517 *idxp = page_idx; 3518 *erp_idxp = erp_idx; 3519 return(erp); 3520 } 3521 3522 /* 3523 * Allocate and initialize an indirection array once the space needed 3524 * for incore extents increases above XFS_IEXT_BUFSZ. 
3525 */ 3526 void 3527 xfs_iext_irec_init( 3528 xfs_ifork_t *ifp) /* inode fork pointer */ 3529 { 3530 xfs_ext_irec_t *erp; /* indirection array pointer */ 3531 xfs_extnum_t nextents; /* number of extents in file */ 3532 3533 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); 3534 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3535 ASSERT(nextents <= XFS_LINEAR_EXTS); 3536 3537 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); 3538 3539 if (nextents == 0) { 3540 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); 3541 } else if (!ifp->if_real_bytes) { 3542 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); 3543 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { 3544 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); 3545 } 3546 erp->er_extbuf = ifp->if_u1.if_extents; 3547 erp->er_extcount = nextents; 3548 erp->er_extoff = 0; 3549 3550 ifp->if_flags |= XFS_IFEXTIREC; 3551 ifp->if_real_bytes = XFS_IEXT_BUFSZ; 3552 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); 3553 ifp->if_u1.if_ext_irec = erp; 3554 3555 return; 3556 } 3557 3558 /* 3559 * Allocate and initialize a new entry in the indirection array. 3560 */ 3561 xfs_ext_irec_t * 3562 xfs_iext_irec_new( 3563 xfs_ifork_t *ifp, /* inode fork pointer */ 3564 int erp_idx) /* index for new irec */ 3565 { 3566 xfs_ext_irec_t *erp; /* indirection array pointer */ 3567 int i; /* loop counter */ 3568 int nlists; /* number of irec's (ex lists) */ 3569 3570 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3571 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3572 3573 /* Resize indirection array */ 3574 xfs_iext_realloc_indirect(ifp, ++nlists * 3575 sizeof(xfs_ext_irec_t)); 3576 /* 3577 * Move records down in the array so the 3578 * new page can use erp_idx. 3579 */ 3580 erp = ifp->if_u1.if_ext_irec; 3581 for (i = nlists - 1; i > erp_idx; i--) { 3582 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); 3583 } 3584 ASSERT(i == erp_idx); 3585 3586 /* Initialize new extent record */ 3587 erp = ifp->if_u1.if_ext_irec; 3588 erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); 3589 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 3590 memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); 3591 erp[erp_idx].er_extcount = 0; 3592 erp[erp_idx].er_extoff = erp_idx > 0 ? 3593 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; 3594 return (&erp[erp_idx]); 3595 } 3596 3597 /* 3598 * Remove a record from the indirection array. 3599 */ 3600 void 3601 xfs_iext_irec_remove( 3602 xfs_ifork_t *ifp, /* inode fork pointer */ 3603 int erp_idx) /* irec index to remove */ 3604 { 3605 xfs_ext_irec_t *erp; /* indirection array pointer */ 3606 int i; /* loop counter */ 3607 int nlists; /* number of irec's (ex lists) */ 3608 3609 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3610 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3611 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3612 if (erp->er_extbuf) { 3613 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, 3614 -erp->er_extcount); 3615 kmem_free(erp->er_extbuf); 3616 } 3617 /* Compact extent records */ 3618 erp = ifp->if_u1.if_ext_irec; 3619 for (i = erp_idx; i < nlists - 1; i++) { 3620 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); 3621 } 3622 /* 3623 * Manually free the last extent record from the indirection 3624 * array. A call to xfs_iext_realloc_indirect() with a size 3625 * of zero would result in a call to xfs_iext_destroy() which 3626 * would in turn call this function again, creating a nasty 3627 * infinite loop. 
3628 */ 3629 if (--nlists) { 3630 xfs_iext_realloc_indirect(ifp, 3631 nlists * sizeof(xfs_ext_irec_t)); 3632 } else { 3633 kmem_free(ifp->if_u1.if_ext_irec); 3634 } 3635 ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; 3636 } 3637 3638 /* 3639 * This is called to clean up large amounts of unused memory allocated 3640 * by the indirection array. Before compacting anything though, verify 3641 * that the indirection array is still needed and switch back to the 3642 * linear extent list (or even the inline buffer) if possible. The 3643 * compaction policy is as follows: 3644 * 3645 * Full Compaction: Extents fit into a single page (or inline buffer) 3646 * Partial Compaction: Extents occupy less than 50% of allocated space 3647 * No Compaction: Extents occupy at least 50% of allocated space 3648 */ 3649 void 3650 xfs_iext_irec_compact( 3651 xfs_ifork_t *ifp) /* inode fork pointer */ 3652 { 3653 xfs_extnum_t nextents; /* number of extents in file */ 3654 int nlists; /* number of irec's (ex lists) */ 3655 3656 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3657 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3658 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3659 3660 if (nextents == 0) { 3661 xfs_iext_destroy(ifp); 3662 } else if (nextents <= XFS_INLINE_EXTS) { 3663 xfs_iext_indirect_to_direct(ifp); 3664 xfs_iext_direct_to_inline(ifp, nextents); 3665 } else if (nextents <= XFS_LINEAR_EXTS) { 3666 xfs_iext_indirect_to_direct(ifp); 3667 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { 3668 xfs_iext_irec_compact_pages(ifp); 3669 } 3670 } 3671 3672 /* 3673 * Combine extents from neighboring extent pages. 3674 */ 3675 void 3676 xfs_iext_irec_compact_pages( 3677 xfs_ifork_t *ifp) /* inode fork pointer */ 3678 { 3679 xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ 3680 int erp_idx = 0; /* indirection array index */ 3681 int nlists; /* number of irec's (ex lists) */ 3682 3683 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3684 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3685 while (erp_idx < nlists - 1) { 3686 erp = &ifp->if_u1.if_ext_irec[erp_idx]; 3687 erp_next = erp + 1; 3688 if (erp_next->er_extcount <= 3689 (XFS_LINEAR_EXTS - erp->er_extcount)) { 3690 memcpy(&erp->er_extbuf[erp->er_extcount], 3691 erp_next->er_extbuf, erp_next->er_extcount * 3692 sizeof(xfs_bmbt_rec_t)); 3693 erp->er_extcount += erp_next->er_extcount; 3694 /* 3695 * Free page before removing extent record 3696 * so er_extoffs don't get modified in 3697 * xfs_iext_irec_remove. 3698 */ 3699 kmem_free(erp_next->er_extbuf); 3700 erp_next->er_extbuf = NULL; 3701 xfs_iext_irec_remove(ifp, erp_idx + 1); 3702 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3703 } else { 3704 erp_idx++; 3705 } 3706 } 3707 } 3708 3709 /* 3710 * This is called to update the er_extoff field in the indirection 3711 * array when extents have been added or removed from one of the 3712 * extent lists. erp_idx contains the irec index to begin updating 3713 * at, and ext_diff contains the number of extents that were added 3714 * or removed. 3715 */ 3716 void 3717 xfs_iext_irec_update_extoffs( 3718 xfs_ifork_t *ifp, /* inode fork pointer */ 3719 int erp_idx, /* irec index to update */ 3720 int ext_diff) /* number of new extents */ 3721 { 3722 int i; /* loop counter */ 3723 int nlists; /* number of irec's (ex lists) */ 3724 3725 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3726 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3727 for (i = erp_idx; i < nlists; i++) { 3728 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; 3729 } 3730 } 3731
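/*
 * Usage sketch for the incore extent record API above. This is an
 * illustrative fragment rather than code that is compiled here, and
 * the extent values are made up. Inserting one new extent at index
 * idx of the data fork looks like:
 *
 *	xfs_bmbt_irec_t irec = {
 *		.br_startoff = 0,
 *		.br_startblock = 1000,
 *		.br_blockcount = 8,
 *		.br_state = XFS_EXT_NORM,
 *	};
 *
 *	xfs_iext_insert(ip, idx, 1, &irec, 0);
 *
 * A state argument without BMAP_ATTRFORK set selects the data fork.
 * xfs_iext_insert() calls xfs_iext_add() to make room, growing the
 * fork from the inline buffer to a direct list and finally to the
 * indirection array as the extent count crosses XFS_INLINE_EXTS and
 * XFS_LINEAR_EXTS, and then packs each xfs_bmbt_irec_t into on-disk
 * record format with xfs_bmbt_set_all(). Lookup goes the other way:
 *
 *	xfs_extnum_t idx;
 *	xfs_bmbt_rec_host_t *ep;
 *
 *	ep = xfs_iext_bno_to_ext(ifp, bno, &idx);
 *
 * which returns the extent record containing file block bno (or the
 * next record after bno, or NULL if bno is beyond the last extent)
 * and stores the file extent index in idx.
 */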