// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include <linux/iversion.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_iunlink_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"

struct kmem_cache *xfs_inode_cache;

STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
	struct xfs_inode *);

/*
 * helper function to extract extent size hint from inode
 */
xfs_extlen_t
xfs_get_extsz_hint(
	struct xfs_inode	*ip)
{
	/*
	 * No point in aligning allocations if we need to COW to actually
	 * write to them.
	 */
	if (xfs_is_always_cow_inode(ip))
		return 0;
	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
		return ip->i_extsize;
	if (XFS_IS_REALTIME_INODE(ip))
		return ip->i_mount->m_sb.sb_rextsize;
	return 0;
}

/*
 * Helper function to extract CoW extent size hint from inode.
 * Between the extent size hint and the CoW extent size hint, we
 * return the greater of the two.  If the value is zero (automatic),
 * use the default size.
 */
xfs_extlen_t
xfs_get_cowextsz_hint(
	struct xfs_inode	*ip)
{
	xfs_extlen_t		a, b;

	a = 0;
	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
		a = ip->i_cowextsize;
	b = xfs_get_extsz_hint(ip);

	a = max(a, b);
	if (a == 0)
		return XFS_DEFAULT_COWEXTSZ_HINT;
	return a;
}

/*
 * These two are wrapper routines around the xfs_ilock() routine used to
 * centralize some grungy code.  They are used in places that wish to lock the
 * inode solely for reading the extents.  The reason these places can't just
 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 * bringing in of the extents from disk for a file in b-tree format.  If the
 * inode is in b-tree format, then we need to lock the inode exclusively until
 * the extents are read in.  Locking it exclusively all the time would limit
 * our parallelism unnecessarily, though.  What we do instead is check to see
 * if the extents have been read in yet, and only lock the inode exclusively
 * if they have not.
 *
 * The functions return a value which should be given to the corresponding
 * xfs_iunlock() call.
 */
uint
xfs_ilock_data_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_need_iread_extents(&ip->i_df))
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

uint
xfs_ilock_attr_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}
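
/*
 * Hedged usage sketch (not part of the original file): callers that only need
 * to read the extent list save the returned lock mode and hand it back to
 * xfs_iunlock(), e.g.
 *
 *	uint	lock_mode = xfs_ilock_data_map_shared(ip);
 *
 *	... read the ip->i_df extent records ...
 *	xfs_iunlock(ip, lock_mode);
 */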

/*
 * You can't set both SHARED and EXCL for the same lock,
 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
 * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
 * to set in lock_flags.
 */
static inline void
xfs_lock_flags_assert(
	uint		lock_flags)
{
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
		(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
		(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
		(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
	ASSERT(lock_flags != 0);
}

/*
 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
 * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
 * various combinations of the locks to be obtained.
 *
 * The 3 locks should always be ordered so that the IO lock is obtained first,
 * the mmap lock second and the ilock last in order to prevent deadlock.
 *
 * Basic locking order:
 *
 * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
 *
 * mmap_lock locking order:
 *
 * i_rwsem -> page lock -> mmap_lock
 * mmap_lock -> invalidate_lock -> page_lock
 *
 * The difference in mmap_lock locking order means that we cannot hold the
 * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
 * can fault in pages during copy in/out (for buffered IO) or require the
 * mmap_lock in get_user_pages() to map the user pages into the kernel address
 * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
 * fault because page faults already hold the mmap_lock.
 *
 * Hence to serialise fully against both syscall and mmap based IO, we need to
 * take both the i_rwsem and the invalidate_lock. These locks should *only* be
 * both taken in places where we need to invalidate the page cache in a race
 * free manner (e.g. truncate, hole punch and other extent manipulation
 * functions).
 */
void
xfs_ilock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock(ip, lock_flags, _RET_IP_);

	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		down_write_nested(&VFS_I(ip)->i_rwsem,
				XFS_IOLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		down_read_nested(&VFS_I(ip)->i_rwsem,
				XFS_IOLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
				XFS_MMAPLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
				XFS_MMAPLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_ILOCK_EXCL)
		down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	else if (lock_flags & XFS_ILOCK_SHARED)
		down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
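
/*
 * Hedged example (not in the original source): an extent manipulation path
 * such as hole punching serialises against both syscall and mmap based IO by
 * taking the io and mmap locks together before invalidating the page cache:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 *	... invalidate the page cache, then take XFS_ILOCK_EXCL inside the
 *	    transaction for the actual extent updates ...
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 */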

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be
 *	 locked.  See the comment for xfs_ilock() for a list
 *	 of valid values.
 */
int
xfs_ilock_nowait(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
			goto out_undo_iolock;
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
			goto out_undo_iolock;
	}

	if (lock_flags & XFS_ILOCK_EXCL) {
		if (!down_write_trylock(&ip->i_lock))
			goto out_undo_mmaplock;
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		if (!down_read_trylock(&ip->i_lock))
			goto out_undo_mmaplock;
	}
	return 1;

out_undo_mmaplock:
	if (lock_flags & XFS_MMAPLOCK_EXCL)
		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_SHARED)
		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
out_undo_iolock:
	if (lock_flags & XFS_IOLOCK_EXCL)
		up_write(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		up_read(&VFS_I(ip)->i_rwsem);
out:
	return 0;
}
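
/*
 * Hedged usage sketch (not in the original source): the pattern used by
 * xfs_lock_inodes() and xfs_lock_two_inodes() below is to try-lock, and on
 * failure drop whatever is already held, optionally delay(1), and retry:
 *
 *	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
 *		... release previously acquired locks ...
 *		delay(1);
 *		goto again;
 *	}
 */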

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be
 *	 unlocked.  See the comment for xfs_ilock() for a list
 *	 of valid values for this parameter.
 *
 */
void
xfs_iunlock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL)
		up_write(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		up_read(&VFS_I(ip)->i_rwsem);

	if (lock_flags & XFS_MMAPLOCK_EXCL)
		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_SHARED)
		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);

	if (lock_flags & XFS_ILOCK_EXCL)
		up_write(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_SHARED)
		up_read(&ip->i_lock);

	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * give up write locks.  the i/o lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
	ASSERT((lock_flags &
		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

	if (lock_flags & XFS_ILOCK_EXCL)
		downgrade_write(&ip->i_lock);
	if (lock_flags & XFS_MMAPLOCK_EXCL)
		downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	if (lock_flags & XFS_IOLOCK_EXCL)
		downgrade_write(&VFS_I(ip)->i_rwsem);

	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
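
/*
 * Hedged example (not in the original source): a writer that needs exclusive
 * access only for initial setup can downgrade rather than drop and retake the
 * lock, avoiding a window where another writer could sneak in:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *	... one-time setup requiring exclusive access ...
 *	xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 *	... continue under XFS_IOLOCK_SHARED ...
 *	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 */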

void
xfs_assert_ilocked(
	struct xfs_inode	*ip,
	uint			lock_flags)
{
	/*
	 * Sometimes we assert the ILOCK is held exclusively, but we're in
	 * a workqueue, so lockdep doesn't know we're the owner.
	 */
	if (lock_flags & XFS_ILOCK_SHARED)
		rwsem_assert_held(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_EXCL)
		rwsem_assert_held_write_nolockdep(&ip->i_lock);

	if (lock_flags & XFS_MMAPLOCK_SHARED)
		rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_EXCL)
		rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);

	if (lock_flags & XFS_IOLOCK_SHARED)
		rwsem_assert_held(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_EXCL)
		rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
}

/*
 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
 * errors and warnings.
 */
#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
static bool
xfs_lockdep_subclass_ok(
	int subclass)
{
	return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#else
#define xfs_lockdep_subclass_ok(subclass)	(true)
#endif

/*
 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 * value. This can be called for any type of inode lock combination, including
 * parent locking. Care must be taken to ensure we don't overrun the subclass
 * storage fields in the class mask we build.
 */
static inline uint
xfs_lock_inumorder(
	uint		lock_mode,
	uint		subclass)
{
	uint		class = 0;

	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
			      XFS_ILOCK_RTSUM)));
	ASSERT(xfs_lockdep_subclass_ok(subclass));

	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
		ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
		class += subclass << XFS_IOLOCK_SHIFT;
	}

	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
		ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
		class += subclass << XFS_MMAPLOCK_SHIFT;
	}

	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
		ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
		class += subclass << XFS_ILOCK_SHIFT;
	}

	return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
}

/*
 * The following routine will lock n inodes in exclusive mode.  We assume the
 * caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock is in the AIL and we
 * start waiting for another inode that is locked by a thread in a long running
 * transaction (such as truncate). This can result in deadlock since the long
 * running trans might need to wait for the inode we just locked in order to
 * push the tail and free space in the log.
 *
 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 * lock more than one at a time, lockdep will report false positives saying we
 * have violated locking orders.
 */
static void
xfs_lock_inodes(
	struct xfs_inode	**ips,
	int			inodes,
	uint			lock_mode)
{
	int			attempts = 0;
	uint			i;
	int			j;
	bool			try_lock;
	struct xfs_log_item	*lp;

	/*
	 * Currently supports between 2 and 5 inodes with exclusive locking. We
	 * support an arbitrary depth of locking here, but absolute limits on
	 * inodes depend on the type of locking and the limits placed by
	 * lockdep annotations in xfs_lock_inumorder.  These are all checked by
	 * the asserts.
	 */
	ASSERT(ips && inodes >= 2 && inodes <= 5);
	ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
			    XFS_ILOCK_EXCL));
	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
			      XFS_ILOCK_SHARED)));
	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
		inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
		inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);

	if (lock_mode & XFS_IOLOCK_EXCL) {
		ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
		ASSERT(!(lock_mode & XFS_ILOCK_EXCL));

again:
	try_lock = false;
	i = 0;
	for (; i < inodes; i++) {
		ASSERT(ips[i]);

		if (i && (ips[i] == ips[i - 1]))	/* Already locked */
			continue;

		/*
		 * If try_lock is not set yet, make sure all locked inodes are
		 * not in the AIL.  If any are, set try_lock to be used later.
		 */
		if (!try_lock) {
			for (j = (i - 1); j >= 0 && !try_lock; j--) {
				lp = &ips[j]->i_itemp->ili_item;
				if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
					try_lock = true;
			}
		}

		/*
		 * If any of the previous locks we have locked is in the AIL,
		 * we must TRY to get the second and subsequent locks. If
		 * we can't get any, we must release all we have
		 * and try again.
		 */
		if (!try_lock) {
			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
			continue;
		}

		/* try_lock means we have an inode locked that is in the AIL. */
		ASSERT(i != 0);
		if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
			continue;

		/*
		 * Unlock all previous guys and try again.  xfs_iunlock will try
		 * to push the tail if the inode is in the AIL.
		 */
		attempts++;
		for (j = i - 1; j >= 0; j--) {
			/*
			 * Check to see if we've already unlocked this one.  Not
			 * the first one going back, and the inode ptr is the
			 * same.
			 */
			if (j != (i - 1) && ips[j] == ips[j + 1])
				continue;

			xfs_iunlock(ips[j], lock_mode);
		}

		if ((attempts % 5) == 0) {
			delay(1); /* Don't just spin the CPU */
		}
		goto again;
	}
}
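
/*
 * Hedged usage sketch (not in the original source): a caller like rename
 * gathers the inodes involved, sorts them by inode number, and then locks
 * them all in one go so the AIL deadlock avoidance above can do its job:
 *
 *	struct xfs_inode	*ips[4] = { dp1, dp2, ip1, ip2 };
 *
 *	... sort ips[] by i_ino, dropping NULL and duplicate entries ...
 *	xfs_lock_inodes(ips, num_inodes, XFS_ILOCK_EXCL);
 */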

/*
 * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
 * mmaplock must be double-locked separately since we use i_rwsem and
 * invalidate_lock for that. We now support taking one lock EXCL and the
 * other SHARED.
 */
void
xfs_lock_two_inodes(
	struct xfs_inode	*ip0,
	uint			ip0_mode,
	struct xfs_inode	*ip1,
	uint			ip1_mode)
{
	int			attempts = 0;
	struct xfs_log_item	*lp;

	ASSERT(hweight32(ip0_mode) == 1);
	ASSERT(hweight32(ip1_mode) == 1);
	ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
	ASSERT(ip0->i_ino != ip1->i_ino);

	if (ip0->i_ino > ip1->i_ino) {
		swap(ip0, ip1);
		swap(ip0_mode, ip1_mode);
	}

again:
	xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));

	/*
	 * If the first lock we have locked is in the AIL, we must TRY to get
	 * the second lock. If we can't get it, we must release the first one
	 * and try again.
	 */
	lp = &ip0->i_itemp->ili_item;
	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
			xfs_iunlock(ip0, ip0_mode);
			if ((++attempts % 5) == 0)
				delay(1); /* Don't just spin the CPU */
			goto again;
		}
	} else {
		xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
	}
}
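
/*
 * Hedged example (not in the original source): operations that modify two
 * inodes at once (e.g. exchanging extents between two files) take both ilocks
 * through this helper so the inode-number ordering and AIL checks apply:
 *
 *	xfs_lock_two_inodes(ip0, XFS_ILOCK_EXCL, ip1, XFS_ILOCK_EXCL);
 *	... join both inodes to a transaction and modify them ...
 *	(the locks are later dropped via xfs_iunlock() on each inode)
 */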

uint
xfs_ip2xflags(
	struct xfs_inode	*ip)
{
	uint			flags = 0;

	if (ip->i_diflags & XFS_DIFLAG_ANY) {
		if (ip->i_diflags & XFS_DIFLAG_REALTIME)
			flags |= FS_XFLAG_REALTIME;
		if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
			flags |= FS_XFLAG_PREALLOC;
		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
			flags |= FS_XFLAG_IMMUTABLE;
		if (ip->i_diflags & XFS_DIFLAG_APPEND)
			flags |= FS_XFLAG_APPEND;
		if (ip->i_diflags & XFS_DIFLAG_SYNC)
			flags |= FS_XFLAG_SYNC;
		if (ip->i_diflags & XFS_DIFLAG_NOATIME)
			flags |= FS_XFLAG_NOATIME;
		if (ip->i_diflags & XFS_DIFLAG_NODUMP)
			flags |= FS_XFLAG_NODUMP;
		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
			flags |= FS_XFLAG_RTINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
			flags |= FS_XFLAG_PROJINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
			flags |= FS_XFLAG_NOSYMLINKS;
		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
			flags |= FS_XFLAG_EXTSIZE;
		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= FS_XFLAG_EXTSZINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
			flags |= FS_XFLAG_NODEFRAG;
		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
			flags |= FS_XFLAG_FILESTREAM;
	}

	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
		if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
			flags |= FS_XFLAG_DAX;
		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
			flags |= FS_XFLAG_COWEXTSIZE;
	}

	if (xfs_inode_has_attr_fork(ip))
		flags |= FS_XFLAG_HASATTR;
	return flags;
}

/*
 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match. If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int
xfs_lookup(
	struct xfs_inode	*dp,
	const struct xfs_name	*name,
	struct xfs_inode	**ipp,
	struct xfs_name		*ci_name)
{
	xfs_ino_t		inum;
	int			error;

	trace_xfs_lookup(dp, name);

	if (xfs_is_shutdown(dp->i_mount))
		return -EIO;
	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
		return -EIO;

	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	if (error)
		goto out_unlock;

	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
	if (error)
		goto out_free_name;

	return 0;

out_free_name:
	if (ci_name)
		kfree(ci_name->name);
out_unlock:
	*ipp = NULL;
	return error;
}

/* Propagate di_flags from a parent inode to a child inode. */
static void
xfs_inode_inherit_flags(
	struct xfs_inode	*ip,
	const struct xfs_inode	*pip)
{
	unsigned int		di_flags = 0;
	xfs_failaddr_t		failaddr;
	umode_t			mode = VFS_I(ip)->i_mode;

	if (S_ISDIR(mode)) {
		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
			di_flags |= XFS_DIFLAG_RTINHERIT;
		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
			ip->i_extsize = pip->i_extsize;
		}
		if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
			di_flags |= XFS_DIFLAG_PROJINHERIT;
	} else if (S_ISREG(mode)) {
		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
		    xfs_has_realtime(ip->i_mount))
			di_flags |= XFS_DIFLAG_REALTIME;
		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSIZE;
			ip->i_extsize = pip->i_extsize;
		}
	}
	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
	    xfs_inherit_noatime)
		di_flags |= XFS_DIFLAG_NOATIME;
	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
	    xfs_inherit_nodump)
		di_flags |= XFS_DIFLAG_NODUMP;
	if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
	    xfs_inherit_sync)
		di_flags |= XFS_DIFLAG_SYNC;
	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
	    xfs_inherit_nosymlinks)
		di_flags |= XFS_DIFLAG_NOSYMLINKS;
	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
	    xfs_inherit_nodefrag)
		di_flags |= XFS_DIFLAG_NODEFRAG;
	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
		di_flags |= XFS_DIFLAG_FILESTREAM;

	ip->i_diflags |= di_flags;

	/*
	 * Inode verifiers on older kernels only check that the extent size
	 * hint is an integer multiple of the rt extent size on realtime files.
	 * They did not check the hint alignment on a directory with both
	 * rtinherit and extszinherit flags set.  If the misaligned hint is
	 * propagated from a directory into a new realtime file, new file
	 * allocations will fail due to math errors in the rt allocator and/or
	 * trip the verifiers.  Validate the hint settings in the new file so
	 * that we don't let broken hints propagate.
	 */
	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
			VFS_I(ip)->i_mode, ip->i_diflags);
	if (failaddr) {
		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
				   XFS_DIFLAG_EXTSZINHERIT);
		ip->i_extsize = 0;
	}
}

/* Propagate di_flags2 from a parent inode to a child inode. */
static void
xfs_inode_inherit_flags2(
	struct xfs_inode	*ip,
	const struct xfs_inode	*pip)
{
	xfs_failaddr_t		failaddr;

	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
		ip->i_cowextsize = pip->i_cowextsize;
	}
	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
		ip->i_diflags2 |= XFS_DIFLAG2_DAX;

	/* Don't let invalid cowextsize hints propagate. */
	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
	if (failaddr) {
		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
		ip->i_cowextsize = 0;
	}
}

/*
 * Initialise a newly allocated inode and return the in-core inode to the
 * caller locked exclusively.
 */
int
xfs_init_new_inode(
	struct mnt_idmap	*idmap,
	struct xfs_trans	*tp,
	struct xfs_inode	*pip,
	xfs_ino_t		ino,
	umode_t			mode,
	xfs_nlink_t		nlink,
	dev_t			rdev,
	prid_t			prid,
	bool			init_xattrs,
	struct xfs_inode	**ipp)
{
	struct inode		*dir = pip ? VFS_I(pip) : NULL;
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inode	*ip;
	unsigned int		flags;
	int			error;
	struct timespec64	tv;
	struct inode		*inode;

	/*
	 * Protect against obviously corrupt allocation btree records. Later
	 * xfs_iget checks will catch re-allocation of other active in-memory
	 * and on-disk inodes. If we don't catch reallocating the parent inode
	 * here we will deadlock in xfs_iget() so we have to do these checks
	 * first.
	 */
	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
				XFS_SICK_AG_INOBT);
		return -EFSCORRUPTED;
	}

	/*
	 * Get the in-core inode with the lock held exclusively to prevent
	 * others from looking at it until we're done.
	 */
	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;

	ASSERT(ip != NULL);
	inode = VFS_I(ip);
	set_nlink(inode, nlink);
	inode->i_rdev = rdev;
	ip->i_projid = prid;

	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
		inode_fsuid_set(inode, idmap);
		inode->i_gid = dir->i_gid;
		inode->i_mode = mode;
	} else {
		inode_init_owner(idmap, inode, dir, mode);
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
	    !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
		inode->i_mode &= ~S_ISGID;

	ip->i_disk_size = 0;
	ip->i_df.if_nextents = 0;
	ASSERT(ip->i_nblocks == 0);

	tv = inode_set_ctime_current(inode);
	inode_set_mtime_to_ts(inode, tv);
	inode_set_atime_to_ts(inode, tv);

	ip->i_extsize = 0;
	ip->i_diflags = 0;

	if (xfs_has_v3inodes(mp)) {
		inode_set_iversion(inode, 1);
		ip->i_cowextsize = 0;
		ip->i_crtime = tv;
	}

	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
	case S_IFDIR:
		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
			xfs_inode_inherit_flags(ip, pip);
		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
			xfs_inode_inherit_flags2(ip, pip);
		fallthrough;
	case S_IFLNK:
		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_bytes = 0;
		ip->i_df.if_data = NULL;
		break;
	default:
		ASSERT(0);
	}

	/*
	 * If we need to create attributes immediately after allocating the
	 * inode, initialise an empty attribute fork right now. We use the
	 * default fork offset for attributes here as we don't know exactly what
	 * size or how many attributes we might be adding. We can do this
	 * safely here because we know the data fork is completely empty and
	 * this saves us from needing to run a separate transaction to set the
	 * fork offset in the immediate future.
	 */
	if (init_xattrs && xfs_has_attr(mp)) {
		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
	}

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup the inode structure */
	xfs_setup_inode(ip);

	*ipp = ip;
	return 0;
}

/*
 * Decrement the link count on an inode & log the change.  If this causes the
 * link count to go to zero, move the inode to the AGI unlinked list so that
 * it can be freed when the last active reference goes away via xfs_inactive().
 */
static int			/* error */
xfs_droplink(
	xfs_trans_t *tp,
	xfs_inode_t *ip)
{
	if (VFS_I(ip)->i_nlink == 0) {
		xfs_alert(ip->i_mount,
			  "%s: Attempt to drop inode (%llu) with nlink zero.",
			  __func__, ip->i_ino);
		return -EFSCORRUPTED;
	}

	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	drop_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (VFS_I(ip)->i_nlink)
		return 0;

	return xfs_iunlink(tp, ip);
}

/*
 * Increment the link count on an inode & log the change.
 */
static void
xfs_bumplink(
	xfs_trans_t *tp,
	xfs_inode_t *ip)
{
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	inc_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

#ifdef CONFIG_XFS_LIVE_HOOKS
/*
 * Use a static key here to reduce the overhead of directory live update
 * hooks.  If the compiler supports jump labels, the static branch will be
 * replaced by a nop sled when there are no hook users.  Online fsck is
 * currently the only caller, so this is a reasonable tradeoff.
 *
 * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
 * parts of the kernel allocate memory with that lock held, which means that
 * XFS callers cannot hold any locks that might be used by memory reclaim or
 * writeback when calling the static_branch_{inc,dec} functions.
 */
DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);

void
xfs_dir_hook_disable(void)
{
	xfs_hooks_switch_off(&xfs_dir_hooks_switch);
}

void
xfs_dir_hook_enable(void)
{
	xfs_hooks_switch_on(&xfs_dir_hooks_switch);
}

/* Call hooks for a directory update relating to a child dirent update. */
inline void
xfs_dir_update_hook(
	struct xfs_inode	*dp,
	struct xfs_inode	*ip,
	int			delta,
	const struct xfs_name	*name)
{
	if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
		struct xfs_dir_update_params	p = {
			.dp	= dp,
			.ip	= ip,
			.delta	= delta,
			.name	= name,
		};
		struct xfs_mount	*mp = ip->i_mount;

		xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
	}
}

/* Call the specified function during a directory update. */
int
xfs_dir_hook_add(
	struct xfs_mount	*mp,
	struct xfs_dir_hook	*hook)
{
	return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
}

/* Stop calling the specified function during a directory update. */
void
xfs_dir_hook_del(
	struct xfs_mount	*mp,
	struct xfs_dir_hook	*hook)
{
	xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
}
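
/*
 * Hedged sketch (not in the original source): a hook consumer such as online
 * fsck would typically initialise and register its hook roughly like this,
 * where "my_dirent_fn" is a hypothetical notifier callback:
 *
 *	struct xfs_dir_hook	hook;
 *
 *	xfs_dir_hook_setup(&hook, my_dirent_fn);
 *	error = xfs_dir_hook_add(mp, &hook);
 *	...
 *	xfs_dir_hook_del(mp, &hook);
 */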

/* Configure directory update hook functions. */
void
xfs_dir_hook_setup(
	struct xfs_dir_hook	*hook,
	notifier_fn_t		mod_fn)
{
	xfs_hook_setup(&hook->dirent_hook, mod_fn);
}
#endif /* CONFIG_XFS_LIVE_HOOKS */

int
xfs_create(
	struct mnt_idmap	*idmap,
	xfs_inode_t		*dp,
	struct xfs_name		*name,
	umode_t			mode,
	dev_t			rdev,
	bool			init_xattrs,
	xfs_inode_t		**ipp)
{
	int			is_dir = S_ISDIR(mode);
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	bool			unlock_dp_on_error = false;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	uint			resblks;
	xfs_ino_t		ino;

	trace_xfs_create(dp, name);

	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
		return -EIO;

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
			mapped_fsgid(idmap, &init_user_ns), prid,
			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
			&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	if (is_dir) {
		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_mkdir;
	} else {
		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_create;
	}

	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case.  If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error == -ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
				resblks, &tp);
	}
	if (error)
		goto out_release_dquots;

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;

	/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to them, but a directory also has the "." entry
	 * pointing to itself.
	 */
	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (!error)
		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
				is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
	if (error)
		goto out_trans_cancel;

	/*
	 * Now we join the directory inode to the transaction.  We do not do it
	 * earlier because xfs_dialloc might commit the previous transaction
	 * (and release all the locks).  An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;

	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
				   resblks - XFS_IALLOC_SPACE_RES(mp));
	if (error) {
		ASSERT(error != -ENOSPC);
		goto out_trans_cancel;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (error)
			goto out_trans_cancel;

		xfs_bumplink(tp, dp);
	}

	/*
	 * Create ip with a reference from dp, and add '.' and '..' references
	 * if it's a directory.
	 */
	xfs_dir_update_hook(dp, ip, 1, name);

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

 out_trans_cancel:
	xfs_trans_cancel(tp);
 out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.  This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}
 out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	if (unlock_dp_on_error)
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	return error;
}

int
xfs_create_tmpfile(
	struct mnt_idmap	*idmap,
	struct xfs_inode	*dp,
	umode_t			mode,
	struct xfs_inode	**ipp)
{
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	uint			resblks;
	xfs_ino_t		ino;

	if (xfs_is_shutdown(mp))
		return -EIO;

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
			mapped_fsgid(idmap, &init_user_ns), prid,
			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
			&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	resblks = XFS_IALLOC_SPACE_RES(mp);
	tres = &M_RES(mp)->tr_create_tmpfile;

	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error)
		goto out_release_dquots;

	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (!error)
		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
				0, 0, prid, false, &ip);
	if (error)
		goto out_trans_cancel;

	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_iunlink(tp, ip);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

 out_trans_cancel:
	xfs_trans_cancel(tp);
 out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.  This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}
 out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	return error;
}

int
xfs_link(
	xfs_inode_t		*tdp,
	xfs_inode_t		*sip,
	struct xfs_name		*target_name)
{
	xfs_mount_t		*mp = tdp->i_mount;
	xfs_trans_t		*tp;
	int			error, nospace_error = 0;
	int			resblks;

	trace_xfs_link(tdp, target_name);

	ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));

	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_ifork_zapped(tdp, XFS_DATA_FORK))
		return -EIO;

	error = xfs_qm_dqattach(sip);
	if (error)
		goto std_return;

	error = xfs_qm_dqattach(tdp);
	if (error)
		goto std_return;

	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
	error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
			&tp, &nospace_error);
	if (error)
		goto std_return;

	/*
	 * If we are using project inheritance, we only allow hard link
	 * creation in our tree when the project IDs are the same; else
	 * the tree quota mechanism could be circumvented.
	 */
	if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
		     tdp->i_projid != sip->i_projid)) {
		error = -EXDEV;
		goto error_return;
	}

	if (!resblks) {
		error = xfs_dir_canenter(tp, tdp, target_name);
		if (error)
			goto error_return;
	}

	/*
	 * Handle initial link state of O_TMPFILE inode
	 */
	if (VFS_I(sip)->i_nlink == 0) {
		struct xfs_perag	*pag;

		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
		error = xfs_iunlink_remove(tp, pag, sip);
		xfs_perag_put(pag);
		if (error)
			goto error_return;
	}

	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
				   resblks);
	if (error)
		goto error_return;
	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);

	xfs_bumplink(tp, sip);
	xfs_dir_update_hook(tdp, sip, 1, target_name);

	/*
	 * If this is a synchronous mount, make sure that the
	 * link transaction goes to disk before returning to
	 * the user.
	 */
	if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
		xfs_trans_set_sync(tp);

	return xfs_trans_commit(tp);

 error_return:
	xfs_trans_cancel(tp);
 std_return:
	if (error == -ENOSPC && nospace_error)
		error = nospace_error;
	return error;
}

/* Clear the reflink flag and the cowblocks tag if possible. */
static void
xfs_itruncate_clear_reflink_flags(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*dfork;
	struct xfs_ifork	*cfork;

	if (!xfs_is_reflink_inode(ip))
		return;
	dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
	if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	if (cfork->if_bytes == 0)
		xfs_inode_clear_cowblocks_tag(ip);
}

/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents_flags(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fsize_t		new_size,
	int			flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp = *tpp;
	xfs_fileoff_t		first_unmap_block;
	int			error = 0;

	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
	if (atomic_read(&VFS_I(ip)->i_count))
		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
	ASSERT(new_size <= XFS_ISIZE(ip));
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	trace_xfs_itruncate_extents_start(ip, new_size);

	flags |= xfs_bmapi_aflag(whichfork);

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.
	 *
	 * We have to free all the blocks to the bmbt maximum offset, even if
	 * the page cache can't scale that far.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	if (!xfs_verify_fileoff(mp, first_unmap_block)) {
		WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
		return 0;
	}

	error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
			XFS_MAX_FILEOFF);
	if (error)
		goto out;

	if (whichfork == XFS_DATA_FORK) {
		/* Remove all pending CoW reservations. */
		error = xfs_reflink_cancel_cow_blocks(ip, &tp,
				first_unmap_block, XFS_MAX_FILEOFF, true);
		if (error)
			goto out;

		xfs_itruncate_clear_reflink_flags(ip);
	}

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_extents_end(ip, new_size);

out:
	*tpp = tp;
	return error;
}
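
/*
 * Hedged sketch of the caller contract described above (not in the original
 * source); xfs_inactive_truncate() below is the canonical in-tree example:
 *
 *	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, 0);
 *	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
 *	... on success, commit the transaction that came back in tp ...
 */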

int
xfs_release(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp = ip->i_mount;
	int		error = 0;

	if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
		return 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (xfs_is_readonly(mp))
		return 0;

	if (!xfs_is_shutdown(mp)) {
		int truncated;

		/*
		 * If we previously truncated this file and removed old data
		 * in the process, we want to initiate "early" writeout on
		 * the last close.  This is an attempt to combat the notorious
		 * NULL files problem which is particularly noticeable from a
		 * truncate down, buffered (re-)write (delalloc), followed by
		 * a crash.  What we are effectively doing here is
		 * significantly reducing the time window where we'd otherwise
		 * be exposed to that problem.
		 */
		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
		if (truncated) {
			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
			if (ip->i_delayed_blks > 0) {
				error = filemap_flush(VFS_I(ip)->i_mapping);
				if (error)
					return error;
			}
		}
	}

	if (VFS_I(ip)->i_nlink == 0)
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise. We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
		return 0;

	if (xfs_can_free_eofblocks(ip, false)) {
		/*
		 * If the inode is being opened, written and closed frequently
		 * and we have delayed allocation blocks outstanding
		 * (e.g. streaming writes from the NFS server), truncating the
		 * blocks past EOF will cause fragmentation to occur.
		 *
		 * In this case don't do the truncation, but we have to be
		 * careful how we detect this case. Blocks beyond EOF show up as
		 * i_delayed_blks even when the inode is clean, so we need to
		 * truncate them away first before checking for a dirty release.
		 * Hence on the first dirty close we will still remove the
		 * speculative allocation, but after that we will leave it in
		 * place.
		 */
		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
			goto out_unlock;

		error = xfs_free_eofblocks(ip);
		if (error)
			goto out_unlock;

		/* delalloc blocks after truncation means it really is dirty */
		if (ip->i_delayed_blks)
			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
	}

out_unlock:
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;
}

/*
 * xfs_inactive_truncate
 *
 * Called to perform a truncate when an inode becomes unlinked.
 */
STATIC int
xfs_inactive_truncate(
	struct xfs_inode *ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error) {
		ASSERT(xfs_is_shutdown(mp));
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Log the inode size first to prevent stale data exposure in the event
	 * of a system crash before the truncate completes.  See the related
	 * comment in xfs_vn_setattr_size() for details.
	 */
	ip->i_disk_size = 0;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
	if (error)
		goto error_trans_cancel;

	ASSERT(ip->i_df.if_nextents == 0);

	error = xfs_trans_commit(tp);
	if (error)
		goto error_unlock;

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;

error_trans_cancel:
	xfs_trans_cancel(tp);
error_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * xfs_inactive_ifree()
 *
 * Perform the inode free when an inode is unlinked.
 */
STATIC int
xfs_inactive_ifree(
	struct xfs_inode *ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	/*
	 * We try to use a per-AG reservation for any block needed by the finobt
	 * tree, but as the finobt feature predates the per-AG reservation
	 * support a degraded file system might not have enough space for the
	 * reservation at mount time.  In that case try to dip into the reserved
	 * pool and pray.
	 *
	 * Send a warning if the reservation does happen to fail, as the inode
	 * now remains allocated and sits on the unlinked list until the fs is
	 * repaired.
	 */
	if (unlikely(mp->m_finobt_nores)) {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
				&tp);
	} else {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
	}
	if (error) {
		if (error == -ENOSPC) {
			xfs_warn_ratelimited(mp,
			"Failed to remove inode(s) from unlinked list. "
			"Please free space, unmount and run xfs_repair.");
		} else {
			ASSERT(xfs_is_shutdown(mp));
		}
		return error;
	}

	/*
	 * We do not hold the inode locked across the entire rolling transaction
	 * here. We only need to hold it for the first transaction that
	 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
	 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
	 * here breaks the relationship between cluster buffer invalidation and
	 * stale inode invalidation on cluster buffer item journal commit
	 * completion, and can result in leaving dirty stale inodes hanging
	 * around in memory.
	 *
	 * We have no need for serialising this inode operation against other
	 * operations - we freed the inode and hence reallocation is required
	 * and that will serialise on reallocating the space the deferops need
	 * to free. Hence we can unlock the inode on the first commit of
	 * the transaction rather than roll it right through the deferops. This
	 * avoids relogging the XFS_ISTALE inode.
	 *
	 * We check that xfs_ifree() hasn't grown an internal transaction roll
	 * by asserting that the inode is still locked when it returns.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	error = xfs_ifree(tp, ip);
	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
	if (error) {
		/*
		 * If we fail to free the inode, shut down.  The cancel
		 * might do that, we need to make sure.  Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!xfs_is_shutdown(mp)) {
			xfs_notice(mp, "%s: xfs_ifree returned error %d",
				__func__, error);
			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		}
		xfs_trans_cancel(tp);
		return error;
	}

	/*
	 * Credit the quota account(s). The inode is gone.
	 */
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

	return xfs_trans_commit(tp);
}

/*
 * Returns true if we need to update the on-disk metadata before we can free
 * the memory used by this inode.  Updates include freeing post-eof
 * preallocations; freeing COW staging extents; and marking the inode free in
 * the inobt if it is on the unlinked list.
 */
bool
xfs_inode_needs_inactive(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0)
		return false;

	/*
	 * If this is a read-only mount, don't do this (would generate I/O)
	 * unless we're in log recovery and cleaning the iunlinked list.
	 */
	if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
		return false;

	/* If the log isn't running, push inodes straight to reclaim. */
	if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
		return false;

	/* Metadata inodes require explicit resource cleanup. */
	if (xfs_is_metadata_inode(ip))
		return false;

	/* Want to clean out the cow blocks if there are any. */
	if (cow_ifp && cow_ifp->if_bytes > 0)
		return true;

	/* Unlinked files must be freed. */
	if (VFS_I(ip)->i_nlink == 0)
		return true;

	/*
	 * This file isn't being freed, so check if there are post-eof blocks
	 * to free.  @force is true because we are evicting an inode from the
	 * cache.  Post-eof blocks must be freed, lest we end up with broken
	 * free space accounting.
	 *
	 * Note: don't bother with iolock here since lockdep complains about
	 * acquiring it in reclaim context. We have the only reference to the
	 * inode at this point anyways.
	 */
	return xfs_can_free_eofblocks(ip, true);
}

/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
int
xfs_inactive(
	xfs_inode_t	*ip)
{
	struct xfs_mount	*mp;
	int			error = 0;
	int			truncate = 0;

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0) {
		ASSERT(ip->i_df.if_broot_bytes == 0);
		goto out;
	}

	mp = ip->i_mount;
	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));

	/*
	 * If this is a read-only mount, don't do this (would generate I/O)
	 * unless we're in log recovery and cleaning the iunlinked list.
	 */
	if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
		goto out;

	/* Metadata inodes require explicit resource cleanup. */
	if (xfs_is_metadata_inode(ip))
		goto out;

	/* Try to clean out the cow blocks if there are any. */
	if (xfs_inode_has_cow_data(ip))
		xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);

	if (VFS_I(ip)->i_nlink != 0) {
		/*
		 * force is true because we are evicting an inode from the
		 * cache. Post-eof blocks must be freed, lest we end up with
		 * broken free space accounting.
		 *
		 * Note: don't bother with iolock here since lockdep complains
		 * about acquiring it in reclaim context. We have the only
		 * reference to the inode at this point anyways.
		 */
		if (xfs_can_free_eofblocks(ip, true))
			error = xfs_free_eofblocks(ip);

		goto out;
	}

	if (S_ISREG(VFS_I(ip)->i_mode) &&
	    (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
	     ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
		truncate = 1;

	if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
		/*
		 * If this inode is being inactivated during a quotacheck and
		 * has not yet been scanned by quotacheck, we /must/ remove
		 * the dquots from the inode before inactivation changes the
		 * block and inode counts.  Most probably this is a result of
		 * reloading the incore iunlinked list to purge unrecovered
		 * unlinked inodes.
		 */
		xfs_qm_dqdetach(ip);
	} else {
		error = xfs_qm_dqattach(ip);
		if (error)
			goto out;
	}

	if (S_ISLNK(VFS_I(ip)->i_mode))
		error = xfs_inactive_symlink(ip);
	else if (truncate)
		error = xfs_inactive_truncate(ip);
	if (error)
		goto out;

	/*
	 * If there are attributes associated with the file then blow them away
	 * now.  The code calls a routine that recursively deconstructs the
	 * attribute fork.  It also blows away the in-core attribute fork.
	 */
	if (xfs_inode_has_attr_fork(ip)) {
		error = xfs_attr_inactive(ip);
		if (error)
			goto out;
	}

	ASSERT(ip->i_forkoff == 0);

	/*
	 * Free the inode.
	 */
	error = xfs_inactive_ifree(ip);

out:
	/*
	 * We're done making metadata updates for this inode, so we can release
	 * the attached dquots.
	 */
	xfs_qm_dqdetach(ip);
	return error;
}

/*
 * In-Core Unlinked List Lookups
 * =============================
 *
 * Every inode is supposed to be reachable from some other piece of metadata
 * with the exception of the root directory.  Inodes with a connection to a
 * file descriptor but not linked from anywhere in the on-disk directory tree
 * are collectively known as unlinked inodes, though the filesystem itself
 * maintains links to these inodes so that on-disk metadata are consistent.
 *
 * XFS implements a per-AG on-disk hash table of unlinked inodes.  The AGI
 * header contains a number of buckets that point to an inode, and each inode
 * record has a pointer to the next inode in the hash chain.  This
 * singly-linked list causes scaling problems in the iunlink remove function
 * because we must walk that list to find the inode that points to the inode
 * being removed from the unlinked hash bucket list.
 *
 * Hence we keep an in-memory double linked list to link each inode on an
 * unlinked list.  Because there are 64 unlinked lists per AGI, keeping pointer
 * based lists would require having 64 list heads in the perag, one for each
 * list.  This is expensive in terms of memory (think millions of AGs) and
 * cache misses on lookups.  Instead, use the fact that inodes on the unlinked
 * list must be referenced at the VFS level to keep them on the list and hence
 * we have an existence guarantee for inodes on the unlinked list.
 *
 * Given we have an existence guarantee, we can use lockless inode cache
 * lookups to resolve aginos to xfs inodes.  This means we only need 8 bytes
 * per inode for the double linked unlinked list, and we don't need any extra
 * locking to keep the list safe as all manipulations are done under the AGI
 * buffer lock.  Keeping the list up to date does not require memory
 * allocation, just finding the XFS inode and updating the next/prev unlinked
 * list aginos.
 */
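
/*
 * Hedged illustration (not in the original source) of the structure described
 * above, for a single AGI bucket:
 *
 *	AGI bucket[i] --> agino A --> agino B --> agino C --> NULLAGINO
 *	                      ^ <--------- ^ <--------- ^
 *	                  (in-memory i_prev_unlinked back pointers)
 *
 * The forward pointers are the on-disk next-unlinked chain; the back pointers
 * only exist in the in-core xfs_inode and let removal avoid walking the whole
 * bucket.
 */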
Instead, use the fact that inodes on the unlinked list 1877 * must be referenced at the VFS level to keep them on the list and hence we 1878 * have an existence guarantee for inodes on the unlinked list. 1879 * 1880 * Given we have an existence guarantee, we can use lockless inode cache lookups 1881 * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode 1882 * for the double linked unlinked list, and we don't need any extra locking to 1883 * keep the list safe as all manipulations are done under the AGI buffer lock. 1884 * Keeping the list up to date does not require memory allocation, just finding 1885 * the XFS inode and updating the next/prev unlinked list aginos. 1886 */ 1887 1888 /* 1889 * Find an inode on the unlinked list. This does not take references to the 1890 * inode as we have existence guarantees by holding the AGI buffer lock and that 1891 * only unlinked, referenced inodes can be on the unlinked inode list. If we 1892 * don't find the inode in cache, then let the caller handle the situation. 1893 */ 1894 static struct xfs_inode * 1895 xfs_iunlink_lookup( 1896 struct xfs_perag *pag, 1897 xfs_agino_t agino) 1898 { 1899 struct xfs_inode *ip; 1900 1901 rcu_read_lock(); 1902 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 1903 if (!ip) { 1904 /* Caller can handle inode not being in memory. */ 1905 rcu_read_unlock(); 1906 return NULL; 1907 } 1908 1909 /* 1910 * Inode in RCU freeing limbo should not happen. Warn about this and 1911 * let the caller handle the failure. 1912 */ 1913 if (WARN_ON_ONCE(!ip->i_ino)) { 1914 rcu_read_unlock(); 1915 return NULL; 1916 } 1917 ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)); 1918 rcu_read_unlock(); 1919 return ip; 1920 } 1921 1922 /* 1923 * Update the prev pointer of the next agino. Returns -ENOLINK if the inode 1924 * is not in cache. 1925 */ 1926 static int 1927 xfs_iunlink_update_backref( 1928 struct xfs_perag *pag, 1929 xfs_agino_t prev_agino, 1930 xfs_agino_t next_agino) 1931 { 1932 struct xfs_inode *ip; 1933 1934 /* No update necessary if we are at the end of the list. */ 1935 if (next_agino == NULLAGINO) 1936 return 0; 1937 1938 ip = xfs_iunlink_lookup(pag, next_agino); 1939 if (!ip) 1940 return -ENOLINK; 1941 1942 ip->i_prev_unlinked = prev_agino; 1943 return 0; 1944 } 1945 1946 /* 1947 * Point the AGI unlinked bucket at an inode and log the results. The caller 1948 * is responsible for validating the old value. 1949 */ 1950 STATIC int 1951 xfs_iunlink_update_bucket( 1952 struct xfs_trans *tp, 1953 struct xfs_perag *pag, 1954 struct xfs_buf *agibp, 1955 unsigned int bucket_index, 1956 xfs_agino_t new_agino) 1957 { 1958 struct xfs_agi *agi = agibp->b_addr; 1959 xfs_agino_t old_value; 1960 int offset; 1961 1962 ASSERT(xfs_verify_agino_or_null(pag, new_agino)); 1963 1964 old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); 1965 trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, 1966 old_value, new_agino); 1967 1968 /* 1969 * We should never find the head of the list already set to the value 1970 * passed in because either we're adding or removing ourselves from the 1971 * head of the list. 
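 * If we do find it, the AGI is corrupt: mark it sick and return
 * -EFSCORRUPTED rather than writing a loop into the bucket list.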
1972 */ 1973 if (old_value == new_agino) { 1974 xfs_buf_mark_corrupt(agibp); 1975 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 1976 return -EFSCORRUPTED; 1977 } 1978 1979 agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); 1980 offset = offsetof(struct xfs_agi, agi_unlinked) + 1981 (sizeof(xfs_agino_t) * bucket_index); 1982 xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); 1983 return 0; 1984 } 1985 1986 /* 1987 * Load the inode @next_agino into the cache and set its prev_unlinked pointer 1988 * to @prev_agino. Caller must hold the AGI to synchronize with other changes 1989 * to the unlinked list. 1990 */ 1991 STATIC int 1992 xfs_iunlink_reload_next( 1993 struct xfs_trans *tp, 1994 struct xfs_buf *agibp, 1995 xfs_agino_t prev_agino, 1996 xfs_agino_t next_agino) 1997 { 1998 struct xfs_perag *pag = agibp->b_pag; 1999 struct xfs_mount *mp = pag->pag_mount; 2000 struct xfs_inode *next_ip = NULL; 2001 xfs_ino_t ino; 2002 int error; 2003 2004 ASSERT(next_agino != NULLAGINO); 2005 2006 #ifdef DEBUG 2007 rcu_read_lock(); 2008 next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); 2009 ASSERT(next_ip == NULL); 2010 rcu_read_unlock(); 2011 #endif 2012 2013 xfs_info_ratelimited(mp, 2014 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", 2015 next_agino, pag->pag_agno); 2016 2017 /* 2018 * Use an untrusted lookup just to be cautious in case the AGI has been 2019 * corrupted and now points at a free inode. That shouldn't happen, 2020 * but we'd rather shut down now since we're already running in a weird 2021 * situation. 2022 */ 2023 ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); 2024 error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); 2025 if (error) { 2026 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2027 return error; 2028 } 2029 2030 /* If this is not an unlinked inode, something is very wrong. */ 2031 if (VFS_I(next_ip)->i_nlink != 0) { 2032 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2033 error = -EFSCORRUPTED; 2034 goto rele; 2035 } 2036 2037 next_ip->i_prev_unlinked = prev_agino; 2038 trace_xfs_iunlink_reload_next(next_ip); 2039 rele: 2040 ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); 2041 if (xfs_is_quotacheck_running(mp) && next_ip) 2042 xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); 2043 xfs_irele(next_ip); 2044 return error; 2045 } 2046 2047 static int 2048 xfs_iunlink_insert_inode( 2049 struct xfs_trans *tp, 2050 struct xfs_perag *pag, 2051 struct xfs_buf *agibp, 2052 struct xfs_inode *ip) 2053 { 2054 struct xfs_mount *mp = tp->t_mountp; 2055 struct xfs_agi *agi = agibp->b_addr; 2056 xfs_agino_t next_agino; 2057 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2058 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2059 int error; 2060 2061 /* 2062 * Get the index into the agi hash table for the list this inode will 2063 * go on. Make sure the pointer isn't garbage and that this inode 2064 * isn't already on the list. 2065 */ 2066 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2067 if (next_agino == agino || 2068 !xfs_verify_agino_or_null(pag, next_agino)) { 2069 xfs_buf_mark_corrupt(agibp); 2070 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2071 return -EFSCORRUPTED; 2072 } 2073 2074 /* 2075 * Update the prev pointer in the next inode to point back to this 2076 * inode. 
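 * If the next inode is not in memory (-ENOLINK), reload it from disk
 * first so that the in-core list stays complete.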
2077 */ 2078 error = xfs_iunlink_update_backref(pag, agino, next_agino); 2079 if (error == -ENOLINK) 2080 error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); 2081 if (error) 2082 return error; 2083 2084 if (next_agino != NULLAGINO) { 2085 /* 2086 * There is already another inode in the bucket, so point this 2087 * inode to the current head of the list. 2088 */ 2089 error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); 2090 if (error) 2091 return error; 2092 ip->i_next_unlinked = next_agino; 2093 } 2094 2095 /* Point the head of the list to point to this inode. */ 2096 ip->i_prev_unlinked = NULLAGINO; 2097 return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); 2098 } 2099 2100 /* 2101 * This is called when the inode's link count has gone to 0 or we are creating 2102 * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. 2103 * 2104 * We place the on-disk inode on a list in the AGI. It will be pulled from this 2105 * list when the inode is freed. 2106 */ 2107 STATIC int 2108 xfs_iunlink( 2109 struct xfs_trans *tp, 2110 struct xfs_inode *ip) 2111 { 2112 struct xfs_mount *mp = tp->t_mountp; 2113 struct xfs_perag *pag; 2114 struct xfs_buf *agibp; 2115 int error; 2116 2117 ASSERT(VFS_I(ip)->i_nlink == 0); 2118 ASSERT(VFS_I(ip)->i_mode != 0); 2119 trace_xfs_iunlink(ip); 2120 2121 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2122 2123 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2124 error = xfs_read_agi(pag, tp, &agibp); 2125 if (error) 2126 goto out; 2127 2128 error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); 2129 out: 2130 xfs_perag_put(pag); 2131 return error; 2132 } 2133 2134 static int 2135 xfs_iunlink_remove_inode( 2136 struct xfs_trans *tp, 2137 struct xfs_perag *pag, 2138 struct xfs_buf *agibp, 2139 struct xfs_inode *ip) 2140 { 2141 struct xfs_mount *mp = tp->t_mountp; 2142 struct xfs_agi *agi = agibp->b_addr; 2143 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2144 xfs_agino_t head_agino; 2145 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2146 int error; 2147 2148 trace_xfs_iunlink_remove(ip); 2149 2150 /* 2151 * Get the index into the agi hash table for the list this inode will 2152 * go on. Make sure the head pointer isn't garbage. 2153 */ 2154 head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2155 if (!xfs_verify_agino(pag, head_agino)) { 2156 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 2157 agi, sizeof(*agi)); 2158 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2159 return -EFSCORRUPTED; 2160 } 2161 2162 /* 2163 * Set our inode's next_unlinked pointer to NULL and then return 2164 * the old pointer value so that we can update whatever was previous 2165 * to us in the list to point to whatever was next in the list. 2166 */ 2167 error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); 2168 if (error) 2169 return error; 2170 2171 /* 2172 * Update the prev pointer in the next inode to point back to previous 2173 * inode in the chain. 
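 * As on insert, a cache miss (-ENOLINK) means the next inode has to be
 * reloaded from disk before its back pointer can be updated.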
2174 */ 2175 error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, 2176 ip->i_next_unlinked); 2177 if (error == -ENOLINK) 2178 error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, 2179 ip->i_next_unlinked); 2180 if (error) 2181 return error; 2182 2183 if (head_agino != agino) { 2184 struct xfs_inode *prev_ip; 2185 2186 prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); 2187 if (!prev_ip) { 2188 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 2189 return -EFSCORRUPTED; 2190 } 2191 2192 error = xfs_iunlink_log_inode(tp, prev_ip, pag, 2193 ip->i_next_unlinked); 2194 prev_ip->i_next_unlinked = ip->i_next_unlinked; 2195 } else { 2196 /* Point the head of the list to the next unlinked inode. */ 2197 error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, 2198 ip->i_next_unlinked); 2199 } 2200 2201 ip->i_next_unlinked = NULLAGINO; 2202 ip->i_prev_unlinked = 0; 2203 return error; 2204 } 2205 2206 /* 2207 * Pull the on-disk inode from the AGI unlinked list. 2208 */ 2209 STATIC int 2210 xfs_iunlink_remove( 2211 struct xfs_trans *tp, 2212 struct xfs_perag *pag, 2213 struct xfs_inode *ip) 2214 { 2215 struct xfs_buf *agibp; 2216 int error; 2217 2218 trace_xfs_iunlink_remove(ip); 2219 2220 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2221 error = xfs_read_agi(pag, tp, &agibp); 2222 if (error) 2223 return error; 2224 2225 return xfs_iunlink_remove_inode(tp, pag, agibp, ip); 2226 } 2227 2228 /* 2229 * Look up the inode number specified and if it is not already marked XFS_ISTALE 2230 * mark it stale. We should only find clean inodes in this lookup that aren't 2231 * already stale. 2232 */ 2233 static void 2234 xfs_ifree_mark_inode_stale( 2235 struct xfs_perag *pag, 2236 struct xfs_inode *free_ip, 2237 xfs_ino_t inum) 2238 { 2239 struct xfs_mount *mp = pag->pag_mount; 2240 struct xfs_inode_log_item *iip; 2241 struct xfs_inode *ip; 2242 2243 retry: 2244 rcu_read_lock(); 2245 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); 2246 2247 /* Inode not in memory, nothing to do */ 2248 if (!ip) { 2249 rcu_read_unlock(); 2250 return; 2251 } 2252 2253 /* 2254 * because this is an RCU protected lookup, we could find a recently 2255 * freed or even reallocated inode during the lookup. We need to check 2256 * under the i_flags_lock for a valid inode here. Skip it if it is not 2257 * valid, the wrong inode or stale. 2258 */ 2259 spin_lock(&ip->i_flags_lock); 2260 if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) 2261 goto out_iflags_unlock; 2262 2263 /* 2264 * Don't try to lock/unlock the current inode, but we _cannot_ skip the 2265 * other inodes that we did not find in the list attached to the buffer 2266 * and are not already marked stale. If we can't lock it, back off and 2267 * retry. 2268 */ 2269 if (ip != free_ip) { 2270 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2271 spin_unlock(&ip->i_flags_lock); 2272 rcu_read_unlock(); 2273 delay(1); 2274 goto retry; 2275 } 2276 } 2277 ip->i_flags |= XFS_ISTALE; 2278 2279 /* 2280 * If the inode is flushing, it is already attached to the buffer. All 2281 * we needed to do here is mark the inode stale so buffer IO completion 2282 * will remove it from the AIL. 2283 */ 2284 iip = ip->i_itemp; 2285 if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { 2286 ASSERT(!list_empty(&iip->ili_item.li_bio_list)); 2287 ASSERT(iip->ili_last_fields); 2288 goto out_iunlock; 2289 } 2290 2291 /* 2292 * Inodes not attached to the buffer can be released immediately. 
* Everything else has to go through xfs_iflush_abort() on journal 2294 * commit as the flock synchronises removal of the inode from the 2295 * cluster buffer against inode reclaim. 2296 */ 2297 if (!iip || list_empty(&iip->ili_item.li_bio_list)) 2298 goto out_iunlock; 2299 2300 __xfs_iflags_set(ip, XFS_IFLUSHING); 2301 spin_unlock(&ip->i_flags_lock); 2302 rcu_read_unlock(); 2303 2304 /* we have a dirty inode in memory that has not yet been flushed. */ 2305 spin_lock(&iip->ili_lock); 2306 iip->ili_last_fields = iip->ili_fields; 2307 iip->ili_fields = 0; 2308 iip->ili_fsync_fields = 0; 2309 spin_unlock(&iip->ili_lock); 2310 ASSERT(iip->ili_last_fields); 2311 2312 if (ip != free_ip) 2313 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2314 return; 2315 2316 out_iunlock: 2317 if (ip != free_ip) 2318 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2319 out_iflags_unlock: 2320 spin_unlock(&ip->i_flags_lock); 2321 rcu_read_unlock(); 2322 } 2323 2324 /* 2325 * A big issue when freeing the inode cluster is that we _cannot_ skip any 2326 * inodes that are in memory - they all must be marked stale and attached to 2327 * the cluster buffer. 2328 */ 2329 static int 2330 xfs_ifree_cluster( 2331 struct xfs_trans *tp, 2332 struct xfs_perag *pag, 2333 struct xfs_inode *free_ip, 2334 struct xfs_icluster *xic) 2335 { 2336 struct xfs_mount *mp = free_ip->i_mount; 2337 struct xfs_ino_geometry *igeo = M_IGEO(mp); 2338 struct xfs_buf *bp; 2339 xfs_daddr_t blkno; 2340 xfs_ino_t inum = xic->first_ino; 2341 int nbufs; 2342 int i, j; 2343 int ioffset; 2344 int error; 2345 2346 nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; 2347 2348 for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { 2349 /* 2350 * The allocation bitmap tells us which inodes of the chunk were 2351 * physically allocated. Skip the cluster if an inode falls into 2352 * a sparse region. 2353 */ 2354 ioffset = inum - xic->first_ino; 2355 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { 2356 ASSERT(ioffset % igeo->inodes_per_cluster == 0); 2357 continue; 2358 } 2359 2360 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2361 XFS_INO_TO_AGBNO(mp, inum)); 2362 2363 /* 2364 * We obtain and lock the backing buffer first in the process 2365 * here to ensure dirty inodes attached to the buffer remain in 2366 * the flushing state while we mark them stale. 2367 * 2368 * If we scan the in-memory inodes first, then buffer IO can 2369 * complete before we get a lock on it, and hence we may fail 2370 * to mark all the active inodes on the buffer stale. 2371 */ 2372 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2373 mp->m_bsize * igeo->blocks_per_cluster, 2374 XBF_UNMAPPED, &bp); 2375 if (error) 2376 return error; 2377 2378 /* 2379 * This buffer may not have been correctly initialised as we 2380 * didn't read it from disk. That's not important because we are 2381 * only using it to mark the buffer as stale in the log, and to 2382 * attach stale cached inodes on it. That means it will never be 2383 * dispatched for IO. If it is, we want to know about it, and we 2384 * want it to fail. We can achieve this by adding a write 2385 * verifier to the buffer. 2386 */ 2387 bp->b_ops = &xfs_inode_buf_ops; 2388 2389 /* 2390 * Now we need to set all the cached clean inodes as XFS_ISTALE, 2391 * too. This requires lookups, and will skip inodes that we've 2392 * already marked XFS_ISTALE.
2393 */ 2394 for (i = 0; i < igeo->inodes_per_cluster; i++) 2395 xfs_ifree_mark_inode_stale(pag, free_ip, inum + i); 2396 2397 xfs_trans_stale_inode_buf(tp, bp); 2398 xfs_trans_binval(tp, bp); 2399 } 2400 return 0; 2401 } 2402 2403 /* 2404 * This is called to return an inode to the inode free list. The inode should 2405 * already be truncated to 0 length and have no pages associated with it. This 2406 * routine also assumes that the inode is already a part of the transaction. 2407 * 2408 * The on-disk copy of the inode will have been added to the list of unlinked 2409 * inodes in the AGI. We need to remove the inode from that list atomically with 2410 * respect to freeing it here. 2411 */ 2412 int 2413 xfs_ifree( 2414 struct xfs_trans *tp, 2415 struct xfs_inode *ip) 2416 { 2417 struct xfs_mount *mp = ip->i_mount; 2418 struct xfs_perag *pag; 2419 struct xfs_icluster xic = { 0 }; 2420 struct xfs_inode_log_item *iip = ip->i_itemp; 2421 int error; 2422 2423 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); 2424 ASSERT(VFS_I(ip)->i_nlink == 0); 2425 ASSERT(ip->i_df.if_nextents == 0); 2426 ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); 2427 ASSERT(ip->i_nblocks == 0); 2428 2429 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2430 2431 /* 2432 * Free the inode first so that we guarantee that the AGI lock is going 2433 * to be taken before we remove the inode from the unlinked list. This 2434 * makes the AGI lock -> unlinked list modification order the same as 2435 * used in O_TMPFILE creation. 2436 */ 2437 error = xfs_difree(tp, pag, ip->i_ino, &xic); 2438 if (error) 2439 goto out; 2440 2441 error = xfs_iunlink_remove(tp, pag, ip); 2442 if (error) 2443 goto out; 2444 2445 /* 2446 * Free any local-format data sitting around before we reset the 2447 * data fork to extents format. Note that the attr fork data has 2448 * already been freed by xfs_attr_inactive. 2449 */ 2450 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { 2451 kfree(ip->i_df.if_data); 2452 ip->i_df.if_data = NULL; 2453 ip->i_df.if_bytes = 0; 2454 } 2455 2456 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ 2457 ip->i_diflags = 0; 2458 ip->i_diflags2 = mp->m_ino_geo.new_diflags2; 2459 ip->i_forkoff = 0; /* mark the attr fork not in use */ 2460 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; 2461 if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) 2462 xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS); 2463 2464 /* Don't attempt to replay owner changes for a deleted inode */ 2465 spin_lock(&iip->ili_lock); 2466 iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); 2467 spin_unlock(&iip->ili_lock); 2468 2469 /* 2470 * Bump the generation count so no one will be confused 2471 * by reincarnations of this inode. 2472 */ 2473 VFS_I(ip)->i_generation++; 2474 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2475 2476 if (xic.deleted) 2477 error = xfs_ifree_cluster(tp, pag, ip, &xic); 2478 out: 2479 xfs_perag_put(pag); 2480 return error; 2481 } 2482 2483 /* 2484 * This is called to unpin an inode. The caller must have the inode locked 2485 * in at least shared mode so that the buffer cannot be subsequently pinned 2486 * once someone is waiting for it to be unpinned. 
2487 */ 2488 static void 2489 xfs_iunpin( 2490 struct xfs_inode *ip) 2491 { 2492 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); 2493 2494 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2495 2496 /* Give the log a push to start the unpinning I/O */ 2497 xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); 2498 2499 } 2500 2501 static void 2502 __xfs_iunpin_wait( 2503 struct xfs_inode *ip) 2504 { 2505 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); 2506 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); 2507 2508 xfs_iunpin(ip); 2509 2510 do { 2511 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 2512 if (xfs_ipincount(ip)) 2513 io_schedule(); 2514 } while (xfs_ipincount(ip)); 2515 finish_wait(wq, &wait.wq_entry); 2516 } 2517 2518 void 2519 xfs_iunpin_wait( 2520 struct xfs_inode *ip) 2521 { 2522 if (xfs_ipincount(ip)) 2523 __xfs_iunpin_wait(ip); 2524 } 2525 2526 /* 2527 * Removing an inode from the namespace involves removing the directory entry 2528 * and dropping the link count on the inode. Removing the directory entry can 2529 * result in locking an AGF (directory blocks were freed) and removing a link 2530 * count can result in placing the inode on an unlinked list which results in 2531 * locking an AGI. 2532 * 2533 * The big problem here is that we have an ordering constraint on AGF and AGI 2534 * locking - inode allocation locks the AGI, then can allocate a new extent for 2535 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode 2536 * removes the inode from the unlinked list, requiring that we lock the AGI 2537 * first, and then freeing the inode can result in an inode chunk being freed 2538 * and hence freeing disk space requiring that we lock an AGF. 2539 * 2540 * Hence the ordering that is imposed by other parts of the code is AGI before 2541 * AGF. This means we cannot remove the directory entry before we drop the inode 2542 * reference count and put it on the unlinked list as this results in a lock 2543 * order of AGF then AGI, and this can deadlock against inode allocation and 2544 * freeing. Therefore we must drop the link counts before we remove the 2545 * directory entry. 2546 * 2547 * This is still safe from a transactional point of view - it is not until we 2548 * get to xfs_defer_finish() that we have the possibility of multiple 2549 * transactions in this operation. Hence as long as we remove the directory 2550 * entry and drop the link count in the first transaction of the remove 2551 * operation, there are no transactional constraints on the ordering here. 2552 */ 2553 int 2554 xfs_remove( 2555 xfs_inode_t *dp, 2556 struct xfs_name *name, 2557 xfs_inode_t *ip) 2558 { 2559 xfs_mount_t *mp = dp->i_mount; 2560 xfs_trans_t *tp = NULL; 2561 int is_dir = S_ISDIR(VFS_I(ip)->i_mode); 2562 int dontcare; 2563 int error = 0; 2564 uint resblks; 2565 2566 trace_xfs_remove(dp, name); 2567 2568 if (xfs_is_shutdown(mp)) 2569 return -EIO; 2570 if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) 2571 return -EIO; 2572 2573 error = xfs_qm_dqattach(dp); 2574 if (error) 2575 goto std_return; 2576 2577 error = xfs_qm_dqattach(ip); 2578 if (error) 2579 goto std_return; 2580 2581 /* 2582 * We try to get the real space reservation first, allowing for 2583 * directory btree deletion(s) implying possible bmap insert(s). 
If we 2584 * can't get the space reservation then we use 0 instead, and avoid the 2585 * bmap btree insert(s) in the directory code by, if the bmap insert 2586 * tries to happen, instead trimming the LAST block from the directory. 2587 * 2588 * Ignore EDQUOT and ENOSPC being returned via nospace_error because 2589 * the directory code can handle a reservationless update and we don't 2590 * want to prevent a user from trying to free space by deleting things. 2591 */ 2592 resblks = XFS_REMOVE_SPACE_RES(mp); 2593 error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, 2594 &tp, &dontcare); 2595 if (error) { 2596 ASSERT(error != -ENOSPC); 2597 goto std_return; 2598 } 2599 2600 /* 2601 * If we're removing a directory perform some additional validation. 2602 */ 2603 if (is_dir) { 2604 ASSERT(VFS_I(ip)->i_nlink >= 2); 2605 if (VFS_I(ip)->i_nlink != 2) { 2606 error = -ENOTEMPTY; 2607 goto out_trans_cancel; 2608 } 2609 if (!xfs_dir_isempty(ip)) { 2610 error = -ENOTEMPTY; 2611 goto out_trans_cancel; 2612 } 2613 2614 /* Drop the link from ip's "..". */ 2615 error = xfs_droplink(tp, dp); 2616 if (error) 2617 goto out_trans_cancel; 2618 2619 /* Drop the "." link from ip to self. */ 2620 error = xfs_droplink(tp, ip); 2621 if (error) 2622 goto out_trans_cancel; 2623 2624 /* 2625 * Point the unlinked child directory's ".." entry to the root 2626 * directory to eliminate back-references to inodes that may 2627 * get freed before the child directory is closed. If the fs 2628 * gets shrunk, this can lead to dirent inode validation errors. 2629 */ 2630 if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { 2631 error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, 2632 tp->t_mountp->m_sb.sb_rootino, 0); 2633 if (error) 2634 goto out_trans_cancel; 2635 } 2636 } else { 2637 /* 2638 * When removing a non-directory we need to log the parent 2639 * inode here. For a directory this is done implicitly 2640 * by the xfs_droplink call for the ".." entry. 2641 */ 2642 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2643 } 2644 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2645 2646 /* Drop the link from dp to ip. */ 2647 error = xfs_droplink(tp, ip); 2648 if (error) 2649 goto out_trans_cancel; 2650 2651 error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); 2652 if (error) { 2653 ASSERT(error != -ENOENT); 2654 goto out_trans_cancel; 2655 } 2656 2657 /* 2658 * Drop the link from dp to ip, and if ip was a directory, remove the 2659 * '.' and '..' references since we freed the directory. 2660 */ 2661 xfs_dir_update_hook(dp, ip, -1, name); 2662 2663 /* 2664 * If this is a synchronous mount, make sure that the 2665 * remove transaction goes to disk before returning to 2666 * the user. 2667 */ 2668 if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) 2669 xfs_trans_set_sync(tp); 2670 2671 error = xfs_trans_commit(tp); 2672 if (error) 2673 goto std_return; 2674 2675 if (is_dir && xfs_inode_is_filestream(ip)) 2676 xfs_filestream_deassociate(ip); 2677 2678 return 0; 2679 2680 out_trans_cancel: 2681 xfs_trans_cancel(tp); 2682 std_return: 2683 return error; 2684 } 2685 2686 /* 2687 * Enter all inodes for a rename transaction into a sorted array. 
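 * Sorting by inode number gives every rename the same lock acquisition
 * order, which avoids ABBA deadlocks between concurrent renames.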
2688 */ 2689 #define __XFS_SORT_INODES 5 2690 STATIC void 2691 xfs_sort_for_rename( 2692 struct xfs_inode *dp1, /* in: old (source) directory inode */ 2693 struct xfs_inode *dp2, /* in: new (target) directory inode */ 2694 struct xfs_inode *ip1, /* in: inode of old entry */ 2695 struct xfs_inode *ip2, /* in: inode of new entry */ 2696 struct xfs_inode *wip, /* in: whiteout inode */ 2697 struct xfs_inode **i_tab,/* out: sorted array of inodes */ 2698 int *num_inodes) /* in/out: inodes in array */ 2699 { 2700 int i, j; 2701 2702 ASSERT(*num_inodes == __XFS_SORT_INODES); 2703 memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); 2704 2705 /* 2706 * i_tab contains a list of pointers to inodes. We initialize 2707 * the table here & we'll sort it. We will then use it to 2708 * order the acquisition of the inode locks. 2709 * 2710 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2711 */ 2712 i = 0; 2713 i_tab[i++] = dp1; 2714 i_tab[i++] = dp2; 2715 i_tab[i++] = ip1; 2716 if (ip2) 2717 i_tab[i++] = ip2; 2718 if (wip) 2719 i_tab[i++] = wip; 2720 *num_inodes = i; 2721 2722 /* 2723 * Sort the elements via bubble sort. (Remember, there are at 2724 * most 5 elements to sort, so this is adequate.) 2725 */ 2726 for (i = 0; i < *num_inodes; i++) { 2727 for (j = 1; j < *num_inodes; j++) { 2728 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { 2729 struct xfs_inode *temp = i_tab[j]; 2730 i_tab[j] = i_tab[j-1]; 2731 i_tab[j-1] = temp; 2732 } 2733 } 2734 } 2735 } 2736 2737 static int 2738 xfs_finish_rename( 2739 struct xfs_trans *tp) 2740 { 2741 /* 2742 * If this is a synchronous mount, make sure that the rename transaction 2743 * goes to disk before returning to the user. 2744 */ 2745 if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) 2746 xfs_trans_set_sync(tp); 2747 2748 return xfs_trans_commit(tp); 2749 } 2750 2751 /* 2752 * xfs_cross_rename() 2753 * 2754 * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall 2755 */ 2756 STATIC int 2757 xfs_cross_rename( 2758 struct xfs_trans *tp, 2759 struct xfs_inode *dp1, 2760 struct xfs_name *name1, 2761 struct xfs_inode *ip1, 2762 struct xfs_inode *dp2, 2763 struct xfs_name *name2, 2764 struct xfs_inode *ip2, 2765 int spaceres) 2766 { 2767 int error = 0; 2768 int ip1_flags = 0; 2769 int ip2_flags = 0; 2770 int dp2_flags = 0; 2771 2772 /* Swap inode number for dirent in first parent */ 2773 error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); 2774 if (error) 2775 goto out_trans_abort; 2776 2777 /* Swap inode number for dirent in second parent */ 2778 error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); 2779 if (error) 2780 goto out_trans_abort; 2781 2782 /* 2783 * If we're renaming one or more directories across different parents, 2784 * update the respective ".." entries (and link counts) to match the new 2785 * parents. 2786 */ 2787 if (dp1 != dp2) { 2788 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2789 2790 if (S_ISDIR(VFS_I(ip2)->i_mode)) { 2791 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, 2792 dp1->i_ino, spaceres); 2793 if (error) 2794 goto out_trans_abort; 2795 2796 /* transfer ip2 ".." 
reference to dp1 */ 2797 if (!S_ISDIR(VFS_I(ip1)->i_mode)) { 2798 error = xfs_droplink(tp, dp2); 2799 if (error) 2800 goto out_trans_abort; 2801 xfs_bumplink(tp, dp1); 2802 } 2803 2804 /* 2805 * Although ip1 isn't changed here, userspace needs 2806 * to be notified of the change, so that applications 2807 * relying on it (like backup tools) will properly 2808 * detect the change. 2809 */ 2810 ip1_flags |= XFS_ICHGTIME_CHG; 2811 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2812 } 2813 2814 if (S_ISDIR(VFS_I(ip1)->i_mode)) { 2815 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, 2816 dp2->i_ino, spaceres); 2817 if (error) 2818 goto out_trans_abort; 2819 2820 /* transfer ip1 ".." reference to dp2 */ 2821 if (!S_ISDIR(VFS_I(ip2)->i_mode)) { 2822 error = xfs_droplink(tp, dp1); 2823 if (error) 2824 goto out_trans_abort; 2825 xfs_bumplink(tp, dp2); 2826 } 2827 2828 /* 2829 * Although ip2 isn't changed here, userspace needs 2830 * to be notified of the change, so that applications 2831 * relying on it (like backup tools) will properly 2832 * detect the change. 2833 */ 2834 ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2835 ip2_flags |= XFS_ICHGTIME_CHG; 2836 } 2837 } 2838 2839 if (ip1_flags) { 2840 xfs_trans_ichgtime(tp, ip1, ip1_flags); 2841 xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); 2842 } 2843 if (ip2_flags) { 2844 xfs_trans_ichgtime(tp, ip2, ip2_flags); 2845 xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); 2846 } 2847 if (dp2_flags) { 2848 xfs_trans_ichgtime(tp, dp2, dp2_flags); 2849 xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); 2850 } 2851 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2852 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); 2853 2854 /* 2855 * Inform our hook clients that we've finished an exchange operation as 2856 * follows: removed the source and target files from their directories; 2857 * added the target to the source directory; and added the source to 2858 * the target directory. All inodes are locked, so it's ok to model a 2859 * rename this way so long as we say we deleted entries before we add 2860 * new ones. 2861 */ 2862 xfs_dir_update_hook(dp1, ip1, -1, name1); 2863 xfs_dir_update_hook(dp2, ip2, -1, name2); 2864 xfs_dir_update_hook(dp1, ip2, 1, name1); 2865 xfs_dir_update_hook(dp2, ip1, 1, name2); 2866 2867 return xfs_finish_rename(tp); 2868 2869 out_trans_abort: 2870 xfs_trans_cancel(tp); 2871 return error; 2872 } 2873 2874 /* 2875 * xfs_rename_alloc_whiteout() 2876 * 2877 * Return a referenced, unlinked, unlocked inode that can be used as a 2878 * whiteout in a rename transaction. We use a tmpfile inode here so that if we 2879 * crash between allocating the inode and linking it into the rename transaction, 2880 * recovery will free the inode and we won't leak it. 2881 */ 2882 static int 2883 xfs_rename_alloc_whiteout( 2884 struct mnt_idmap *idmap, 2885 struct xfs_name *src_name, 2886 struct xfs_inode *dp, 2887 struct xfs_inode **wip) 2888 { 2889 struct xfs_inode *tmpfile; 2890 struct qstr name; 2891 int error; 2892 2893 error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, 2894 &tmpfile); 2895 if (error) 2896 return error; 2897 2898 name.name = src_name->name; 2899 name.len = src_name->len; 2900 error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name); 2901 if (error) { 2902 xfs_finish_inode_setup(tmpfile); 2903 xfs_irele(tmpfile); 2904 return error; 2905 } 2906 2907 /* 2908 * Prepare the tmpfile inode as if it were created through the VFS. 2909 * Complete the inode setup and flag it as linkable.
nlink is already 2910 * zero, so we can skip the drop_nlink. 2911 */ 2912 xfs_setup_iops(tmpfile); 2913 xfs_finish_inode_setup(tmpfile); 2914 VFS_I(tmpfile)->i_state |= I_LINKABLE; 2915 2916 *wip = tmpfile; 2917 return 0; 2918 } 2919 2920 /* 2921 * xfs_rename 2922 */ 2923 int 2924 xfs_rename( 2925 struct mnt_idmap *idmap, 2926 struct xfs_inode *src_dp, 2927 struct xfs_name *src_name, 2928 struct xfs_inode *src_ip, 2929 struct xfs_inode *target_dp, 2930 struct xfs_name *target_name, 2931 struct xfs_inode *target_ip, 2932 unsigned int flags) 2933 { 2934 struct xfs_mount *mp = src_dp->i_mount; 2935 struct xfs_trans *tp; 2936 struct xfs_inode *wip = NULL; /* whiteout inode */ 2937 struct xfs_inode *inodes[__XFS_SORT_INODES]; 2938 int i; 2939 int num_inodes = __XFS_SORT_INODES; 2940 bool new_parent = (src_dp != target_dp); 2941 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); 2942 int spaceres; 2943 bool retried = false; 2944 int error, nospace_error = 0; 2945 2946 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 2947 2948 if ((flags & RENAME_EXCHANGE) && !target_ip) 2949 return -EINVAL; 2950 2951 /* 2952 * If we are doing a whiteout operation, allocate the whiteout inode 2953 * we will be placing at the target and ensure the type is set 2954 * appropriately. 2955 */ 2956 if (flags & RENAME_WHITEOUT) { 2957 error = xfs_rename_alloc_whiteout(idmap, src_name, 2958 target_dp, &wip); 2959 if (error) 2960 return error; 2961 2962 /* setup target dirent info as whiteout */ 2963 src_name->type = XFS_DIR3_FT_CHRDEV; 2964 } 2965 2966 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, 2967 inodes, &num_inodes); 2968 2969 retry: 2970 nospace_error = 0; 2971 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 2972 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); 2973 if (error == -ENOSPC) { 2974 nospace_error = error; 2975 spaceres = 0; 2976 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, 2977 &tp); 2978 } 2979 if (error) 2980 goto out_release_wip; 2981 2982 /* 2983 * Attach the dquots to the inodes 2984 */ 2985 error = xfs_qm_vop_rename_dqattach(inodes); 2986 if (error) 2987 goto out_trans_cancel; 2988 2989 /* 2990 * Lock all the participating inodes. Depending upon whether 2991 * the target_name exists in the target directory, and 2992 * whether the target directory is the same as the source 2993 * directory, we can lock from 2 to 5 inodes. 2994 */ 2995 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 2996 2997 /* 2998 * Join all the inodes to the transaction. From this point on, 2999 * we can rely on either trans_commit or trans_cancel to unlock 3000 * them. 3001 */ 3002 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 3003 if (new_parent) 3004 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 3005 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 3006 if (target_ip) 3007 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 3008 if (wip) 3009 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); 3010 3011 /* 3012 * If we are using project inheritance, we only allow renames 3013 * into our tree when the project IDs are the same; else the 3014 * tree quota mechanism would be circumvented. 3015 */ 3016 if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && 3017 target_dp->i_projid != src_ip->i_projid)) { 3018 error = -EXDEV; 3019 goto out_trans_cancel; 3020 } 3021 3022 /* RENAME_EXCHANGE is unique from here on. 
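 * xfs_cross_rename() commits or cancels the transaction itself, so its
 * result can be returned directly.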
*/ 3023 if (flags & RENAME_EXCHANGE) 3024 return xfs_cross_rename(tp, src_dp, src_name, src_ip, 3025 target_dp, target_name, target_ip, 3026 spaceres); 3027 3028 /* 3029 * Try to reserve quota to handle an expansion of the target directory. 3030 * We'll allow the rename to continue in reservationless mode if we hit 3031 * a space usage constraint. If we trigger reservationless mode, save 3032 * the errno if there isn't any free space in the target directory. 3033 */ 3034 if (spaceres != 0) { 3035 error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres, 3036 0, false); 3037 if (error == -EDQUOT || error == -ENOSPC) { 3038 if (!retried) { 3039 xfs_trans_cancel(tp); 3040 xfs_blockgc_free_quota(target_dp, 0); 3041 retried = true; 3042 goto retry; 3043 } 3044 3045 nospace_error = error; 3046 spaceres = 0; 3047 error = 0; 3048 } 3049 if (error) 3050 goto out_trans_cancel; 3051 } 3052 3053 /* 3054 * Check for expected errors before we dirty the transaction 3055 * so we can return an error without a transaction abort. 3056 */ 3057 if (target_ip == NULL) { 3058 /* 3059 * If there's no space reservation, check the entry will 3060 * fit before actually inserting it. 3061 */ 3062 if (!spaceres) { 3063 error = xfs_dir_canenter(tp, target_dp, target_name); 3064 if (error) 3065 goto out_trans_cancel; 3066 } 3067 } else { 3068 /* 3069 * If target exists and it's a directory, check whether 3070 * it can be destroyed. 3071 */ 3072 if (S_ISDIR(VFS_I(target_ip)->i_mode) && 3073 (!xfs_dir_isempty(target_ip) || 3074 (VFS_I(target_ip)->i_nlink > 2))) { 3075 error = -EEXIST; 3076 goto out_trans_cancel; 3077 } 3078 } 3079 3080 /* 3081 * Lock the AGI buffers we need to handle bumping the nlink of the 3082 * whiteout inode off the unlinked list and to handle dropping the 3083 * nlink of the target inode. Per locking order rules, do this in 3084 * increasing AG order and before directory block allocation tries to 3085 * grab AGFs because we grab AGIs before AGFs. 3086 * 3087 * The (vfs) caller must ensure that if src is a directory then 3088 * target_ip is either null or an empty directory. 3089 */ 3090 for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { 3091 if (inodes[i] == wip || 3092 (inodes[i] == target_ip && 3093 (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { 3094 struct xfs_perag *pag; 3095 struct xfs_buf *bp; 3096 3097 pag = xfs_perag_get(mp, 3098 XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); 3099 error = xfs_read_agi(pag, tp, &bp); 3100 xfs_perag_put(pag); 3101 if (error) 3102 goto out_trans_cancel; 3103 } 3104 } 3105 3106 /* 3107 * Directory entry creation below may acquire the AGF. Remove 3108 * the whiteout from the unlinked list first to preserve correct 3109 * AGI/AGF locking order. This dirties the transaction so failures 3110 * after this point will abort and log recovery will clean up the 3111 * mess. 3112 * 3113 * For whiteouts, we need to bump the link count on the whiteout 3114 * inode. After this point, we have a real link, so clear the tmpfile 3115 * state flag from the inode so it doesn't accidentally get misused 3116 * in future. 3117 */ 3118 if (wip) { 3119 struct xfs_perag *pag; 3120 3121 ASSERT(VFS_I(wip)->i_nlink == 0); 3122 3123 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino)); 3124 error = xfs_iunlink_remove(tp, pag, wip); 3125 xfs_perag_put(pag); 3126 if (error) 3127 goto out_trans_cancel; 3128 3129 xfs_bumplink(tp, wip); 3130 VFS_I(wip)->i_state &= ~I_LINKABLE; 3131 } 3132 3133 /* 3134 * Set up the target.
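 * There are two cases: either the target name does not exist yet and a
 * new directory entry is created for it, or it does exist and the
 * existing entry is replaced, with the old target losing a link.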
3135 */ 3136 if (target_ip == NULL) { 3137 /* 3138 * If target does not exist and the rename crosses 3139 * directories, adjust the target directory link count 3140 * to account for the ".." reference from the new entry. 3141 */ 3142 error = xfs_dir_createname(tp, target_dp, target_name, 3143 src_ip->i_ino, spaceres); 3144 if (error) 3145 goto out_trans_cancel; 3146 3147 xfs_trans_ichgtime(tp, target_dp, 3148 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3149 3150 if (new_parent && src_is_directory) { 3151 xfs_bumplink(tp, target_dp); 3152 } 3153 } else { /* target_ip != NULL */ 3154 /* 3155 * Link the source inode under the target name. 3156 * If the source inode is a directory and we are moving 3157 * it across directories, its ".." entry will be 3158 * inconsistent until we replace that down below. 3159 * 3160 * In case there is already an entry with the same 3161 * name at the destination directory, remove it first. 3162 */ 3163 error = xfs_dir_replace(tp, target_dp, target_name, 3164 src_ip->i_ino, spaceres); 3165 if (error) 3166 goto out_trans_cancel; 3167 3168 xfs_trans_ichgtime(tp, target_dp, 3169 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3170 3171 /* 3172 * Decrement the link count on the target since the target 3173 * dir no longer points to it. 3174 */ 3175 error = xfs_droplink(tp, target_ip); 3176 if (error) 3177 goto out_trans_cancel; 3178 3179 if (src_is_directory) { 3180 /* 3181 * Drop the link from the old "." entry. 3182 */ 3183 error = xfs_droplink(tp, target_ip); 3184 if (error) 3185 goto out_trans_cancel; 3186 } 3187 } /* target_ip != NULL */ 3188 3189 /* 3190 * Remove the source. 3191 */ 3192 if (new_parent && src_is_directory) { 3193 /* 3194 * Rewrite the ".." entry to point to the new 3195 * directory. 3196 */ 3197 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, 3198 target_dp->i_ino, spaceres); 3199 ASSERT(error != -EEXIST); 3200 if (error) 3201 goto out_trans_cancel; 3202 } 3203 3204 /* 3205 * We always want to hit the ctime on the source inode. 3206 * 3207 * This isn't strictly required by the standards since the source 3208 * inode isn't really being changed, but old unix file systems did 3209 * it and some incremental backup programs won't work without it. 3210 */ 3211 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 3212 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); 3213 3214 /* 3215 * Adjust the link count on src_dp. This is necessary when 3216 * renaming a directory, either within one parent when 3217 * the target existed, or across two parent directories. 3218 */ 3219 if (src_is_directory && (new_parent || target_ip != NULL)) { 3220 3221 /* 3222 * Decrement link count on src_directory since the 3223 * entry that's moved no longer points to it. 3224 */ 3225 error = xfs_droplink(tp, src_dp); 3226 if (error) 3227 goto out_trans_cancel; 3228 } 3229 3230 /* 3231 * For whiteouts, we only need to update the source dirent with the 3232 * inode number of the whiteout inode rather than removing it 3233 * altogether. 
3234 */ 3235 if (wip) 3236 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, 3237 spaceres); 3238 else 3239 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 3240 spaceres); 3241 3242 if (error) 3243 goto out_trans_cancel; 3244 3245 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3246 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3247 if (new_parent) 3248 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 3249 3250 /* 3251 * Inform our hook clients that we've finished a rename operation as 3252 * follows: removed the source and target files from their directories; 3253 * that we've added the source to the target directory; and finally 3254 * that we've added the whiteout, if there was one. All inodes are 3255 * locked, so it's ok to model a rename this way so long as we say we 3256 * deleted entries before we add new ones. 3257 */ 3258 if (target_ip) 3259 xfs_dir_update_hook(target_dp, target_ip, -1, target_name); 3260 xfs_dir_update_hook(src_dp, src_ip, -1, src_name); 3261 xfs_dir_update_hook(target_dp, src_ip, 1, target_name); 3262 if (wip) 3263 xfs_dir_update_hook(src_dp, wip, 1, src_name); 3264 3265 error = xfs_finish_rename(tp); 3266 if (wip) 3267 xfs_irele(wip); 3268 return error; 3269 3270 out_trans_cancel: 3271 xfs_trans_cancel(tp); 3272 out_release_wip: 3273 if (wip) 3274 xfs_irele(wip); 3275 if (error == -ENOSPC && nospace_error) 3276 error = nospace_error; 3277 return error; 3278 } 3279 3280 static int 3281 xfs_iflush( 3282 struct xfs_inode *ip, 3283 struct xfs_buf *bp) 3284 { 3285 struct xfs_inode_log_item *iip = ip->i_itemp; 3286 struct xfs_dinode *dip; 3287 struct xfs_mount *mp = ip->i_mount; 3288 int error; 3289 3290 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); 3291 ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); 3292 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || 3293 ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 3294 ASSERT(iip->ili_item.li_buf == bp); 3295 3296 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); 3297 3298 /* 3299 * We don't flush the inode if any of the following checks fail, but we 3300 * do still update the log item and attach to the backing buffer as if 3301 * the flush happened. This is a formality to facilitate predictable 3302 * error handling as the caller will shutdown and fail the buffer. 
3303 */ 3304 error = -EFSCORRUPTED; 3305 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), 3306 mp, XFS_ERRTAG_IFLUSH_1)) { 3307 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3308 "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, 3309 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 3310 goto flush_out; 3311 } 3312 if (S_ISREG(VFS_I(ip)->i_mode)) { 3313 if (XFS_TEST_ERROR( 3314 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3315 ip->i_df.if_format != XFS_DINODE_FMT_BTREE, 3316 mp, XFS_ERRTAG_IFLUSH_3)) { 3317 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3318 "%s: Bad regular inode %llu, ptr "PTR_FMT, 3319 __func__, ip->i_ino, ip); 3320 goto flush_out; 3321 } 3322 } else if (S_ISDIR(VFS_I(ip)->i_mode)) { 3323 if (XFS_TEST_ERROR( 3324 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3325 ip->i_df.if_format != XFS_DINODE_FMT_BTREE && 3326 ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, 3327 mp, XFS_ERRTAG_IFLUSH_4)) { 3328 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3329 "%s: Bad directory inode %llu, ptr "PTR_FMT, 3330 __func__, ip->i_ino, ip); 3331 goto flush_out; 3332 } 3333 } 3334 if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > 3335 ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { 3336 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3337 "%s: detected corrupt incore inode %llu, " 3338 "total extents = %llu nblocks = %lld, ptr "PTR_FMT, 3339 __func__, ip->i_ino, 3340 ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af), 3341 ip->i_nblocks, ip); 3342 goto flush_out; 3343 } 3344 if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, 3345 mp, XFS_ERRTAG_IFLUSH_6)) { 3346 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3347 "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, 3348 __func__, ip->i_ino, ip->i_forkoff, ip); 3349 goto flush_out; 3350 } 3351 3352 /* 3353 * Inode item log recovery for v2 inodes are dependent on the flushiter 3354 * count for correct sequencing. We bump the flush iteration count so 3355 * we can detect flushes which postdate a log record during recovery. 3356 * This is redundant as we now log every change and hence this can't 3357 * happen but we need to still do it to ensure backwards compatibility 3358 * with old kernels that predate logging all inode changes. 3359 */ 3360 if (!xfs_has_v3inodes(mp)) 3361 ip->i_flushiter++; 3362 3363 /* 3364 * If there are inline format data / attr forks attached to this inode, 3365 * make sure they are not corrupt. 3366 */ 3367 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && 3368 xfs_ifork_verify_local_data(ip)) 3369 goto flush_out; 3370 if (xfs_inode_has_attr_fork(ip) && 3371 ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && 3372 xfs_ifork_verify_local_attr(ip)) 3373 goto flush_out; 3374 3375 /* 3376 * Copy the dirty parts of the inode into the on-disk inode. We always 3377 * copy out the core of the inode, because if the inode is dirty at all 3378 * the core must be. 3379 */ 3380 xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); 3381 3382 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3383 if (!xfs_has_v3inodes(mp)) { 3384 if (ip->i_flushiter == DI_MAX_FLUSH) 3385 ip->i_flushiter = 0; 3386 } 3387 3388 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); 3389 if (xfs_inode_has_attr_fork(ip)) 3390 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); 3391 3392 /* 3393 * We've recorded everything logged in the inode, so we'd like to clear 3394 * the ili_fields bits so we don't log and flush things unnecessarily. 3395 * However, we can't stop logging all this information until the data 3396 * we've copied into the disk buffer is written to disk. 
If we did we 3397 * might overwrite the copy of the inode in the log with all the data 3398 * after re-logging only part of it, and in the face of a crash we 3399 * wouldn't have all the data we need to recover. 3400 * 3401 * What we do is move the bits to the ili_last_fields field. When 3402 * logging the inode, these bits are moved back to the ili_fields field. 3403 * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since 3404 * we know that the information those bits represent is permanently on 3405 * disk. As long as the flush completes before the inode is logged 3406 * again, then both ili_fields and ili_last_fields will be cleared. 3407 */ 3408 error = 0; 3409 flush_out: 3410 spin_lock(&iip->ili_lock); 3411 iip->ili_last_fields = iip->ili_fields; 3412 iip->ili_fields = 0; 3413 iip->ili_fsync_fields = 0; 3414 spin_unlock(&iip->ili_lock); 3415 3416 /* 3417 * Store the current LSN of the inode so that we can tell whether the 3418 * item has moved in the AIL from xfs_buf_inode_iodone(). 3419 */ 3420 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3421 &iip->ili_item.li_lsn); 3422 3423 /* generate the checksum. */ 3424 xfs_dinode_calc_crc(mp, dip); 3425 if (error) 3426 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 3427 return error; 3428 } 3429 3430 /* 3431 * Non-blocking flush of dirty inode metadata into the backing buffer. 3432 * 3433 * The caller must have a reference to the inode and hold the cluster buffer 3434 * locked. The function will walk across all the inodes on the cluster buffer it 3435 * can find and lock without blocking, and flush them to the cluster buffer. 3436 * 3437 * On successful flushing of at least one inode, the caller must write out the 3438 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and 3439 * the caller needs to release the buffer. On failure, the filesystem will be 3440 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED 3441 * will be returned. 3442 */ 3443 int 3444 xfs_iflush_cluster( 3445 struct xfs_buf *bp) 3446 { 3447 struct xfs_mount *mp = bp->b_mount; 3448 struct xfs_log_item *lip, *n; 3449 struct xfs_inode *ip; 3450 struct xfs_inode_log_item *iip; 3451 int clcount = 0; 3452 int error = 0; 3453 3454 /* 3455 * We must use the safe variant here as on shutdown xfs_iflush_abort() 3456 * will remove itself from the list. 3457 */ 3458 list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { 3459 iip = (struct xfs_inode_log_item *)lip; 3460 ip = iip->ili_inode; 3461 3462 /* 3463 * Quick and dirty check to avoid locks if possible. 3464 */ 3465 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) 3466 continue; 3467 if (xfs_ipincount(ip)) 3468 continue; 3469 3470 /* 3471 * The inode is still attached to the buffer, which means it is 3472 * dirty but reclaim might try to grab it. Check carefully for 3473 * that, and grab the ilock while still holding the i_flags_lock 3474 * to guarantee reclaim will not be able to reclaim this inode 3475 * once we drop the i_flags_lock. 3476 */ 3477 spin_lock(&ip->i_flags_lock); 3478 ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); 3479 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { 3480 spin_unlock(&ip->i_flags_lock); 3481 continue; 3482 } 3483 3484 /* 3485 * ILOCK will pin the inode against reclaim and prevent 3486 * concurrent transactions modifying the inode while we are 3487 * flushing the inode. If we get the lock, set the flushing 3488 * state before we drop the i_flags_lock. 
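 * If the trylock fails, just skip this inode; it stays dirty and is
 * left for a later flush attempt.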
3489 */ 3490 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 3491 spin_unlock(&ip->i_flags_lock); 3492 continue; 3493 } 3494 __xfs_iflags_set(ip, XFS_IFLUSHING); 3495 spin_unlock(&ip->i_flags_lock); 3496 3497 /* 3498 * Abort flushing this inode if we are shut down because the 3499 * inode may not currently be in the AIL. This can occur when 3500 * log I/O failure unpins the inode without inserting into the 3501 * AIL, leaving a dirty/unpinned inode attached to the buffer 3502 * that otherwise looks like it should be flushed. 3503 */ 3504 if (xlog_is_shutdown(mp->m_log)) { 3505 xfs_iunpin_wait(ip); 3506 xfs_iflush_abort(ip); 3507 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3508 error = -EIO; 3509 continue; 3510 } 3511 3512 /* don't block waiting on a log force to unpin dirty inodes */ 3513 if (xfs_ipincount(ip)) { 3514 xfs_iflags_clear(ip, XFS_IFLUSHING); 3515 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3516 continue; 3517 } 3518 3519 if (!xfs_inode_clean(ip)) 3520 error = xfs_iflush(ip, bp); 3521 else 3522 xfs_iflags_clear(ip, XFS_IFLUSHING); 3523 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3524 if (error) 3525 break; 3526 clcount++; 3527 } 3528 3529 if (error) { 3530 /* 3531 * Shutdown first so we kill the log before we release this 3532 * buffer. If it is an INODE_ALLOC buffer and pins the tail 3533 * of the log, failing it before the _log_ is shut down can 3534 * result in the log tail being moved forward in the journal 3535 * on disk because log writes can still be taking place. Hence 3536 * unpinning the tail will allow the ICREATE intent to be 3537 * removed from the log and recovery will fail with uninitialised 3538 * inode cluster buffers. 3539 */ 3540 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3541 bp->b_flags |= XBF_ASYNC; 3542 xfs_buf_ioend_fail(bp); 3543 return error; 3544 } 3545 3546 if (!clcount) 3547 return -EAGAIN; 3548 3549 XFS_STATS_INC(mp, xs_icluster_flushcnt); 3550 XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); 3551 return 0; 3552 3553 } 3554 3555 /* Release an inode. */ 3556 void 3557 xfs_irele( 3558 struct xfs_inode *ip) 3559 { 3560 trace_xfs_irele(ip, _RET_IP_); 3561 iput(VFS_I(ip)); 3562 } 3563 3564 /* 3565 * Ensure all committed transactions touching the inode are written to the log. 3566 */ 3567 int 3568 xfs_log_force_inode( 3569 struct xfs_inode *ip) 3570 { 3571 xfs_csn_t seq = 0; 3572 3573 xfs_ilock(ip, XFS_ILOCK_SHARED); 3574 if (xfs_ipincount(ip)) 3575 seq = ip->i_itemp->ili_commit_seq; 3576 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3577 3578 if (!seq) 3579 return 0; 3580 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); 3581 } 3582 3583 /* 3584 * Grab the exclusive iolock for a data copy from src to dest, making sure to 3585 * abide vfs locking order (lowest pointer value goes first) and breaking the 3586 * layout leases before proceeding. The loop is needed because we cannot call 3587 * the blocking break_layout() with the iolocks held, and therefore have to 3588 * back out both locks. 3589 */ 3590 static int 3591 xfs_iolock_two_inodes_and_break_layout( 3592 struct inode *src, 3593 struct inode *dest) 3594 { 3595 int error; 3596 3597 if (src > dest) 3598 swap(src, dest); 3599 3600 retry: 3601 /* Wait to break both inodes' layouts before we start locking. */ 3602 error = break_layout(src, true); 3603 if (error) 3604 return error; 3605 if (src != dest) { 3606 error = break_layout(dest, true); 3607 if (error) 3608 return error; 3609 } 3610 3611 /* Lock one inode and make sure nobody got in and leased it.
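 * If someone grabbed a layout lease while we were unlocked, drop the
 * lock and retry from the top.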
*/ 3612 inode_lock(src); 3613 error = break_layout(src, false); 3614 if (error) { 3615 inode_unlock(src); 3616 if (error == -EWOULDBLOCK) 3617 goto retry; 3618 return error; 3619 } 3620 3621 if (src == dest) 3622 return 0; 3623 3624 /* Lock the other inode and make sure nobody got in and leased it. */ 3625 inode_lock_nested(dest, I_MUTEX_NONDIR2); 3626 error = break_layout(dest, false); 3627 if (error) { 3628 inode_unlock(src); 3629 inode_unlock(dest); 3630 if (error == -EWOULDBLOCK) 3631 goto retry; 3632 return error; 3633 } 3634 3635 return 0; 3636 } 3637 3638 static int 3639 xfs_mmaplock_two_inodes_and_break_dax_layout( 3640 struct xfs_inode *ip1, 3641 struct xfs_inode *ip2) 3642 { 3643 int error; 3644 bool retry; 3645 struct page *page; 3646 3647 if (ip1->i_ino > ip2->i_ino) 3648 swap(ip1, ip2); 3649 3650 again: 3651 retry = false; 3652 /* Lock the first inode */ 3653 xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); 3654 error = xfs_break_dax_layouts(VFS_I(ip1), &retry); 3655 if (error || retry) { 3656 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); 3657 if (error == 0 && retry) 3658 goto again; 3659 return error; 3660 } 3661 3662 if (ip1 == ip2) 3663 return 0; 3664 3665 /* Nested lock the second inode */ 3666 xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1)); 3667 /* 3668 * We cannot use xfs_break_dax_layouts() directly here because it may 3669 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable 3670 * for this nested lock case. 3671 */ 3672 page = dax_layout_busy_page(VFS_I(ip2)->i_mapping); 3673 if (page && page_ref_count(page) != 1) { 3674 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); 3675 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); 3676 goto again; 3677 } 3678 3679 return 0; 3680 } 3681 3682 /* 3683 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or 3684 * mmap activity. 3685 */ 3686 int 3687 xfs_ilock2_io_mmap( 3688 struct xfs_inode *ip1, 3689 struct xfs_inode *ip2) 3690 { 3691 int ret; 3692 3693 ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); 3694 if (ret) 3695 return ret; 3696 3697 if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { 3698 ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2); 3699 if (ret) { 3700 inode_unlock(VFS_I(ip2)); 3701 if (ip1 != ip2) 3702 inode_unlock(VFS_I(ip1)); 3703 return ret; 3704 } 3705 } else 3706 filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, 3707 VFS_I(ip2)->i_mapping); 3708 3709 return 0; 3710 } 3711 3712 /* Unlock both inodes to allow IO and mmap activity. */ 3713 void 3714 xfs_iunlock2_io_mmap( 3715 struct xfs_inode *ip1, 3716 struct xfs_inode *ip2) 3717 { 3718 if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { 3719 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); 3720 if (ip1 != ip2) 3721 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); 3722 } else 3723 filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, 3724 VFS_I(ip2)->i_mapping); 3725 3726 inode_unlock(VFS_I(ip2)); 3727 if (ip1 != ip2) 3728 inode_unlock(VFS_I(ip1)); 3729 } 3730 3731 /* Drop the MMAPLOCK and the IOLOCK after a remap completes. */ 3732 void 3733 xfs_iunlock2_remapping( 3734 struct xfs_inode *ip1, 3735 struct xfs_inode *ip2) 3736 { 3737 xfs_iflags_clear(ip1, XFS_IREMAPPING); 3738 3739 if (ip1 != ip2) 3740 xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED); 3741 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); 3742 3743 if (ip1 != ip2) 3744 inode_unlock_shared(VFS_I(ip1)); 3745 inode_unlock(VFS_I(ip2)); 3746 } 3747 3748 /* 3749 * Reload the incore inode list for this inode. 
/*
 * Reload the incore inode list for this inode. Caller should ensure that
 * the link count cannot change, either by taking ILOCK_SHARED or otherwise
 * preventing other threads from executing.
 */
int
xfs_inode_reload_unlinked_bucket(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip)
{
        struct xfs_mount        *mp = tp->t_mountp;
        struct xfs_buf          *agibp;
        struct xfs_agi          *agi;
        struct xfs_perag        *pag;
        xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
        xfs_agino_t             prev_agino, next_agino;
        unsigned int            bucket;
        bool                    foundit = false;
        int                     error;

        /* Grab the first inode in the list */
        pag = xfs_perag_get(mp, agno);
        error = xfs_ialloc_read_agi(pag, tp, &agibp);
        xfs_perag_put(pag);
        if (error)
                return error;

        /*
         * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
         * incore unlinked list pointers for this inode. Check once more to
         * see if we raced with anyone else to reload the unlinked list.
         */
        if (!xfs_inode_unlinked_incomplete(ip)) {
                foundit = true;
                goto out_agibp;
        }

        bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
        agi = agibp->b_addr;

        trace_xfs_inode_reload_unlinked_bucket(ip);

        xfs_info_ratelimited(mp,
                "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
                        agino, agno);

        prev_agino = NULLAGINO;
        next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
        while (next_agino != NULLAGINO) {
                struct xfs_inode        *next_ip = NULL;

                /* Found this caller's inode, set its backlink. */
                if (next_agino == agino) {
                        next_ip = ip;
                        next_ip->i_prev_unlinked = prev_agino;
                        foundit = true;
                        goto next_inode;
                }

                /* Try in-memory lookup first. */
                next_ip = xfs_iunlink_lookup(pag, next_agino);
                if (next_ip)
                        goto next_inode;

                /* Inode not in memory, try reloading it. */
                error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
                                next_agino);
                if (error)
                        break;

                /* Grab the reloaded inode. */
                next_ip = xfs_iunlink_lookup(pag, next_agino);
                if (!next_ip) {
                        /* No incore inode at all? We reloaded it... */
                        ASSERT(next_ip != NULL);
                        error = -EFSCORRUPTED;
                        break;
                }

next_inode:
                prev_agino = next_agino;
                next_agino = next_ip->i_next_unlinked;
        }

out_agibp:
        xfs_trans_brelse(tp, agibp);
        /* Should have found this inode somewhere in the iunlinked bucket. */
        if (!error && !foundit)
                error = -EFSCORRUPTED;
        return error;
}

/* Decide if this inode is missing its unlinked list and reload it. */
int
xfs_inode_reload_unlinked(
        struct xfs_inode        *ip)
{
        struct xfs_trans        *tp;
        int                     error;

        error = xfs_trans_alloc_empty(ip->i_mount, &tp);
        if (error)
                return error;

        xfs_ilock(ip, XFS_ILOCK_SHARED);
        if (xfs_inode_unlinked_incomplete(ip))
                error = xfs_inode_reload_unlinked_bucket(tp, ip);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);
        xfs_trans_cancel(tp);

        return error;
}
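/*
 * Example (illustrative sketch only): a caller that is about to rely on the
 * incore unlinked list pointers of an inode can trigger the reload itself:
 *
 *      if (xfs_inode_unlinked_incomplete(ip)) {
 *              error = xfs_inode_reload_unlinked(ip);
 *              if (error)
 *                      return error;
 *      }
 *
 * The flag is rechecked under ILOCK_SHARED and again after the AGI buffer
 * lock is taken, so threads racing to reload the same bucket are caught.
 */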
/* Has this inode fork been zapped by repair? */
bool
xfs_ifork_zapped(
        const struct xfs_inode  *ip,
        int                     whichfork)
{
        unsigned int            datamask = 0;

        switch (whichfork) {
        case XFS_DATA_FORK:
                switch (ip->i_vnode.i_mode & S_IFMT) {
                case S_IFDIR:
                        datamask = XFS_SICK_INO_DIR_ZAPPED;
                        break;
                case S_IFLNK:
                        datamask = XFS_SICK_INO_SYMLINK_ZAPPED;
                        break;
                }
                return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask);
        case XFS_ATTR_FORK:
                return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED;
        default:
                return false;
        }
}

/* Compute the number of data and realtime blocks used by a file. */
void
xfs_inode_count_blocks(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip,
        xfs_filblks_t           *dblocks,
        xfs_filblks_t           *rblocks)
{
        struct xfs_ifork        *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);

        *rblocks = 0;
        if (XFS_IS_REALTIME_INODE(ip))
                xfs_bmap_count_leaves(ifp, rblocks);
        *dblocks = ip->i_nblocks - *rblocks;
}
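/*
 * Example (illustrative sketch only): a caller that needs separate data and
 * realtime block counts, e.g. to account them against different devices,
 * would do
 *
 *      xfs_filblks_t   dblocks, rblocks;
 *
 *      xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks);
 *
 * For an inode without the realtime flag set, rblocks comes back as zero
 * and dblocks equals ip->i_nblocks.
 */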