// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include <linux/iversion.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_bit.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_iunlink_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
#include "xfs_pnfs.h"

struct kmem_cache *xfs_inode_cache;

/*
 * Helper function to extract the extent size hint from an inode.
 */
xfs_extlen_t
xfs_get_extsz_hint(
	struct xfs_inode	*ip)
{
	/*
	 * No point in aligning allocations if we need to COW to actually
	 * write to them.
	 */
	if (xfs_is_always_cow_inode(ip))
		return 0;
	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
		return ip->i_extsize;
	if (XFS_IS_REALTIME_INODE(ip))
		return ip->i_mount->m_sb.sb_rextsize;
	return 0;
}

/*
 * Helper function to extract the CoW extent size hint from an inode.
 * Between the extent size hint and the CoW extent size hint, we
 * return the greater of the two.  If the value is zero (automatic),
 * use the default size.
 */
xfs_extlen_t
xfs_get_cowextsz_hint(
	struct xfs_inode	*ip)
{
	xfs_extlen_t		a, b;

	a = 0;
	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
		a = ip->i_cowextsize;
	b = xfs_get_extsz_hint(ip);

	a = max(a, b);
	if (a == 0)
		return XFS_DEFAULT_COWEXTSZ_HINT;
	return a;
}

/*
 * These two are wrapper routines around the xfs_ilock() routine used to
 * centralize some grungy code.  They are used in places that wish to lock the
 * inode solely for reading the extents.  The reason these places can't just
 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 * reading in of the extents from disk for a file in b-tree format.  If the
 * inode is in b-tree format, then we need to lock the inode exclusively until
 * the extents are read in.  Locking it exclusively all the time would limit
 * our parallelism unnecessarily, though.  What we do instead is check to see
 * if the extents have been read in yet, and only lock the inode exclusively
 * if they have not.
 *
 * The functions return a value which should be given to the corresponding
 * xfs_iunlock() call.
 */
uint
xfs_ilock_data_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_need_iread_extents(&ip->i_df))
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

uint
xfs_ilock_attr_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}
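/*
 * Illustrative sketch only, not part of the original file: the usual calling
 * pattern for the two wrappers above.  The hypothetical caller below simply
 * reads the data fork extent list; whatever lock mode the wrapper returns
 * must be handed back to xfs_iunlock() unchanged.
 */
#if 0
static void
xfs_example_read_data_extents(
	struct xfs_inode	*ip)
{
	uint			lock_mode;

	lock_mode = xfs_ilock_data_map_shared(ip);
	/* ... walk the extents in ip->i_df here ... */
	xfs_iunlock(ip, lock_mode);
}
#endif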
/*
 * You can't set both SHARED and EXCL for the same lock,
 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
 * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
 * to set in lock_flags.
 */
static inline void
xfs_lock_flags_assert(
	uint			lock_flags)
{
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
		(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
		(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
		(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
	ASSERT(lock_flags != 0);
}

/*
 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
 * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
 * various combinations of the locks to be obtained.
 *
 * The 3 locks should always be ordered so that the IO lock is obtained first,
 * the mmap lock second and the ilock last in order to prevent deadlock.
 *
 * Basic locking order:
 *
 * i_rwsem -> invalidate_lock -> page_lock -> i_lock
 *
 * mmap_lock locking order:
 *
 * i_rwsem -> page lock -> mmap_lock
 * mmap_lock -> invalidate_lock -> page_lock
 *
 * The difference in mmap_lock locking order means that we cannot hold the
 * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
 * can fault in pages during copy in/out (for buffered IO) or require the
 * mmap_lock in get_user_pages() to map the user pages into the kernel address
 * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
 * fault because page faults already hold the mmap_lock.
 *
 * Hence to serialise fully against both syscall and mmap based IO, we need to
 * take both the i_rwsem and the invalidate_lock. These locks should *only* be
 * both taken in places where we need to invalidate the page cache in a race
 * free manner (e.g. truncate, hole punch and other extent manipulation
 * functions).
 */
void
xfs_ilock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock(ip, lock_flags, _RET_IP_);

	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		down_write_nested(&VFS_I(ip)->i_rwsem,
				  XFS_IOLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		down_read_nested(&VFS_I(ip)->i_rwsem,
				 XFS_IOLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
				  XFS_MMAPLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
				 XFS_MMAPLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_ILOCK_EXCL)
		down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	else if (lock_flags & XFS_ILOCK_SHARED)
		down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be locked.
 *	See the comment for xfs_ilock() for a list of valid values.
 */
int
xfs_ilock_nowait(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
			goto out_undo_iolock;
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
			goto out_undo_iolock;
	}

	if (lock_flags & XFS_ILOCK_EXCL) {
		if (!down_write_trylock(&ip->i_lock))
			goto out_undo_mmaplock;
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		if (!down_read_trylock(&ip->i_lock))
			goto out_undo_mmaplock;
	}
	return 1;

out_undo_mmaplock:
	if (lock_flags & XFS_MMAPLOCK_EXCL)
		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_SHARED)
		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
out_undo_iolock:
	if (lock_flags & XFS_IOLOCK_EXCL)
		up_write(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		up_read(&VFS_I(ip)->i_rwsem);
out:
	return 0;
}
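/*
 * Illustrative sketch only, not part of the original file: a hypothetical
 * caller that must not sleep tries the lock and backs off on failure, in the
 * same way xfs_release() below skips EOF block trimming when it cannot get
 * the iolock.
 */
#if 0
static int
xfs_example_trylock_update(
	struct xfs_inode	*ip)
{
	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		return -EAGAIN;		/* caller retries later */
	/* ... update the inode core ... */
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;
}
#endif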
/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be unlocked.
 *	See the comment for xfs_ilock() for a list of valid values for
 *	this parameter.
 */
void
xfs_iunlock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL)
		up_write(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		up_read(&VFS_I(ip)->i_rwsem);

	if (lock_flags & XFS_MMAPLOCK_EXCL)
		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_SHARED)
		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);

	if (lock_flags & XFS_ILOCK_EXCL)
		up_write(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_SHARED)
		up_read(&ip->i_lock);

	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * Give up write locks.  The i/o lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
	ASSERT((lock_flags &
		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

	if (lock_flags & XFS_ILOCK_EXCL)
		downgrade_write(&ip->i_lock);
	if (lock_flags & XFS_MMAPLOCK_EXCL)
		downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	if (lock_flags & XFS_IOLOCK_EXCL)
		downgrade_write(&VFS_I(ip)->i_rwsem);

	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}

void
xfs_assert_ilocked(
	struct xfs_inode	*ip,
	uint			lock_flags)
{
	/*
	 * Sometimes we assert the ILOCK is held exclusively, but we're in
	 * a workqueue, so lockdep doesn't know we're the owner.
	 */
	if (lock_flags & XFS_ILOCK_SHARED)
		rwsem_assert_held(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_EXCL)
		rwsem_assert_held_write_nolockdep(&ip->i_lock);

	if (lock_flags & XFS_MMAPLOCK_SHARED)
		rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_EXCL)
		rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);

	if (lock_flags & XFS_IOLOCK_SHARED)
		rwsem_assert_held(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_EXCL)
		rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
}

/*
 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
 * errors and warnings.
 */
#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
static bool
xfs_lockdep_subclass_ok(
	int			subclass)
{
	return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#else
#define xfs_lockdep_subclass_ok(subclass)	(true)
#endif

/*
 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 * value. This can be called for any type of inode lock combination, including
 * parent locking. Care must be taken to ensure we don't overrun the subclass
 * storage fields in the class mask we build.
 */
static inline uint
xfs_lock_inumorder(
	uint			lock_mode,
	uint			subclass)
{
	uint			class = 0;

	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
			      XFS_ILOCK_RTSUM)));
	ASSERT(xfs_lockdep_subclass_ok(subclass));

	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
		ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
		class += subclass << XFS_IOLOCK_SHIFT;
	}

	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
		ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
		class += subclass << XFS_MMAPLOCK_SHIFT;
	}

	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
		ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
		class += subclass << XFS_ILOCK_SHIFT;
	}

	return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
}

/*
 * The following routine will lock n inodes in exclusive mode.  We assume the
 * caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock is in the AIL and we
 * start waiting for another inode that is locked by a thread in a long running
 * transaction (such as truncate). This can result in deadlock since the long
 * running trans might need to wait for the inode we just locked in order to
 * push the tail and free space in the log.
 *
 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 * the iolock, the mmaplock or the ilock, but never more than one type at once.
 * If we lock more than one at a time, lockdep will report false positives
 * saying we have violated locking orders.
 */
static void
xfs_lock_inodes(
	struct xfs_inode	**ips,
	int			inodes,
	uint			lock_mode)
{
	int			attempts = 0;
	uint			i;
	int			j;
	bool			try_lock;
	struct xfs_log_item	*lp;

	/*
	 * Currently supports between 2 and 5 inodes with exclusive locking. We
	 * support an arbitrary depth of locking here, but absolute limits on
	 * inodes depend on the type of locking and the limits placed by
	 * lockdep annotations in xfs_lock_inumorder.  These are all checked by
	 * the asserts.
	 */
	ASSERT(ips && inodes >= 2 && inodes <= 5);
	ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
			    XFS_ILOCK_EXCL));
	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
			      XFS_ILOCK_SHARED)));
	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
		inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
		inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);

	if (lock_mode & XFS_IOLOCK_EXCL) {
		ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
		ASSERT(!(lock_mode & XFS_ILOCK_EXCL));

again:
	try_lock = false;
	i = 0;
	for (; i < inodes; i++) {
		ASSERT(ips[i]);

		if (i && (ips[i] == ips[i - 1]))	/* Already locked */
			continue;

		/*
		 * If try_lock is not set yet, make sure all locked inodes are
		 * not in the AIL.  If any are, set try_lock to be used later.
		 */
		if (!try_lock) {
			for (j = (i - 1); j >= 0 && !try_lock; j--) {
				lp = &ips[j]->i_itemp->ili_item;
				if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
					try_lock = true;
			}
		}

		/*
		 * If any of the previous locks we have locked is in the AIL,
		 * we must TRY to get the second and subsequent locks. If
		 * we can't get any, we must release all we have
		 * and try again.
		 */
		if (!try_lock) {
			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
			continue;
		}

		/* try_lock means we have an inode locked that is in the AIL. */
		ASSERT(i != 0);
		if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
			continue;

		/*
		 * Unlock all previous guys and try again.  xfs_iunlock will try
		 * to push the tail if the inode is in the AIL.
		 */
		attempts++;
		for (j = i - 1; j >= 0; j--) {
			/*
			 * Check to see if we've already unlocked this one.  Not
			 * the first one going back, and the inode ptr is the
			 * same.
			 */
			if (j != (i - 1) && ips[j] == ips[j + 1])
				continue;

			xfs_iunlock(ips[j], lock_mode);
		}

		if ((attempts % 5) == 0) {
			delay(1); /* Don't just spin the CPU */
		}
		goto again;
	}
}

/*
 * xfs_lock_two_inodes() can only be used to lock the ilock. The iolock and
 * mmaplock must be double-locked separately since we use i_rwsem and
 * invalidate_lock for that. We now support taking one lock EXCL and the
 * other SHARED.
 */
void
xfs_lock_two_inodes(
	struct xfs_inode	*ip0,
	uint			ip0_mode,
	struct xfs_inode	*ip1,
	uint			ip1_mode)
{
	int			attempts = 0;
	struct xfs_log_item	*lp;

	ASSERT(hweight32(ip0_mode) == 1);
	ASSERT(hweight32(ip1_mode) == 1);
	ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
	ASSERT(ip0->i_ino != ip1->i_ino);

	if (ip0->i_ino > ip1->i_ino) {
		swap(ip0, ip1);
		swap(ip0_mode, ip1_mode);
	}

again:
	xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));

	/*
	 * If the first lock we have locked is in the AIL, we must TRY to get
	 * the second lock. If we can't get it, we must release the first one
	 * and try again.
	 */
	lp = &ip0->i_itemp->ili_item;
	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
			xfs_iunlock(ip0, ip0_mode);
			if ((++attempts % 5) == 0)
				delay(1); /* Don't just spin the CPU */
			goto again;
		}
	} else {
		xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
	}
}
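/*
 * Illustrative sketch only, not part of the original file: a hypothetical
 * two-inode metadata update.  Per the comment above, only ILOCK modes may be
 * passed, one lock may be EXCL while the other is SHARED, and the inodes must
 * be different.  Unlock order is not significant.
 */
#if 0
static void
xfs_example_lock_pair(
	struct xfs_inode	*ip0,
	struct xfs_inode	*ip1)
{
	xfs_lock_two_inodes(ip0, XFS_ILOCK_EXCL, ip1, XFS_ILOCK_EXCL);
	/* ... update both inode cores ... */
	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
	xfs_iunlock(ip0, XFS_ILOCK_EXCL);
}
#endif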
/* Translate the in-core inode flags (di_flags/di_flags2) into FS_XFLAG_* values. */
uint
xfs_ip2xflags(
	struct xfs_inode	*ip)
{
	uint			flags = 0;

	if (ip->i_diflags & XFS_DIFLAG_ANY) {
		if (ip->i_diflags & XFS_DIFLAG_REALTIME)
			flags |= FS_XFLAG_REALTIME;
		if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
			flags |= FS_XFLAG_PREALLOC;
		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
			flags |= FS_XFLAG_IMMUTABLE;
		if (ip->i_diflags & XFS_DIFLAG_APPEND)
			flags |= FS_XFLAG_APPEND;
		if (ip->i_diflags & XFS_DIFLAG_SYNC)
			flags |= FS_XFLAG_SYNC;
		if (ip->i_diflags & XFS_DIFLAG_NOATIME)
			flags |= FS_XFLAG_NOATIME;
		if (ip->i_diflags & XFS_DIFLAG_NODUMP)
			flags |= FS_XFLAG_NODUMP;
		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
			flags |= FS_XFLAG_RTINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
			flags |= FS_XFLAG_PROJINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
			flags |= FS_XFLAG_NOSYMLINKS;
		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
			flags |= FS_XFLAG_EXTSIZE;
		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= FS_XFLAG_EXTSZINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
			flags |= FS_XFLAG_NODEFRAG;
		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
			flags |= FS_XFLAG_FILESTREAM;
	}

	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
		if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
			flags |= FS_XFLAG_DAX;
		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
			flags |= FS_XFLAG_COWEXTSIZE;
	}

	if (xfs_inode_has_attr_fork(ip))
		flags |= FS_XFLAG_HASATTR;
	return flags;
}

/*
 * Looks up an inode from "name".  If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match.  If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int
xfs_lookup(
	struct xfs_inode	*dp,
	const struct xfs_name	*name,
	struct xfs_inode	**ipp,
	struct xfs_name		*ci_name)
{
	xfs_ino_t		inum;
	int			error;

	trace_xfs_lookup(dp, name);

	if (xfs_is_shutdown(dp->i_mount))
		return -EIO;
	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
		return -EIO;

	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	if (error)
		goto out_unlock;

	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
	if (error)
		goto out_free_name;

	return 0;

out_free_name:
	if (ci_name)
		kfree(ci_name->name);
out_unlock:
	*ipp = NULL;
	return error;
}

/* Propagate di_flags from a parent inode to a child inode. */
static void
xfs_inode_inherit_flags(
	struct xfs_inode	*ip,
	const struct xfs_inode	*pip)
{
	unsigned int		di_flags = 0;
	xfs_failaddr_t		failaddr;
	umode_t			mode = VFS_I(ip)->i_mode;

	if (S_ISDIR(mode)) {
		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
			di_flags |= XFS_DIFLAG_RTINHERIT;
		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
			ip->i_extsize = pip->i_extsize;
		}
		if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
			di_flags |= XFS_DIFLAG_PROJINHERIT;
	} else if (S_ISREG(mode)) {
		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
		    xfs_has_realtime(ip->i_mount))
			di_flags |= XFS_DIFLAG_REALTIME;
		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSIZE;
			ip->i_extsize = pip->i_extsize;
		}
	}
	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
	    xfs_inherit_noatime)
		di_flags |= XFS_DIFLAG_NOATIME;
	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
	    xfs_inherit_nodump)
		di_flags |= XFS_DIFLAG_NODUMP;
	if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
	    xfs_inherit_sync)
		di_flags |= XFS_DIFLAG_SYNC;
	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
	    xfs_inherit_nosymlinks)
		di_flags |= XFS_DIFLAG_NOSYMLINKS;
	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
	    xfs_inherit_nodefrag)
		di_flags |= XFS_DIFLAG_NODEFRAG;
	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
		di_flags |= XFS_DIFLAG_FILESTREAM;

	ip->i_diflags |= di_flags;

	/*
	 * Inode verifiers on older kernels only check that the extent size
	 * hint is an integer multiple of the rt extent size on realtime files.
	 * They did not check the hint alignment on a directory with both
	 * rtinherit and extszinherit flags set.  If the misaligned hint is
	 * propagated from a directory into a new realtime file, new file
	 * allocations will fail due to math errors in the rt allocator and/or
	 * trip the verifiers.  Validate the hint settings in the new file so
	 * that we don't let broken hints propagate.
	 */
	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
			VFS_I(ip)->i_mode, ip->i_diflags);
	if (failaddr) {
		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
				   XFS_DIFLAG_EXTSZINHERIT);
		ip->i_extsize = 0;
	}
}

/* Propagate di_flags2 from a parent inode to a child inode. */
static void
xfs_inode_inherit_flags2(
	struct xfs_inode	*ip,
	const struct xfs_inode	*pip)
{
	xfs_failaddr_t		failaddr;

	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
		ip->i_cowextsize = pip->i_cowextsize;
	}
	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
		ip->i_diflags2 |= XFS_DIFLAG2_DAX;

	/* Don't let invalid cowextsize hints propagate. */
	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
	if (failaddr) {
		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
		ip->i_cowextsize = 0;
	}
}

/*
 * Initialise a newly allocated inode and return the in-core inode to the
 * caller locked exclusively.
 */
int
xfs_init_new_inode(
	struct mnt_idmap	*idmap,
	struct xfs_trans	*tp,
	struct xfs_inode	*pip,
	xfs_ino_t		ino,
	umode_t			mode,
	xfs_nlink_t		nlink,
	dev_t			rdev,
	prid_t			prid,
	bool			init_xattrs,
	struct xfs_inode	**ipp)
{
	struct inode		*dir = pip ? VFS_I(pip) : NULL;
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inode	*ip;
	unsigned int		flags;
	int			error;
	struct timespec64	tv;
	struct inode		*inode;

	/*
	 * Protect against obviously corrupt allocation btree records. Later
	 * xfs_iget checks will catch re-allocation of other active in-memory
	 * and on-disk inodes. If we don't catch reallocating the parent inode
	 * here we will deadlock in xfs_iget() so we have to do these checks
	 * first.
	 */
	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
				XFS_SICK_AG_INOBT);
		return -EFSCORRUPTED;
	}

	/*
	 * Get the in-core inode with the lock held exclusively to prevent
	 * others from looking at it until we're done.
	 */
	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;

	ASSERT(ip != NULL);
	inode = VFS_I(ip);
	set_nlink(inode, nlink);
	inode->i_rdev = rdev;
	ip->i_projid = prid;

	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
		inode_fsuid_set(inode, idmap);
		inode->i_gid = dir->i_gid;
		inode->i_mode = mode;
	} else {
		inode_init_owner(idmap, inode, dir, mode);
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
	    !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
		inode->i_mode &= ~S_ISGID;

	ip->i_disk_size = 0;
	ip->i_df.if_nextents = 0;
	ASSERT(ip->i_nblocks == 0);

	tv = inode_set_ctime_current(inode);
	inode_set_mtime_to_ts(inode, tv);
	inode_set_atime_to_ts(inode, tv);

	ip->i_extsize = 0;
	ip->i_diflags = 0;

	if (xfs_has_v3inodes(mp)) {
		inode_set_iversion(inode, 1);
		ip->i_cowextsize = 0;
		ip->i_crtime = tv;
	}

	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
	case S_IFDIR:
		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
			xfs_inode_inherit_flags(ip, pip);
		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
			xfs_inode_inherit_flags2(ip, pip);
		fallthrough;
	case S_IFLNK:
		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_bytes = 0;
		ip->i_df.if_data = NULL;
		break;
	default:
		ASSERT(0);
	}

	/*
	 * If we need to create attributes immediately after allocating the
	 * inode, initialise an empty attribute fork right now. We use the
	 * default fork offset for attributes here as we don't know exactly what
	 * size or how many attributes we might be adding. We can do this
	 * safely here because we know the data fork is completely empty and
	 * this saves us from needing to run a separate transaction to set the
	 * fork offset in the immediate future.
	 */
	if (init_xattrs && xfs_has_attr(mp)) {
		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
	}

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can set up the inode structure */
	xfs_setup_inode(ip);

	*ipp = ip;
	return 0;
}

/*
 * Decrement the link count on an inode & log the change.  If this causes the
 * link count to go to zero, move the inode to AGI unlinked list so that it can
 * be freed when the last active reference goes away via xfs_inactive().
 */
static int			/* error */
xfs_droplink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	if (inode->i_nlink == 0) {
		xfs_info_ratelimited(tp->t_mountp,
 "Inode 0x%llx link count dropped below zero. Pinning link count.",
				ip->i_ino);
		set_nlink(inode, XFS_NLINK_PINNED);
	}
	if (inode->i_nlink != XFS_NLINK_PINNED)
		drop_nlink(inode);

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (inode->i_nlink)
		return 0;

	return xfs_iunlink(tp, ip);
}

/*
 * Increment the link count on an inode & log the change.
 */
void
xfs_bumplink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	if (inode->i_nlink == XFS_NLINK_PINNED - 1)
		xfs_info_ratelimited(tp->t_mountp,
 "Inode 0x%llx link count exceeded maximum. Pinning link count.",
				ip->i_ino);
	if (inode->i_nlink != XFS_NLINK_PINNED)
		inc_nlink(inode);

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

#ifdef CONFIG_XFS_LIVE_HOOKS
/*
 * Use a static key here to reduce the overhead of directory live update hooks.
 * If the compiler supports jump labels, the static branch will be replaced by
 * a nop sled when there are no hook users.  Online fsck is currently the only
 * caller, so this is a reasonable tradeoff.
 *
 * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
 * parts of the kernel allocate memory with that lock held, which means that
 * XFS callers cannot hold any locks that might be used by memory reclaim or
 * writeback when calling the static_branch_{inc,dec} functions.
 */
DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);

void
xfs_dir_hook_disable(void)
{
	xfs_hooks_switch_off(&xfs_dir_hooks_switch);
}

void
xfs_dir_hook_enable(void)
{
	xfs_hooks_switch_on(&xfs_dir_hooks_switch);
}

/* Call hooks for a directory update relating to a child dirent update. */
inline void
xfs_dir_update_hook(
	struct xfs_inode	*dp,
	struct xfs_inode	*ip,
	int			delta,
	const struct xfs_name	*name)
{
	if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
		struct xfs_dir_update_params	p = {
			.dp		= dp,
			.ip		= ip,
			.delta		= delta,
			.name		= name,
		};
		struct xfs_mount	*mp = ip->i_mount;

		xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
	}
}

/* Call the specified function during a directory update. */
int
xfs_dir_hook_add(
	struct xfs_mount	*mp,
	struct xfs_dir_hook	*hook)
{
	return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
}

/* Stop calling the specified function during a directory update. */
void
xfs_dir_hook_del(
	struct xfs_mount	*mp,
	struct xfs_dir_hook	*hook)
{
	xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
}

/* Configure directory update hook functions. */
void
xfs_dir_hook_setup(
	struct xfs_dir_hook	*hook,
	notifier_fn_t		mod_fn)
{
	xfs_hook_setup(&hook->dirent_hook, mod_fn);
}
#endif /* CONFIG_XFS_LIVE_HOOKS */
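/*
 * Illustrative sketch only, not part of the original file: how a hypothetical
 * hook consumer (online fsck is the real one) might wire up a directory
 * update hook using the helpers above.  The notifier callback receives the
 * xfs_dir_update_params that xfs_dir_update_hook() passes to xfs_hooks_call().
 */
#if 0
static int
xfs_example_dirent_notify(
	struct notifier_block	*nb,
	unsigned long		action,
	void			*data)
{
	struct xfs_dir_update_params	*p = data;

	/* ... react to p->dp, p->ip, p->delta and p->name ... */
	return NOTIFY_DONE;
}

static void
xfs_example_register_dir_hook(
	struct xfs_mount	*mp,
	struct xfs_dir_hook	*hook)
{
	xfs_dir_hook_setup(hook, xfs_example_dirent_notify);
	xfs_dir_hook_add(mp, hook);
	xfs_dir_hook_enable();
}
#endif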
int
xfs_create(
	struct mnt_idmap	*idmap,
	xfs_inode_t		*dp,
	struct xfs_name		*name,
	umode_t			mode,
	dev_t			rdev,
	bool			init_xattrs,
	xfs_inode_t		**ipp)
{
	int			is_dir = S_ISDIR(mode);
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	bool			unlock_dp_on_error = false;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	uint			resblks;
	xfs_ino_t		ino;

	trace_xfs_create(dp, name);

	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
		return -EIO;

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
			mapped_fsgid(idmap, &init_user_ns), prid,
			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
			&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	if (is_dir) {
		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_mkdir;
	} else {
		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_create;
	}

	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case.  If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error == -ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
				resblks, &tp);
	}
	if (error)
		goto out_release_dquots;

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;

	/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to it, but a directory also has the "." entry
	 * pointing to itself.
	 */
	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (!error)
		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
				is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
	if (error)
		goto out_trans_cancel;

	/*
	 * Now we join the directory inode to the transaction.  We do not do it
	 * earlier because xfs_dialloc might commit the previous transaction
	 * (and release all the locks).  An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;

	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
				   resblks - XFS_IALLOC_SPACE_RES(mp));
	if (error) {
		ASSERT(error != -ENOSPC);
		goto out_trans_cancel;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (error)
			goto out_trans_cancel;

		xfs_bumplink(tp, dp);
	}

	/*
	 * Create ip with a reference from dp, and add '.' and '..' references
	 * if it's a directory.
	 */
	xfs_dir_update_hook(dp, ip, 1, name);

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * The IDs of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.  This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}
out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	if (unlock_dp_on_error)
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	return error;
}

int
xfs_create_tmpfile(
	struct mnt_idmap	*idmap,
	struct xfs_inode	*dp,
	umode_t			mode,
	struct xfs_inode	**ipp)
{
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	uint			resblks;
	xfs_ino_t		ino;

	if (xfs_is_shutdown(mp))
		return -EIO;

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
			mapped_fsgid(idmap, &init_user_ns), prid,
			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
			&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	resblks = XFS_IALLOC_SPACE_RES(mp);
	tres = &M_RES(mp)->tr_create_tmpfile;

	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error)
		goto out_release_dquots;

	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (!error)
		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
				0, 0, prid, false, &ip);
	if (error)
		goto out_trans_cancel;

	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * The IDs of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_iunlink(tp, ip);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.  This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}
out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	return error;
}

int
xfs_link(
	xfs_inode_t		*tdp,
	xfs_inode_t		*sip,
	struct xfs_name		*target_name)
{
	xfs_mount_t		*mp = tdp->i_mount;
	xfs_trans_t		*tp;
	int			error, nospace_error = 0;
	int			resblks;

	trace_xfs_link(tdp, target_name);

	ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));

	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_ifork_zapped(tdp, XFS_DATA_FORK))
		return -EIO;

	error = xfs_qm_dqattach(sip);
	if (error)
		goto std_return;

	error = xfs_qm_dqattach(tdp);
	if (error)
		goto std_return;

	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
	error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
			&tp, &nospace_error);
	if (error)
		goto std_return;

	/*
	 * If we are using project inheritance, we only allow hard link
	 * creation in our tree when the project IDs are the same; else
	 * the tree quota mechanism could be circumvented.
	 */
	if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
		     tdp->i_projid != sip->i_projid)) {
		/*
		 * Project quota setup skips special files which can
		 * leave inodes in a PROJINHERIT directory without a
		 * project ID set. We need to allow links to be made
		 * to these "project-less" inodes because userspace
		 * expects them to succeed after project ID setup,
		 * but everything else should be rejected.
		 */
		if (!special_file(VFS_I(sip)->i_mode) ||
		    sip->i_projid != 0) {
			error = -EXDEV;
			goto error_return;
		}
	}

	if (!resblks) {
		error = xfs_dir_canenter(tp, tdp, target_name);
		if (error)
			goto error_return;
	}

	/*
	 * Handle initial link state of O_TMPFILE inode
	 */
	if (VFS_I(sip)->i_nlink == 0) {
		struct xfs_perag	*pag;

		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
		error = xfs_iunlink_remove(tp, pag, sip);
		xfs_perag_put(pag);
		if (error)
			goto error_return;
	}

	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
				   resblks);
	if (error)
		goto error_return;
	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);

	xfs_bumplink(tp, sip);
	xfs_dir_update_hook(tdp, sip, 1, target_name);

	/*
	 * If this is a synchronous mount, make sure that the
	 * link transaction goes to disk before returning to
	 * the user.
	 */
	if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
		xfs_trans_set_sync(tp);

	return xfs_trans_commit(tp);

error_return:
	xfs_trans_cancel(tp);
std_return:
	if (error == -ENOSPC && nospace_error)
		error = nospace_error;
	return error;
}

/* Clear the reflink flag and the cowblocks tag if possible. */
static void
xfs_itruncate_clear_reflink_flags(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*dfork;
	struct xfs_ifork	*cfork;

	if (!xfs_is_reflink_inode(ip))
		return;
	dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
	if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	if (cfork->if_bytes == 0)
		xfs_inode_clear_cowblocks_tag(ip);
}

/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents_flags(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fsize_t		new_size,
	int			flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp = *tpp;
	xfs_fileoff_t		first_unmap_block;
	int			error = 0;

	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
	if (atomic_read(&VFS_I(ip)->i_count))
		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
	ASSERT(new_size <= XFS_ISIZE(ip));
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	trace_xfs_itruncate_extents_start(ip, new_size);

	flags |= xfs_bmapi_aflag(whichfork);

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.
	 *
	 * We have to free all the blocks to the bmbt maximum offset, even if
	 * the page cache can't scale that far.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	if (!xfs_verify_fileoff(mp, first_unmap_block)) {
		WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
		return 0;
	}

	error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
			XFS_MAX_FILEOFF);
	if (error)
		goto out;

	if (whichfork == XFS_DATA_FORK) {
		/* Remove all pending CoW reservations. */
		error = xfs_reflink_cancel_cow_blocks(ip, &tp,
				first_unmap_block, XFS_MAX_FILEOFF, true);
		if (error)
			goto out;

		xfs_itruncate_clear_reflink_flags(ip);
	}

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_extents_end(ip, new_size);

out:
	*tpp = tp;
	return error;
}

int
xfs_release(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp = ip->i_mount;
	int		error = 0;

	if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
		return 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (xfs_is_readonly(mp))
		return 0;

	if (!xfs_is_shutdown(mp)) {
		int truncated;

		/*
		 * If we previously truncated this file and removed old data
		 * in the process, we want to initiate "early" writeout on
		 * the last close.  This is an attempt to combat the notorious
		 * NULL files problem which is particularly noticeable from a
		 * truncate down, buffered (re-)write (delalloc), followed by
		 * a crash.  What we are effectively doing here is
		 * significantly reducing the time window where we'd otherwise
		 * be exposed to that problem.
		 */
		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
		if (truncated) {
			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
			if (ip->i_delayed_blks > 0) {
				error = filemap_flush(VFS_I(ip)->i_mapping);
				if (error)
					return error;
			}
		}
	}

	if (VFS_I(ip)->i_nlink == 0)
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise. We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
		return 0;

	if (xfs_can_free_eofblocks(ip, false)) {
		/*
		 * If the inode is being opened, written and closed frequently
		 * and we have delayed allocation blocks outstanding
		 * (e.g. streaming writes from the NFS server), truncating the
		 * blocks past EOF will cause fragmentation to occur.
		 *
		 * In this case don't do the truncation, but we have to be
		 * careful how we detect this case. Blocks beyond EOF show up as
		 * i_delayed_blks even when the inode is clean, so we need to
		 * truncate them away first before checking for a dirty release.
		 * Hence on the first dirty close we will still remove the
		 * speculative allocation, but after that we will leave it in
		 * place.
		 */
		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
			goto out_unlock;

		error = xfs_free_eofblocks(ip);
		if (error)
			goto out_unlock;

		/* delalloc blocks after truncation means it really is dirty */
		if (ip->i_delayed_blks)
			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
	}

out_unlock:
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;
}

/*
 * Mark all the buffers attached to this directory stale.  In theory we should
 * never be freeing a directory with any blocks at all, but this covers the
 * case where we've recovered a directory swap with a "temporary" directory
 * created by online repair and now need to dump it.
 */
STATIC void
xfs_inactive_dir(
	struct xfs_inode	*dp)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_da_geometry	*geo = mp->m_dir_geo;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
	xfs_fileoff_t		off;

	/*
	 * Invalidate each directory block.  All directory blocks are of
	 * fsbcount length and alignment, so we only need to walk those same
	 * offsets.  We hold the only reference to this inode, so we must wait
	 * for the buffer locks.
	 */
	for_each_xfs_iext(ifp, &icur, &got) {
		for (off = round_up(got.br_startoff, geo->fsbcount);
		     off < got.br_startoff + got.br_blockcount;
		     off += geo->fsbcount) {
			struct xfs_buf	*bp = NULL;
			xfs_fsblock_t	fsbno;
			int		error;

			fsbno = (off - got.br_startoff) + got.br_startblock;
			error = xfs_buf_incore(mp->m_ddev_targp,
					XFS_FSB_TO_DADDR(mp, fsbno),
					XFS_FSB_TO_BB(mp, geo->fsbcount),
					XBF_LIVESCAN, &bp);
			if (error)
				continue;

			xfs_buf_stale(bp);
			xfs_buf_relse(bp);
		}
	}
}

/*
 * xfs_inactive_truncate
 *
 * Called to perform a truncate when an inode becomes unlinked.
 */
STATIC int
xfs_inactive_truncate(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error) {
		ASSERT(xfs_is_shutdown(mp));
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Log the inode size first to prevent stale data exposure in the event
	 * of a system crash before the truncate completes.  See the related
	 * comment in xfs_vn_setattr_size() for details.
	 */
	ip->i_disk_size = 0;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
	if (error)
		goto error_trans_cancel;

	ASSERT(ip->i_df.if_nextents == 0);

	error = xfs_trans_commit(tp);
	if (error)
		goto error_unlock;

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;

error_trans_cancel:
	xfs_trans_cancel(tp);
error_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * xfs_inactive_ifree()
 *
 * Perform the inode free when an inode is unlinked.
 */
STATIC int
xfs_inactive_ifree(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	/*
	 * We try to use a per-AG reservation for any block needed by the
	 * finobt tree, but as the finobt feature predates the per-AG
	 * reservation support, a degraded file system might not have enough
	 * space for the reservation at mount time.  In that case try to dip
	 * into the reserved pool and pray.
	 *
	 * Send a warning if the reservation does happen to fail, as the inode
	 * now remains allocated and sits on the unlinked list until the fs is
	 * repaired.
	 */
	if (unlikely(mp->m_finobt_nores)) {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
				&tp);
	} else {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
	}
	if (error) {
		if (error == -ENOSPC) {
			xfs_warn_ratelimited(mp,
			"Failed to remove inode(s) from unlinked list. "
			"Please free space, unmount and run xfs_repair.");
		} else {
			ASSERT(xfs_is_shutdown(mp));
		}
		return error;
	}

	/*
	 * We do not hold the inode locked across the entire rolling transaction
	 * here. We only need to hold it for the first transaction that
	 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
	 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
	 * here breaks the relationship between cluster buffer invalidation and
	 * stale inode invalidation on cluster buffer item journal commit
	 * completion, and can result in leaving dirty stale inodes hanging
	 * around in memory.
	 *
	 * We have no need for serialising this inode operation against other
	 * operations - we freed the inode and hence reallocation is required
	 * and that will serialise on reallocating the space the deferops need
	 * to free. Hence we can unlock the inode on the first commit of
	 * the transaction rather than roll it right through the deferops. This
	 * avoids relogging the XFS_ISTALE inode.
	 *
	 * We check that xfs_ifree() hasn't grown an internal transaction roll
	 * by asserting that the inode is still locked when it returns.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	error = xfs_ifree(tp, ip);
	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
	if (error) {
		/*
		 * If we fail to free the inode, shut down.  The cancel
		 * might do that, we need to make sure.  Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!xfs_is_shutdown(mp)) {
			xfs_notice(mp, "%s: xfs_ifree returned error %d",
				__func__, error);
			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		}
		xfs_trans_cancel(tp);
		return error;
	}

	/*
	 * Credit the quota account(s). The inode is gone.
	 */
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

	return xfs_trans_commit(tp);
}

/*
 * Returns true if we need to update the on-disk metadata before we can free
 * the memory used by this inode.  Updates include freeing post-eof
 * preallocations; freeing COW staging extents; and marking the inode free in
 * the inobt if it is on the unlinked list.
 */
bool
xfs_inode_needs_inactive(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0)
		return false;

	/*
	 * If this is a read-only mount, don't do this (would generate I/O)
	 * unless we're in log recovery and cleaning the iunlinked list.
	 */
	if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
		return false;

	/* If the log isn't running, push inodes straight to reclaim. */
	if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
		return false;

	/* Metadata inodes require explicit resource cleanup. */
	if (xfs_is_metadata_inode(ip))
		return false;

	/* Want to clean out the cow blocks if there are any. */
	if (cow_ifp && cow_ifp->if_bytes > 0)
		return true;

	/* Unlinked files must be freed. */
	if (VFS_I(ip)->i_nlink == 0)
		return true;

	/*
	 * This file isn't being freed, so check if there are post-eof blocks
	 * to free.  @force is true because we are evicting an inode from the
	 * cache.  Post-eof blocks must be freed, lest we end up with broken
	 * free space accounting.
	 *
	 * Note: don't bother with iolock here since lockdep complains about
	 * acquiring it in reclaim context. We have the only reference to the
	 * inode at this point anyways.
	 */
	return xfs_can_free_eofblocks(ip, true);
}

/*
 * Save health status somewhere, if we're dumping an inode with uncorrected
 * errors and online repair isn't running.
 */
static inline void
xfs_inactive_health(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	unsigned int		sick;
	unsigned int		checked;

	xfs_inode_measure_sickness(ip, &sick, &checked);
	if (!sick)
		return;

	trace_xfs_inode_unfixed_corruption(ip, sick);

	if (sick & XFS_SICK_INO_FORGET)
		return;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	if (!pag) {
		/* There had better still be a perag structure! */
		ASSERT(0);
		return;
	}

	xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES);
	xfs_perag_put(pag);
}

/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
int
xfs_inactive(
	xfs_inode_t	*ip)
{
	struct xfs_mount	*mp;
	int			error = 0;
	int			truncate = 0;

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0) {
		ASSERT(ip->i_df.if_broot_bytes == 0);
		goto out;
	}

	mp = ip->i_mount;
	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));

	xfs_inactive_health(ip);

	/*
	 * If this is a read-only mount, don't do this (would generate I/O)
	 * unless we're in log recovery and cleaning the iunlinked list.
	 */
	if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
		goto out;

	/* Metadata inodes require explicit resource cleanup. */
*/ 1877 if (xfs_is_metadata_inode(ip)) 1878 goto out; 1879 1880 /* Try to clean out the cow blocks if there are any. */ 1881 if (xfs_inode_has_cow_data(ip)) 1882 xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); 1883 1884 if (VFS_I(ip)->i_nlink != 0) { 1885 /* 1886 * force is true because we are evicting an inode from the 1887 * cache. Post-eof blocks must be freed, lest we end up with 1888 * broken free space accounting. 1889 * 1890 * Note: don't bother with iolock here since lockdep complains 1891 * about acquiring it in reclaim context. We have the only 1892 * reference to the inode at this point anyways. 1893 */ 1894 if (xfs_can_free_eofblocks(ip, true)) 1895 error = xfs_free_eofblocks(ip); 1896 1897 goto out; 1898 } 1899 1900 if (S_ISREG(VFS_I(ip)->i_mode) && 1901 (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 || 1902 ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) 1903 truncate = 1; 1904 1905 if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { 1906 /* 1907 * If this inode is being inactivated during a quotacheck and 1908 * has not yet been scanned by quotacheck, we /must/ remove 1909 * the dquots from the inode before inactivation changes the 1910 * block and inode counts. Most probably this is a result of 1911 * reloading the incore iunlinked list to purge unrecovered 1912 * unlinked inodes. 1913 */ 1914 xfs_qm_dqdetach(ip); 1915 } else { 1916 error = xfs_qm_dqattach(ip); 1917 if (error) 1918 goto out; 1919 } 1920 1921 if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_df.if_nextents > 0) { 1922 xfs_inactive_dir(ip); 1923 truncate = 1; 1924 } 1925 1926 if (S_ISLNK(VFS_I(ip)->i_mode)) 1927 error = xfs_inactive_symlink(ip); 1928 else if (truncate) 1929 error = xfs_inactive_truncate(ip); 1930 if (error) 1931 goto out; 1932 1933 /* 1934 * If there are attributes associated with the file then blow them away 1935 * now. The code calls a routine that recursively deconstructs the 1936 * attribute fork. If also blows away the in-core attribute fork. 1937 */ 1938 if (xfs_inode_has_attr_fork(ip)) { 1939 error = xfs_attr_inactive(ip); 1940 if (error) 1941 goto out; 1942 } 1943 1944 ASSERT(ip->i_forkoff == 0); 1945 1946 /* 1947 * Free the inode. 1948 */ 1949 error = xfs_inactive_ifree(ip); 1950 1951 out: 1952 /* 1953 * We're done making metadata updates for this inode, so we can release 1954 * the attached dquots. 1955 */ 1956 xfs_qm_dqdetach(ip); 1957 return error; 1958 } 1959 1960 /* 1961 * In-Core Unlinked List Lookups 1962 * ============================= 1963 * 1964 * Every inode is supposed to be reachable from some other piece of metadata 1965 * with the exception of the root directory. Inodes with a connection to a 1966 * file descriptor but not linked from anywhere in the on-disk directory tree 1967 * are collectively known as unlinked inodes, though the filesystem itself 1968 * maintains links to these inodes so that on-disk metadata are consistent. 1969 * 1970 * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI 1971 * header contains a number of buckets that point to an inode, and each inode 1972 * record has a pointer to the next inode in the hash chain. This 1973 * singly-linked list causes scaling problems in the iunlink remove function 1974 * because we must walk that list to find the inode that points to the inode 1975 * being removed from the unlinked hash bucket list. 1976 * 1977 * Hence we keep an in-memory double linked list to link each inode on an 1978 * unlinked list. 
Because there are 64 unlinked lists per AGI, keeping pointer 1979 * based lists would require having 64 list heads in the perag, one for each 1980 * list. This is expensive in terms of memory (think millions of AGs) and cache 1981 * misses on lookups. Instead, use the fact that inodes on the unlinked list 1982 * must be referenced at the VFS level to keep them on the list and hence we 1983 * have an existence guarantee for inodes on the unlinked list. 1984 * 1985 * Given we have an existence guarantee, we can use lockless inode cache lookups 1986 * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode 1987 * for the double linked unlinked list, and we don't need any extra locking to 1988 * keep the list safe as all manipulations are done under the AGI buffer lock. 1989 * Keeping the list up to date does not require memory allocation, just finding 1990 * the XFS inode and updating the next/prev unlinked list aginos. 1991 */ 1992 1993 /* 1994 * Find an inode on the unlinked list. This does not take references to the 1995 * inode as we have existence guarantees by holding the AGI buffer lock and that 1996 * only unlinked, referenced inodes can be on the unlinked inode list. If we 1997 * don't find the inode in cache, then let the caller handle the situation. 1998 */ 1999 struct xfs_inode * 2000 xfs_iunlink_lookup( 2001 struct xfs_perag *pag, 2002 xfs_agino_t agino) 2003 { 2004 struct xfs_inode *ip; 2005 2006 rcu_read_lock(); 2007 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 2008 if (!ip) { 2009 /* Caller can handle inode not being in memory. */ 2010 rcu_read_unlock(); 2011 return NULL; 2012 } 2013 2014 /* 2015 * Inode in RCU freeing limbo should not happen. Warn about this and 2016 * let the caller handle the failure. 2017 */ 2018 if (WARN_ON_ONCE(!ip->i_ino)) { 2019 rcu_read_unlock(); 2020 return NULL; 2021 } 2022 ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)); 2023 rcu_read_unlock(); 2024 return ip; 2025 } 2026 2027 /* 2028 * Update the prev pointer of the next agino. Returns -ENOLINK if the inode 2029 * is not in cache. 2030 */ 2031 static int 2032 xfs_iunlink_update_backref( 2033 struct xfs_perag *pag, 2034 xfs_agino_t prev_agino, 2035 xfs_agino_t next_agino) 2036 { 2037 struct xfs_inode *ip; 2038 2039 /* No update necessary if we are at the end of the list. */ 2040 if (next_agino == NULLAGINO) 2041 return 0; 2042 2043 ip = xfs_iunlink_lookup(pag, next_agino); 2044 if (!ip) 2045 return -ENOLINK; 2046 2047 ip->i_prev_unlinked = prev_agino; 2048 return 0; 2049 } 2050 2051 /* 2052 * Point the AGI unlinked bucket at an inode and log the results. The caller 2053 * is responsible for validating the old value. 2054 */ 2055 STATIC int 2056 xfs_iunlink_update_bucket( 2057 struct xfs_trans *tp, 2058 struct xfs_perag *pag, 2059 struct xfs_buf *agibp, 2060 unsigned int bucket_index, 2061 xfs_agino_t new_agino) 2062 { 2063 struct xfs_agi *agi = agibp->b_addr; 2064 xfs_agino_t old_value; 2065 int offset; 2066 2067 ASSERT(xfs_verify_agino_or_null(pag, new_agino)); 2068 2069 old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2070 trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, 2071 old_value, new_agino); 2072 2073 /* 2074 * We should never find the head of the list already set to the value 2075 * passed in because either we're adding or removing ourselves from the 2076 * head of the list. 
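 *
 * For example (illustrative aginos): when inserting inode 117 at the head of
 * its bucket, the old head must be some other inode or NULLAGINO. If the
 * bucket already reads 117, the on-disk chain would point at itself, so we
 * flag corruption instead of logging the update.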
2077 */ 2078 if (old_value == new_agino) { 2079 xfs_buf_mark_corrupt(agibp); 2080 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2081 return -EFSCORRUPTED; 2082 } 2083 2084 agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); 2085 offset = offsetof(struct xfs_agi, agi_unlinked) + 2086 (sizeof(xfs_agino_t) * bucket_index); 2087 xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); 2088 return 0; 2089 } 2090 2091 /* 2092 * Load the inode @next_agino into the cache and set its prev_unlinked pointer 2093 * to @prev_agino. Caller must hold the AGI to synchronize with other changes 2094 * to the unlinked list. 2095 */ 2096 STATIC int 2097 xfs_iunlink_reload_next( 2098 struct xfs_trans *tp, 2099 struct xfs_buf *agibp, 2100 xfs_agino_t prev_agino, 2101 xfs_agino_t next_agino) 2102 { 2103 struct xfs_perag *pag = agibp->b_pag; 2104 struct xfs_mount *mp = pag->pag_mount; 2105 struct xfs_inode *next_ip = NULL; 2106 xfs_ino_t ino; 2107 int error; 2108 2109 ASSERT(next_agino != NULLAGINO); 2110 2111 #ifdef DEBUG 2112 rcu_read_lock(); 2113 next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); 2114 ASSERT(next_ip == NULL); 2115 rcu_read_unlock(); 2116 #endif 2117 2118 xfs_info_ratelimited(mp, 2119 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", 2120 next_agino, pag->pag_agno); 2121 2122 /* 2123 * Use an untrusted lookup just to be cautious in case the AGI has been 2124 * corrupted and now points at a free inode. That shouldn't happen, 2125 * but we'd rather shut down now since we're already running in a weird 2126 * situation. 2127 */ 2128 ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); 2129 error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); 2130 if (error) { 2131 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2132 return error; 2133 } 2134 2135 /* If this is not an unlinked inode, something is very wrong. */ 2136 if (VFS_I(next_ip)->i_nlink != 0) { 2137 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2138 error = -EFSCORRUPTED; 2139 goto rele; 2140 } 2141 2142 next_ip->i_prev_unlinked = prev_agino; 2143 trace_xfs_iunlink_reload_next(next_ip); 2144 rele: 2145 ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); 2146 if (xfs_is_quotacheck_running(mp) && next_ip) 2147 xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); 2148 xfs_irele(next_ip); 2149 return error; 2150 } 2151 2152 static int 2153 xfs_iunlink_insert_inode( 2154 struct xfs_trans *tp, 2155 struct xfs_perag *pag, 2156 struct xfs_buf *agibp, 2157 struct xfs_inode *ip) 2158 { 2159 struct xfs_mount *mp = tp->t_mountp; 2160 struct xfs_agi *agi = agibp->b_addr; 2161 xfs_agino_t next_agino; 2162 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2163 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2164 int error; 2165 2166 /* 2167 * Get the index into the agi hash table for the list this inode will 2168 * go on. Make sure the pointer isn't garbage and that this inode 2169 * isn't already on the list. 2170 */ 2171 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2172 if (next_agino == agino || 2173 !xfs_verify_agino_or_null(pag, next_agino)) { 2174 xfs_buf_mark_corrupt(agibp); 2175 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2176 return -EFSCORRUPTED; 2177 } 2178 2179 /* 2180 * Update the prev pointer in the next inode to point back to this 2181 * inode. 
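 *
 * A rough sketch of the state after inserting inode A into a bucket whose
 * current head is B (agino values, matching the updates below):
 *
 *	B.i_prev_unlinked	= A	-- in-core backref set here
 *	A.i_next_unlinked	= B	-- logged via xfs_iunlink_log_inode()
 *	A.i_prev_unlinked	= NULLAGINO
 *	agi_unlinked[bucket]	= A	-- xfs_iunlink_update_bucket()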
2182 */ 2183 error = xfs_iunlink_update_backref(pag, agino, next_agino); 2184 if (error == -ENOLINK) 2185 error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); 2186 if (error) 2187 return error; 2188 2189 if (next_agino != NULLAGINO) { 2190 /* 2191 * There is already another inode in the bucket, so point this 2192 * inode to the current head of the list. 2193 */ 2194 error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); 2195 if (error) 2196 return error; 2197 ip->i_next_unlinked = next_agino; 2198 } 2199 2200 /* Point the head of the list to point to this inode. */ 2201 ip->i_prev_unlinked = NULLAGINO; 2202 return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); 2203 } 2204 2205 /* 2206 * This is called when the inode's link count has gone to 0 or we are creating 2207 * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. 2208 * 2209 * We place the on-disk inode on a list in the AGI. It will be pulled from this 2210 * list when the inode is freed. 2211 */ 2212 int 2213 xfs_iunlink( 2214 struct xfs_trans *tp, 2215 struct xfs_inode *ip) 2216 { 2217 struct xfs_mount *mp = tp->t_mountp; 2218 struct xfs_perag *pag; 2219 struct xfs_buf *agibp; 2220 int error; 2221 2222 ASSERT(VFS_I(ip)->i_nlink == 0); 2223 ASSERT(VFS_I(ip)->i_mode != 0); 2224 trace_xfs_iunlink(ip); 2225 2226 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2227 2228 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2229 error = xfs_read_agi(pag, tp, 0, &agibp); 2230 if (error) 2231 goto out; 2232 2233 error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); 2234 out: 2235 xfs_perag_put(pag); 2236 return error; 2237 } 2238 2239 static int 2240 xfs_iunlink_remove_inode( 2241 struct xfs_trans *tp, 2242 struct xfs_perag *pag, 2243 struct xfs_buf *agibp, 2244 struct xfs_inode *ip) 2245 { 2246 struct xfs_mount *mp = tp->t_mountp; 2247 struct xfs_agi *agi = agibp->b_addr; 2248 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2249 xfs_agino_t head_agino; 2250 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2251 int error; 2252 2253 trace_xfs_iunlink_remove(ip); 2254 2255 /* 2256 * Get the index into the agi hash table for the list this inode will 2257 * go on. Make sure the head pointer isn't garbage. 2258 */ 2259 head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2260 if (!xfs_verify_agino(pag, head_agino)) { 2261 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 2262 agi, sizeof(*agi)); 2263 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2264 return -EFSCORRUPTED; 2265 } 2266 2267 /* 2268 * Set our inode's next_unlinked pointer to NULL and then return 2269 * the old pointer value so that we can update whatever was previous 2270 * to us in the list to point to whatever was next in the list. 2271 */ 2272 error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); 2273 if (error) 2274 return error; 2275 2276 /* 2277 * Update the prev pointer in the next inode to point back to previous 2278 * inode in the chain. 
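 *
 * A rough sketch of removing inode X from the chain P -> X -> N
 * (agino values, matching the updates below):
 *
 *	N.i_prev_unlinked	= P	-- backref update, may reload N
 *	P.i_next_unlinked	= N	-- logged via xfs_iunlink_log_inode()
 *	agi_unlinked[bucket]	= N	-- only if X was the bucket head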
2279 */ 2280 error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, 2281 ip->i_next_unlinked); 2282 if (error == -ENOLINK) 2283 error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, 2284 ip->i_next_unlinked); 2285 if (error) 2286 return error; 2287 2288 if (head_agino != agino) { 2289 struct xfs_inode *prev_ip; 2290 2291 prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); 2292 if (!prev_ip) { 2293 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 2294 return -EFSCORRUPTED; 2295 } 2296 2297 error = xfs_iunlink_log_inode(tp, prev_ip, pag, 2298 ip->i_next_unlinked); 2299 prev_ip->i_next_unlinked = ip->i_next_unlinked; 2300 } else { 2301 /* Point the head of the list to the next unlinked inode. */ 2302 error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, 2303 ip->i_next_unlinked); 2304 } 2305 2306 ip->i_next_unlinked = NULLAGINO; 2307 ip->i_prev_unlinked = 0; 2308 return error; 2309 } 2310 2311 /* 2312 * Pull the on-disk inode from the AGI unlinked list. 2313 */ 2314 int 2315 xfs_iunlink_remove( 2316 struct xfs_trans *tp, 2317 struct xfs_perag *pag, 2318 struct xfs_inode *ip) 2319 { 2320 struct xfs_buf *agibp; 2321 int error; 2322 2323 trace_xfs_iunlink_remove(ip); 2324 2325 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2326 error = xfs_read_agi(pag, tp, 0, &agibp); 2327 if (error) 2328 return error; 2329 2330 return xfs_iunlink_remove_inode(tp, pag, agibp, ip); 2331 } 2332 2333 /* 2334 * Look up the inode number specified and if it is not already marked XFS_ISTALE 2335 * mark it stale. We should only find clean inodes in this lookup that aren't 2336 * already stale. 2337 */ 2338 static void 2339 xfs_ifree_mark_inode_stale( 2340 struct xfs_perag *pag, 2341 struct xfs_inode *free_ip, 2342 xfs_ino_t inum) 2343 { 2344 struct xfs_mount *mp = pag->pag_mount; 2345 struct xfs_inode_log_item *iip; 2346 struct xfs_inode *ip; 2347 2348 retry: 2349 rcu_read_lock(); 2350 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); 2351 2352 /* Inode not in memory, nothing to do */ 2353 if (!ip) { 2354 rcu_read_unlock(); 2355 return; 2356 } 2357 2358 /* 2359 * because this is an RCU protected lookup, we could find a recently 2360 * freed or even reallocated inode during the lookup. We need to check 2361 * under the i_flags_lock for a valid inode here. Skip it if it is not 2362 * valid, the wrong inode or stale. 2363 */ 2364 spin_lock(&ip->i_flags_lock); 2365 if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) 2366 goto out_iflags_unlock; 2367 2368 /* 2369 * Don't try to lock/unlock the current inode, but we _cannot_ skip the 2370 * other inodes that we did not find in the list attached to the buffer 2371 * and are not already marked stale. If we can't lock it, back off and 2372 * retry. 2373 */ 2374 if (ip != free_ip) { 2375 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2376 spin_unlock(&ip->i_flags_lock); 2377 rcu_read_unlock(); 2378 delay(1); 2379 goto retry; 2380 } 2381 } 2382 ip->i_flags |= XFS_ISTALE; 2383 2384 /* 2385 * If the inode is flushing, it is already attached to the buffer. All 2386 * we needed to do here is mark the inode stale so buffer IO completion 2387 * will remove it from the AIL. 2388 */ 2389 iip = ip->i_itemp; 2390 if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { 2391 ASSERT(!list_empty(&iip->ili_item.li_bio_list)); 2392 ASSERT(iip->ili_last_fields); 2393 goto out_iunlock; 2394 } 2395 2396 /* 2397 * Inodes not attached to the buffer can be released immediately. 
2398 * Everything else has to go through xfs_iflush_abort() on journal 2399 * commit as the flock synchronises removal of the inode from the 2400 * cluster buffer against inode reclaim. 2401 */ 2402 if (!iip || list_empty(&iip->ili_item.li_bio_list)) 2403 goto out_iunlock; 2404 2405 __xfs_iflags_set(ip, XFS_IFLUSHING); 2406 spin_unlock(&ip->i_flags_lock); 2407 rcu_read_unlock(); 2408 2409 /* we have a dirty inode in memory that has not yet been flushed. */ 2410 spin_lock(&iip->ili_lock); 2411 iip->ili_last_fields = iip->ili_fields; 2412 iip->ili_fields = 0; 2413 iip->ili_fsync_fields = 0; 2414 spin_unlock(&iip->ili_lock); 2415 ASSERT(iip->ili_last_fields); 2416 2417 if (ip != free_ip) 2418 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2419 return; 2420 2421 out_iunlock: 2422 if (ip != free_ip) 2423 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2424 out_iflags_unlock: 2425 spin_unlock(&ip->i_flags_lock); 2426 rcu_read_unlock(); 2427 } 2428 2429 /* 2430 * A big issue when freeing the inode cluster is that we _cannot_ skip any 2431 * inodes that are in memory - they all must be marked stale and attached to 2432 * the cluster buffer. 2433 */ 2434 static int 2435 xfs_ifree_cluster( 2436 struct xfs_trans *tp, 2437 struct xfs_perag *pag, 2438 struct xfs_inode *free_ip, 2439 struct xfs_icluster *xic) 2440 { 2441 struct xfs_mount *mp = free_ip->i_mount; 2442 struct xfs_ino_geometry *igeo = M_IGEO(mp); 2443 struct xfs_buf *bp; 2444 xfs_daddr_t blkno; 2445 xfs_ino_t inum = xic->first_ino; 2446 int nbufs; 2447 int i, j; 2448 int ioffset; 2449 int error; 2450 2451 nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; 2452 2453 for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { 2454 /* 2455 * The allocation bitmap tells us which inodes of the chunk were 2456 * physically allocated. Skip the cluster if an inode falls into 2457 * a sparse region. 2458 */ 2459 ioffset = inum - xic->first_ino; 2460 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { 2461 ASSERT(ioffset % igeo->inodes_per_cluster == 0); 2462 continue; 2463 } 2464 2465 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2466 XFS_INO_TO_AGBNO(mp, inum)); 2467 2468 /* 2469 * We obtain and lock the backing buffer first in the process 2470 * here to ensure dirty inodes attached to the buffer remain in 2471 * the flushing state while we mark them stale. 2472 * 2473 * If we scan the in-memory inodes first, then buffer IO can 2474 * complete before we get a lock on it, and hence we may fail 2475 * to mark all the active inodes on the buffer stale. 2476 */ 2477 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2478 mp->m_bsize * igeo->blocks_per_cluster, 2479 XBF_UNMAPPED, &bp); 2480 if (error) 2481 return error; 2482 2483 /* 2484 * This buffer may not have been correctly initialised as we 2485 * didn't read it from disk. That's not important because we are 2486 * only using to mark the buffer as stale in the log, and to 2487 * attach stale cached inodes on it. That means it will never be 2488 * dispatched for IO. If it is, we want to know about it, and we 2489 * want it to fail. We can acheive this by adding a write 2490 * verifier to the buffer. 2491 */ 2492 bp->b_ops = &xfs_inode_buf_ops; 2493 2494 /* 2495 * Now we need to set all the cached clean inodes as XFS_ISTALE, 2496 * too. This requires lookups, and will skip inodes that we've 2497 * already marked XFS_ISTALE. 
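 *
 * For example, with a hypothetical geometry of a 64-inode chunk backed by
 * two cluster buffers of 32 inodes each, the outer loop visits both buffers
 * and the loop below marks up to 32 in-core inodes stale per buffer,
 * skipping anything not in memory or already XFS_ISTALE.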
2498 */ 2499 for (i = 0; i < igeo->inodes_per_cluster; i++) 2500 xfs_ifree_mark_inode_stale(pag, free_ip, inum + i); 2501 2502 xfs_trans_stale_inode_buf(tp, bp); 2503 xfs_trans_binval(tp, bp); 2504 } 2505 return 0; 2506 } 2507 2508 /* 2509 * This is called to return an inode to the inode free list. The inode should 2510 * already be truncated to 0 length and have no pages associated with it. This 2511 * routine also assumes that the inode is already a part of the transaction. 2512 * 2513 * The on-disk copy of the inode will have been added to the list of unlinked 2514 * inodes in the AGI. We need to remove the inode from that list atomically with 2515 * respect to freeing it here. 2516 */ 2517 int 2518 xfs_ifree( 2519 struct xfs_trans *tp, 2520 struct xfs_inode *ip) 2521 { 2522 struct xfs_mount *mp = ip->i_mount; 2523 struct xfs_perag *pag; 2524 struct xfs_icluster xic = { 0 }; 2525 struct xfs_inode_log_item *iip = ip->i_itemp; 2526 int error; 2527 2528 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); 2529 ASSERT(VFS_I(ip)->i_nlink == 0); 2530 ASSERT(ip->i_df.if_nextents == 0); 2531 ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); 2532 ASSERT(ip->i_nblocks == 0); 2533 2534 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2535 2536 /* 2537 * Free the inode first so that we guarantee that the AGI lock is going 2538 * to be taken before we remove the inode from the unlinked list. This 2539 * makes the AGI lock -> unlinked list modification order the same as 2540 * used in O_TMPFILE creation. 2541 */ 2542 error = xfs_difree(tp, pag, ip->i_ino, &xic); 2543 if (error) 2544 goto out; 2545 2546 error = xfs_iunlink_remove(tp, pag, ip); 2547 if (error) 2548 goto out; 2549 2550 /* 2551 * Free any local-format data sitting around before we reset the 2552 * data fork to extents format. Note that the attr fork data has 2553 * already been freed by xfs_attr_inactive. 2554 */ 2555 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { 2556 kfree(ip->i_df.if_data); 2557 ip->i_df.if_data = NULL; 2558 ip->i_df.if_bytes = 0; 2559 } 2560 2561 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ 2562 ip->i_diflags = 0; 2563 ip->i_diflags2 = mp->m_ino_geo.new_diflags2; 2564 ip->i_forkoff = 0; /* mark the attr fork not in use */ 2565 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; 2566 if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) 2567 xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS); 2568 2569 /* Don't attempt to replay owner changes for a deleted inode */ 2570 spin_lock(&iip->ili_lock); 2571 iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); 2572 spin_unlock(&iip->ili_lock); 2573 2574 /* 2575 * Bump the generation count so no one will be confused 2576 * by reincarnations of this inode. 2577 */ 2578 VFS_I(ip)->i_generation++; 2579 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2580 2581 if (xic.deleted) 2582 error = xfs_ifree_cluster(tp, pag, ip, &xic); 2583 out: 2584 xfs_perag_put(pag); 2585 return error; 2586 } 2587 2588 /* 2589 * This is called to unpin an inode. The caller must have the inode locked 2590 * in at least shared mode so that the buffer cannot be subsequently pinned 2591 * once someone is waiting for it to be unpinned. 
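 *
 * Unpinning is roughly a two step dance, sketched here and implemented in
 * the two helpers below: force the log up to the inode's committed sequence
 * so journal IO completion drops the pin count, then sleep on the pinned
 * bit until the pin count reaches zero:
 *
 *	xfs_log_force_seq(mp, ip->i_itemp->ili_commit_seq, 0, NULL);
 *	wait on __XFS_IPINNED_BIT until !xfs_ipincount(ip)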
2592 */ 2593 static void 2594 xfs_iunpin( 2595 struct xfs_inode *ip) 2596 { 2597 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); 2598 2599 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2600 2601 /* Give the log a push to start the unpinning I/O */ 2602 xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); 2603 2604 } 2605 2606 static void 2607 __xfs_iunpin_wait( 2608 struct xfs_inode *ip) 2609 { 2610 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); 2611 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); 2612 2613 xfs_iunpin(ip); 2614 2615 do { 2616 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 2617 if (xfs_ipincount(ip)) 2618 io_schedule(); 2619 } while (xfs_ipincount(ip)); 2620 finish_wait(wq, &wait.wq_entry); 2621 } 2622 2623 void 2624 xfs_iunpin_wait( 2625 struct xfs_inode *ip) 2626 { 2627 if (xfs_ipincount(ip)) 2628 __xfs_iunpin_wait(ip); 2629 } 2630 2631 /* 2632 * Removing an inode from the namespace involves removing the directory entry 2633 * and dropping the link count on the inode. Removing the directory entry can 2634 * result in locking an AGF (directory blocks were freed) and removing a link 2635 * count can result in placing the inode on an unlinked list which results in 2636 * locking an AGI. 2637 * 2638 * The big problem here is that we have an ordering constraint on AGF and AGI 2639 * locking - inode allocation locks the AGI, then can allocate a new extent for 2640 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode 2641 * removes the inode from the unlinked list, requiring that we lock the AGI 2642 * first, and then freeing the inode can result in an inode chunk being freed 2643 * and hence freeing disk space requiring that we lock an AGF. 2644 * 2645 * Hence the ordering that is imposed by other parts of the code is AGI before 2646 * AGF. This means we cannot remove the directory entry before we drop the inode 2647 * reference count and put it on the unlinked list as this results in a lock 2648 * order of AGF then AGI, and this can deadlock against inode allocation and 2649 * freeing. Therefore we must drop the link counts before we remove the 2650 * directory entry. 2651 * 2652 * This is still safe from a transactional point of view - it is not until we 2653 * get to xfs_defer_finish() that we have the possibility of multiple 2654 * transactions in this operation. Hence as long as we remove the directory 2655 * entry and drop the link count in the first transaction of the remove 2656 * operation, there are no transactional constraints on the ordering here. 2657 */ 2658 int 2659 xfs_remove( 2660 xfs_inode_t *dp, 2661 struct xfs_name *name, 2662 xfs_inode_t *ip) 2663 { 2664 xfs_mount_t *mp = dp->i_mount; 2665 xfs_trans_t *tp = NULL; 2666 int is_dir = S_ISDIR(VFS_I(ip)->i_mode); 2667 int dontcare; 2668 int error = 0; 2669 uint resblks; 2670 2671 trace_xfs_remove(dp, name); 2672 2673 if (xfs_is_shutdown(mp)) 2674 return -EIO; 2675 if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) 2676 return -EIO; 2677 2678 error = xfs_qm_dqattach(dp); 2679 if (error) 2680 goto std_return; 2681 2682 error = xfs_qm_dqattach(ip); 2683 if (error) 2684 goto std_return; 2685 2686 /* 2687 * We try to get the real space reservation first, allowing for 2688 * directory btree deletion(s) implying possible bmap insert(s). 
If we 2689 * can't get the space reservation then we use 0 instead, and avoid the 2690 * bmap btree insert(s) in the directory code by, if the bmap insert 2691 * tries to happen, instead trimming the LAST block from the directory. 2692 * 2693 * Ignore EDQUOT and ENOSPC being returned via nospace_error because 2694 * the directory code can handle a reservationless update and we don't 2695 * want to prevent a user from trying to free space by deleting things. 2696 */ 2697 resblks = XFS_REMOVE_SPACE_RES(mp); 2698 error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, 2699 &tp, &dontcare); 2700 if (error) { 2701 ASSERT(error != -ENOSPC); 2702 goto std_return; 2703 } 2704 2705 /* 2706 * If we're removing a directory perform some additional validation. 2707 */ 2708 if (is_dir) { 2709 ASSERT(VFS_I(ip)->i_nlink >= 2); 2710 if (VFS_I(ip)->i_nlink != 2) { 2711 error = -ENOTEMPTY; 2712 goto out_trans_cancel; 2713 } 2714 if (!xfs_dir_isempty(ip)) { 2715 error = -ENOTEMPTY; 2716 goto out_trans_cancel; 2717 } 2718 2719 /* Drop the link from ip's "..". */ 2720 error = xfs_droplink(tp, dp); 2721 if (error) 2722 goto out_trans_cancel; 2723 2724 /* Drop the "." link from ip to self. */ 2725 error = xfs_droplink(tp, ip); 2726 if (error) 2727 goto out_trans_cancel; 2728 2729 /* 2730 * Point the unlinked child directory's ".." entry to the root 2731 * directory to eliminate back-references to inodes that may 2732 * get freed before the child directory is closed. If the fs 2733 * gets shrunk, this can lead to dirent inode validation errors. 2734 */ 2735 if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { 2736 error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, 2737 tp->t_mountp->m_sb.sb_rootino, 0); 2738 if (error) 2739 goto out_trans_cancel; 2740 } 2741 } else { 2742 /* 2743 * When removing a non-directory we need to log the parent 2744 * inode here. For a directory this is done implicitly 2745 * by the xfs_droplink call for the ".." entry. 2746 */ 2747 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2748 } 2749 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2750 2751 /* Drop the link from dp to ip. */ 2752 error = xfs_droplink(tp, ip); 2753 if (error) 2754 goto out_trans_cancel; 2755 2756 error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); 2757 if (error) { 2758 ASSERT(error != -ENOENT); 2759 goto out_trans_cancel; 2760 } 2761 2762 /* 2763 * Drop the link from dp to ip, and if ip was a directory, remove the 2764 * '.' and '..' references since we freed the directory. 2765 */ 2766 xfs_dir_update_hook(dp, ip, -1, name); 2767 2768 /* 2769 * If this is a synchronous mount, make sure that the 2770 * remove transaction goes to disk before returning to 2771 * the user. 2772 */ 2773 if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) 2774 xfs_trans_set_sync(tp); 2775 2776 error = xfs_trans_commit(tp); 2777 if (error) 2778 goto std_return; 2779 2780 if (is_dir && xfs_inode_is_filestream(ip)) 2781 xfs_filestream_deassociate(ip); 2782 2783 return 0; 2784 2785 out_trans_cancel: 2786 xfs_trans_cancel(tp); 2787 std_return: 2788 return error; 2789 } 2790 2791 /* 2792 * Enter all inodes for a rename transaction into a sorted array. 
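 *
 * Sorting by inode number gives every operation the same lock acquisition
 * order and so avoids ABBA deadlocks between concurrent renames. For
 * example (hypothetical inode numbers), with src_dp = 200, target_dp = 17
 * and src_ip = 90, the locks are always taken as 17, then 90, then 200,
 * no matter which directory is the source and which is the target.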
2793 */ 2794 #define __XFS_SORT_INODES 5 2795 STATIC void 2796 xfs_sort_for_rename( 2797 struct xfs_inode *dp1, /* in: old (source) directory inode */ 2798 struct xfs_inode *dp2, /* in: new (target) directory inode */ 2799 struct xfs_inode *ip1, /* in: inode of old entry */ 2800 struct xfs_inode *ip2, /* in: inode of new entry */ 2801 struct xfs_inode *wip, /* in: whiteout inode */ 2802 struct xfs_inode **i_tab,/* out: sorted array of inodes */ 2803 int *num_inodes) /* in/out: inodes in array */ 2804 { 2805 int i, j; 2806 2807 ASSERT(*num_inodes == __XFS_SORT_INODES); 2808 memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); 2809 2810 /* 2811 * i_tab contains a list of pointers to inodes. We initialize 2812 * the table here & we'll sort it. We will then use it to 2813 * order the acquisition of the inode locks. 2814 * 2815 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2816 */ 2817 i = 0; 2818 i_tab[i++] = dp1; 2819 i_tab[i++] = dp2; 2820 i_tab[i++] = ip1; 2821 if (ip2) 2822 i_tab[i++] = ip2; 2823 if (wip) 2824 i_tab[i++] = wip; 2825 *num_inodes = i; 2826 2827 /* 2828 * Sort the elements via bubble sort. (Remember, there are at 2829 * most 5 elements to sort, so this is adequate.) 2830 */ 2831 for (i = 0; i < *num_inodes; i++) { 2832 for (j = 1; j < *num_inodes; j++) { 2833 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { 2834 struct xfs_inode *temp = i_tab[j]; 2835 i_tab[j] = i_tab[j-1]; 2836 i_tab[j-1] = temp; 2837 } 2838 } 2839 } 2840 } 2841 2842 static int 2843 xfs_finish_rename( 2844 struct xfs_trans *tp) 2845 { 2846 /* 2847 * If this is a synchronous mount, make sure that the rename transaction 2848 * goes to disk before returning to the user. 2849 */ 2850 if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) 2851 xfs_trans_set_sync(tp); 2852 2853 return xfs_trans_commit(tp); 2854 } 2855 2856 /* 2857 * xfs_cross_rename() 2858 * 2859 * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall 2860 */ 2861 STATIC int 2862 xfs_cross_rename( 2863 struct xfs_trans *tp, 2864 struct xfs_inode *dp1, 2865 struct xfs_name *name1, 2866 struct xfs_inode *ip1, 2867 struct xfs_inode *dp2, 2868 struct xfs_name *name2, 2869 struct xfs_inode *ip2, 2870 int spaceres) 2871 { 2872 int error = 0; 2873 int ip1_flags = 0; 2874 int ip2_flags = 0; 2875 int dp2_flags = 0; 2876 2877 /* Swap inode number for dirent in first parent */ 2878 error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); 2879 if (error) 2880 goto out_trans_abort; 2881 2882 /* Swap inode number for dirent in second parent */ 2883 error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); 2884 if (error) 2885 goto out_trans_abort; 2886 2887 /* 2888 * If we're renaming one or more directories across different parents, 2889 * update the respective ".." entries (and link counts) to match the new 2890 * parents. 2891 */ 2892 if (dp1 != dp2) { 2893 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2894 2895 if (S_ISDIR(VFS_I(ip2)->i_mode)) { 2896 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, 2897 dp1->i_ino, spaceres); 2898 if (error) 2899 goto out_trans_abort; 2900 2901 /* transfer ip2 ".." 
reference to dp1 */ 2902 if (!S_ISDIR(VFS_I(ip1)->i_mode)) { 2903 error = xfs_droplink(tp, dp2); 2904 if (error) 2905 goto out_trans_abort; 2906 xfs_bumplink(tp, dp1); 2907 } 2908 2909 /* 2910 * Although ip1 isn't changed here, userspace needs 2911 * to be warned about the change, so that applications 2912 * relying on it (like backup ones), will properly 2913 * notify the change 2914 */ 2915 ip1_flags |= XFS_ICHGTIME_CHG; 2916 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2917 } 2918 2919 if (S_ISDIR(VFS_I(ip1)->i_mode)) { 2920 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, 2921 dp2->i_ino, spaceres); 2922 if (error) 2923 goto out_trans_abort; 2924 2925 /* transfer ip1 ".." reference to dp2 */ 2926 if (!S_ISDIR(VFS_I(ip2)->i_mode)) { 2927 error = xfs_droplink(tp, dp1); 2928 if (error) 2929 goto out_trans_abort; 2930 xfs_bumplink(tp, dp2); 2931 } 2932 2933 /* 2934 * Although ip2 isn't changed here, userspace needs 2935 * to be warned about the change, so that applications 2936 * relying on it (like backup ones), will properly 2937 * notify the change 2938 */ 2939 ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2940 ip2_flags |= XFS_ICHGTIME_CHG; 2941 } 2942 } 2943 2944 if (ip1_flags) { 2945 xfs_trans_ichgtime(tp, ip1, ip1_flags); 2946 xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); 2947 } 2948 if (ip2_flags) { 2949 xfs_trans_ichgtime(tp, ip2, ip2_flags); 2950 xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); 2951 } 2952 if (dp2_flags) { 2953 xfs_trans_ichgtime(tp, dp2, dp2_flags); 2954 xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); 2955 } 2956 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2957 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); 2958 2959 /* 2960 * Inform our hook clients that we've finished an exchange operation as 2961 * follows: removed the source and target files from their directories; 2962 * added the target to the source directory; and added the source to 2963 * the target directory. All inodes are locked, so it's ok to model a 2964 * rename this way so long as we say we deleted entries before we add 2965 * new ones. 2966 */ 2967 xfs_dir_update_hook(dp1, ip1, -1, name1); 2968 xfs_dir_update_hook(dp2, ip2, -1, name2); 2969 xfs_dir_update_hook(dp1, ip2, 1, name1); 2970 xfs_dir_update_hook(dp2, ip1, 1, name2); 2971 2972 return xfs_finish_rename(tp); 2973 2974 out_trans_abort: 2975 xfs_trans_cancel(tp); 2976 return error; 2977 } 2978 2979 /* 2980 * xfs_rename_alloc_whiteout() 2981 * 2982 * Return a referenced, unlinked, unlocked inode that can be used as a 2983 * whiteout in a rename transaction. We use a tmpfile inode here so that if we 2984 * crash between allocating the inode and linking it into the rename transaction 2985 * recovery will free the inode and we won't leak it. 2986 */ 2987 static int 2988 xfs_rename_alloc_whiteout( 2989 struct mnt_idmap *idmap, 2990 struct xfs_name *src_name, 2991 struct xfs_inode *dp, 2992 struct xfs_inode **wip) 2993 { 2994 struct xfs_inode *tmpfile; 2995 struct qstr name; 2996 int error; 2997 2998 error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, 2999 &tmpfile); 3000 if (error) 3001 return error; 3002 3003 name.name = src_name->name; 3004 name.len = src_name->len; 3005 error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name); 3006 if (error) { 3007 xfs_finish_inode_setup(tmpfile); 3008 xfs_irele(tmpfile); 3009 return error; 3010 } 3011 3012 /* 3013 * Prepare the tmpfile inode as if it were created through the VFS. 3014 * Complete the inode setup and flag it as linkable. 
nlink is already 3015 * zero, so we can skip the drop_nlink. 3016 */ 3017 xfs_setup_iops(tmpfile); 3018 xfs_finish_inode_setup(tmpfile); 3019 VFS_I(tmpfile)->i_state |= I_LINKABLE; 3020 3021 *wip = tmpfile; 3022 return 0; 3023 } 3024 3025 /* 3026 * xfs_rename 3027 */ 3028 int 3029 xfs_rename( 3030 struct mnt_idmap *idmap, 3031 struct xfs_inode *src_dp, 3032 struct xfs_name *src_name, 3033 struct xfs_inode *src_ip, 3034 struct xfs_inode *target_dp, 3035 struct xfs_name *target_name, 3036 struct xfs_inode *target_ip, 3037 unsigned int flags) 3038 { 3039 struct xfs_mount *mp = src_dp->i_mount; 3040 struct xfs_trans *tp; 3041 struct xfs_inode *wip = NULL; /* whiteout inode */ 3042 struct xfs_inode *inodes[__XFS_SORT_INODES]; 3043 int i; 3044 int num_inodes = __XFS_SORT_INODES; 3045 bool new_parent = (src_dp != target_dp); 3046 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); 3047 int spaceres; 3048 bool retried = false; 3049 int error, nospace_error = 0; 3050 3051 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 3052 3053 if ((flags & RENAME_EXCHANGE) && !target_ip) 3054 return -EINVAL; 3055 3056 /* 3057 * If we are doing a whiteout operation, allocate the whiteout inode 3058 * we will be placing at the target and ensure the type is set 3059 * appropriately. 3060 */ 3061 if (flags & RENAME_WHITEOUT) { 3062 error = xfs_rename_alloc_whiteout(idmap, src_name, 3063 target_dp, &wip); 3064 if (error) 3065 return error; 3066 3067 /* setup target dirent info as whiteout */ 3068 src_name->type = XFS_DIR3_FT_CHRDEV; 3069 } 3070 3071 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, 3072 inodes, &num_inodes); 3073 3074 retry: 3075 nospace_error = 0; 3076 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 3077 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); 3078 if (error == -ENOSPC) { 3079 nospace_error = error; 3080 spaceres = 0; 3081 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, 3082 &tp); 3083 } 3084 if (error) 3085 goto out_release_wip; 3086 3087 /* 3088 * Attach the dquots to the inodes 3089 */ 3090 error = xfs_qm_vop_rename_dqattach(inodes); 3091 if (error) 3092 goto out_trans_cancel; 3093 3094 /* 3095 * Lock all the participating inodes. Depending upon whether 3096 * the target_name exists in the target directory, and 3097 * whether the target directory is the same as the source 3098 * directory, we can lock from 2 to 5 inodes. 3099 */ 3100 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 3101 3102 /* 3103 * Join all the inodes to the transaction. From this point on, 3104 * we can rely on either trans_commit or trans_cancel to unlock 3105 * them. 3106 */ 3107 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 3108 if (new_parent) 3109 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 3110 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 3111 if (target_ip) 3112 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 3113 if (wip) 3114 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); 3115 3116 /* 3117 * If we are using project inheritance, we only allow renames 3118 * into our tree when the project IDs are the same; else the 3119 * tree quota mechanism would be circumvented. 3120 */ 3121 if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && 3122 target_dp->i_projid != src_ip->i_projid)) { 3123 error = -EXDEV; 3124 goto out_trans_cancel; 3125 } 3126 3127 /* RENAME_EXCHANGE is unique from here on. 
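 *
 * RENAME_EXCHANGE is the atomic swap variant of renameat2(2); for example
 * renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_EXCHANGE) swaps two
 * existing entries in place. That is why a missing target_ip was rejected
 * with -EINVAL above, and why xfs_cross_rename() only replaces directory
 * entries and never creates or removes them.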
*/ 3128 if (flags & RENAME_EXCHANGE) 3129 return xfs_cross_rename(tp, src_dp, src_name, src_ip, 3130 target_dp, target_name, target_ip, 3131 spaceres); 3132 3133 /* 3134 * Try to reserve quota to handle an expansion of the target directory. 3135 * We'll allow the rename to continue in reservationless mode if we hit 3136 * a space usage constraint. If we trigger reservationless mode, save 3137 * the errno if there isn't any free space in the target directory. 3138 */ 3139 if (spaceres != 0) { 3140 error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres, 3141 0, false); 3142 if (error == -EDQUOT || error == -ENOSPC) { 3143 if (!retried) { 3144 xfs_trans_cancel(tp); 3145 xfs_blockgc_free_quota(target_dp, 0); 3146 retried = true; 3147 goto retry; 3148 } 3149 3150 nospace_error = error; 3151 spaceres = 0; 3152 error = 0; 3153 } 3154 if (error) 3155 goto out_trans_cancel; 3156 } 3157 3158 /* 3159 * Check for expected errors before we dirty the transaction 3160 * so we can return an error without a transaction abort. 3161 */ 3162 if (target_ip == NULL) { 3163 /* 3164 * If there's no space reservation, check the entry will 3165 * fit before actually inserting it. 3166 */ 3167 if (!spaceres) { 3168 error = xfs_dir_canenter(tp, target_dp, target_name); 3169 if (error) 3170 goto out_trans_cancel; 3171 } 3172 } else { 3173 /* 3174 * If target exists and it's a directory, check that whether 3175 * it can be destroyed. 3176 */ 3177 if (S_ISDIR(VFS_I(target_ip)->i_mode) && 3178 (!xfs_dir_isempty(target_ip) || 3179 (VFS_I(target_ip)->i_nlink > 2))) { 3180 error = -EEXIST; 3181 goto out_trans_cancel; 3182 } 3183 } 3184 3185 /* 3186 * Lock the AGI buffers we need to handle bumping the nlink of the 3187 * whiteout inode off the unlinked list and to handle dropping the 3188 * nlink of the target inode. Per locking order rules, do this in 3189 * increasing AG order and before directory block allocation tries to 3190 * grab AGFs because we grab AGIs before AGFs. 3191 * 3192 * The (vfs) caller must ensure that if src is a directory then 3193 * target_ip is either null or an empty directory. 3194 */ 3195 for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { 3196 if (inodes[i] == wip || 3197 (inodes[i] == target_ip && 3198 (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { 3199 struct xfs_perag *pag; 3200 struct xfs_buf *bp; 3201 3202 pag = xfs_perag_get(mp, 3203 XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); 3204 error = xfs_read_agi(pag, tp, 0, &bp); 3205 xfs_perag_put(pag); 3206 if (error) 3207 goto out_trans_cancel; 3208 } 3209 } 3210 3211 /* 3212 * Directory entry creation below may acquire the AGF. Remove 3213 * the whiteout from the unlinked list first to preserve correct 3214 * AGI/AGF locking order. This dirties the transaction so failures 3215 * after this point will abort and log recovery will clean up the 3216 * mess. 3217 * 3218 * For whiteouts, we need to bump the link count on the whiteout 3219 * inode. After this point, we have a real link, clear the tmpfile 3220 * state flag from the inode so it doesn't accidentally get misused 3221 * in future. 3222 */ 3223 if (wip) { 3224 struct xfs_perag *pag; 3225 3226 ASSERT(VFS_I(wip)->i_nlink == 0); 3227 3228 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino)); 3229 error = xfs_iunlink_remove(tp, pag, wip); 3230 xfs_perag_put(pag); 3231 if (error) 3232 goto out_trans_cancel; 3233 3234 xfs_bumplink(tp, wip); 3235 VFS_I(wip)->i_state &= ~I_LINKABLE; 3236 } 3237 3238 /* 3239 * Set up the target. 
3240 */ 3241 if (target_ip == NULL) { 3242 /* 3243 * If target does not exist and the rename crosses 3244 * directories, adjust the target directory link count 3245 * to account for the ".." reference from the new entry. 3246 */ 3247 error = xfs_dir_createname(tp, target_dp, target_name, 3248 src_ip->i_ino, spaceres); 3249 if (error) 3250 goto out_trans_cancel; 3251 3252 xfs_trans_ichgtime(tp, target_dp, 3253 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3254 3255 if (new_parent && src_is_directory) { 3256 xfs_bumplink(tp, target_dp); 3257 } 3258 } else { /* target_ip != NULL */ 3259 /* 3260 * Link the source inode under the target name. 3261 * If the source inode is a directory and we are moving 3262 * it across directories, its ".." entry will be 3263 * inconsistent until we replace that down below. 3264 * 3265 * In case there is already an entry with the same 3266 * name at the destination directory, remove it first. 3267 */ 3268 error = xfs_dir_replace(tp, target_dp, target_name, 3269 src_ip->i_ino, spaceres); 3270 if (error) 3271 goto out_trans_cancel; 3272 3273 xfs_trans_ichgtime(tp, target_dp, 3274 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3275 3276 /* 3277 * Decrement the link count on the target since the target 3278 * dir no longer points to it. 3279 */ 3280 error = xfs_droplink(tp, target_ip); 3281 if (error) 3282 goto out_trans_cancel; 3283 3284 if (src_is_directory) { 3285 /* 3286 * Drop the link from the old "." entry. 3287 */ 3288 error = xfs_droplink(tp, target_ip); 3289 if (error) 3290 goto out_trans_cancel; 3291 } 3292 } /* target_ip != NULL */ 3293 3294 /* 3295 * Remove the source. 3296 */ 3297 if (new_parent && src_is_directory) { 3298 /* 3299 * Rewrite the ".." entry to point to the new 3300 * directory. 3301 */ 3302 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, 3303 target_dp->i_ino, spaceres); 3304 ASSERT(error != -EEXIST); 3305 if (error) 3306 goto out_trans_cancel; 3307 } 3308 3309 /* 3310 * We always want to hit the ctime on the source inode. 3311 * 3312 * This isn't strictly required by the standards since the source 3313 * inode isn't really being changed, but old unix file systems did 3314 * it and some incremental backup programs won't work without it. 3315 */ 3316 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 3317 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); 3318 3319 /* 3320 * Adjust the link count on src_dp. This is necessary when 3321 * renaming a directory, either within one parent when 3322 * the target existed, or across two parent directories. 3323 */ 3324 if (src_is_directory && (new_parent || target_ip != NULL)) { 3325 3326 /* 3327 * Decrement link count on src_directory since the 3328 * entry that's moved no longer points to it. 3329 */ 3330 error = xfs_droplink(tp, src_dp); 3331 if (error) 3332 goto out_trans_cancel; 3333 } 3334 3335 /* 3336 * For whiteouts, we only need to update the source dirent with the 3337 * inode number of the whiteout inode rather than removing it 3338 * altogether. 
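 *
 * Put differently: after RENAME_WHITEOUT moves "a" to "b", the name "a"
 * does not vanish; it is left pointing at the character-device whiteout
 * inode allocated in xfs_rename_alloc_whiteout() (hence the
 * XFS_DIR3_FT_CHRDEV dirent type set earlier), which overlayfs uses to
 * mask matching entries on lower layers.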
3339 */ 3340 if (wip) 3341 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, 3342 spaceres); 3343 else 3344 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 3345 spaceres); 3346 3347 if (error) 3348 goto out_trans_cancel; 3349 3350 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3351 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3352 if (new_parent) 3353 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 3354 3355 /* 3356 * Inform our hook clients that we've finished a rename operation as 3357 * follows: removed the source and target files from their directories; 3358 * that we've added the source to the target directory; and finally 3359 * that we've added the whiteout, if there was one. All inodes are 3360 * locked, so it's ok to model a rename this way so long as we say we 3361 * deleted entries before we add new ones. 3362 */ 3363 if (target_ip) 3364 xfs_dir_update_hook(target_dp, target_ip, -1, target_name); 3365 xfs_dir_update_hook(src_dp, src_ip, -1, src_name); 3366 xfs_dir_update_hook(target_dp, src_ip, 1, target_name); 3367 if (wip) 3368 xfs_dir_update_hook(src_dp, wip, 1, src_name); 3369 3370 error = xfs_finish_rename(tp); 3371 if (wip) 3372 xfs_irele(wip); 3373 return error; 3374 3375 out_trans_cancel: 3376 xfs_trans_cancel(tp); 3377 out_release_wip: 3378 if (wip) 3379 xfs_irele(wip); 3380 if (error == -ENOSPC && nospace_error) 3381 error = nospace_error; 3382 return error; 3383 } 3384 3385 static int 3386 xfs_iflush( 3387 struct xfs_inode *ip, 3388 struct xfs_buf *bp) 3389 { 3390 struct xfs_inode_log_item *iip = ip->i_itemp; 3391 struct xfs_dinode *dip; 3392 struct xfs_mount *mp = ip->i_mount; 3393 int error; 3394 3395 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); 3396 ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); 3397 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || 3398 ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 3399 ASSERT(iip->ili_item.li_buf == bp); 3400 3401 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); 3402 3403 /* 3404 * We don't flush the inode if any of the following checks fail, but we 3405 * do still update the log item and attach to the backing buffer as if 3406 * the flush happened. This is a formality to facilitate predictable 3407 * error handling as the caller will shutdown and fail the buffer. 
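 *
 * That is why error is primed to -EFSCORRUPTED before the checks below: a
 * failed check jumps to flush_out, which still moves ili_fields into
 * ili_last_fields and records the flush LSN, and the non-zero return then
 * tells the caller to shut the filesystem down and fail the buffer.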
3408 */ 3409 error = -EFSCORRUPTED; 3410 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), 3411 mp, XFS_ERRTAG_IFLUSH_1)) { 3412 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3413 "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, 3414 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 3415 goto flush_out; 3416 } 3417 if (S_ISREG(VFS_I(ip)->i_mode)) { 3418 if (XFS_TEST_ERROR( 3419 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3420 ip->i_df.if_format != XFS_DINODE_FMT_BTREE, 3421 mp, XFS_ERRTAG_IFLUSH_3)) { 3422 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3423 "%s: Bad regular inode %llu, ptr "PTR_FMT, 3424 __func__, ip->i_ino, ip); 3425 goto flush_out; 3426 } 3427 } else if (S_ISDIR(VFS_I(ip)->i_mode)) { 3428 if (XFS_TEST_ERROR( 3429 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3430 ip->i_df.if_format != XFS_DINODE_FMT_BTREE && 3431 ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, 3432 mp, XFS_ERRTAG_IFLUSH_4)) { 3433 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3434 "%s: Bad directory inode %llu, ptr "PTR_FMT, 3435 __func__, ip->i_ino, ip); 3436 goto flush_out; 3437 } 3438 } 3439 if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > 3440 ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { 3441 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3442 "%s: detected corrupt incore inode %llu, " 3443 "total extents = %llu nblocks = %lld, ptr "PTR_FMT, 3444 __func__, ip->i_ino, 3445 ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af), 3446 ip->i_nblocks, ip); 3447 goto flush_out; 3448 } 3449 if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, 3450 mp, XFS_ERRTAG_IFLUSH_6)) { 3451 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3452 "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, 3453 __func__, ip->i_ino, ip->i_forkoff, ip); 3454 goto flush_out; 3455 } 3456 3457 /* 3458 * Inode item log recovery for v2 inodes are dependent on the flushiter 3459 * count for correct sequencing. We bump the flush iteration count so 3460 * we can detect flushes which postdate a log record during recovery. 3461 * This is redundant as we now log every change and hence this can't 3462 * happen but we need to still do it to ensure backwards compatibility 3463 * with old kernels that predate logging all inode changes. 3464 */ 3465 if (!xfs_has_v3inodes(mp)) 3466 ip->i_flushiter++; 3467 3468 /* 3469 * If there are inline format data / attr forks attached to this inode, 3470 * make sure they are not corrupt. 3471 */ 3472 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && 3473 xfs_ifork_verify_local_data(ip)) 3474 goto flush_out; 3475 if (xfs_inode_has_attr_fork(ip) && 3476 ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && 3477 xfs_ifork_verify_local_attr(ip)) 3478 goto flush_out; 3479 3480 /* 3481 * Copy the dirty parts of the inode into the on-disk inode. We always 3482 * copy out the core of the inode, because if the inode is dirty at all 3483 * the core must be. 3484 */ 3485 xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); 3486 3487 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3488 if (!xfs_has_v3inodes(mp)) { 3489 if (ip->i_flushiter == DI_MAX_FLUSH) 3490 ip->i_flushiter = 0; 3491 } 3492 3493 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); 3494 if (xfs_inode_has_attr_fork(ip)) 3495 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); 3496 3497 /* 3498 * We've recorded everything logged in the inode, so we'd like to clear 3499 * the ili_fields bits so we don't log and flush things unnecessarily. 3500 * However, we can't stop logging all this information until the data 3501 * we've copied into the disk buffer is written to disk. 
If we did we 3502 * might overwrite the copy of the inode in the log with all the data 3503 * after re-logging only part of it, and in the face of a crash we 3504 * wouldn't have all the data we need to recover. 3505 * 3506 * What we do is move the bits to the ili_last_fields field. When 3507 * logging the inode, these bits are moved back to the ili_fields field. 3508 * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since 3509 * we know that the information those bits represent is permanently on 3510 * disk. As long as the flush completes before the inode is logged 3511 * again, then both ili_fields and ili_last_fields will be cleared. 3512 */ 3513 error = 0; 3514 flush_out: 3515 spin_lock(&iip->ili_lock); 3516 iip->ili_last_fields = iip->ili_fields; 3517 iip->ili_fields = 0; 3518 iip->ili_fsync_fields = 0; 3519 spin_unlock(&iip->ili_lock); 3520 3521 /* 3522 * Store the current LSN of the inode so that we can tell whether the 3523 * item has moved in the AIL from xfs_buf_inode_iodone(). 3524 */ 3525 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3526 &iip->ili_item.li_lsn); 3527 3528 /* generate the checksum. */ 3529 xfs_dinode_calc_crc(mp, dip); 3530 if (error) 3531 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 3532 return error; 3533 } 3534 3535 /* 3536 * Non-blocking flush of dirty inode metadata into the backing buffer. 3537 * 3538 * The caller must have a reference to the inode and hold the cluster buffer 3539 * locked. The function will walk across all the inodes on the cluster buffer it 3540 * can find and lock without blocking, and flush them to the cluster buffer. 3541 * 3542 * On successful flushing of at least one inode, the caller must write out the 3543 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and 3544 * the caller needs to release the buffer. On failure, the filesystem will be 3545 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED 3546 * will be returned. 3547 */ 3548 int 3549 xfs_iflush_cluster( 3550 struct xfs_buf *bp) 3551 { 3552 struct xfs_mount *mp = bp->b_mount; 3553 struct xfs_log_item *lip, *n; 3554 struct xfs_inode *ip; 3555 struct xfs_inode_log_item *iip; 3556 int clcount = 0; 3557 int error = 0; 3558 3559 /* 3560 * We must use the safe variant here as on shutdown xfs_iflush_abort() 3561 * will remove itself from the list. 3562 */ 3563 list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { 3564 iip = (struct xfs_inode_log_item *)lip; 3565 ip = iip->ili_inode; 3566 3567 /* 3568 * Quick and dirty check to avoid locks if possible. 3569 */ 3570 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) 3571 continue; 3572 if (xfs_ipincount(ip)) 3573 continue; 3574 3575 /* 3576 * The inode is still attached to the buffer, which means it is 3577 * dirty but reclaim might try to grab it. Check carefully for 3578 * that, and grab the ilock while still holding the i_flags_lock 3579 * to guarantee reclaim will not be able to reclaim this inode 3580 * once we drop the i_flags_lock. 3581 */ 3582 spin_lock(&ip->i_flags_lock); 3583 ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); 3584 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { 3585 spin_unlock(&ip->i_flags_lock); 3586 continue; 3587 } 3588 3589 /* 3590 * ILOCK will pin the inode against reclaim and prevent 3591 * concurrent transactions modifying the inode while we are 3592 * flushing the inode. If we get the lock, set the flushing 3593 * state before we drop the i_flags_lock. 
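 *
 * Roughly, the sequence below is: trylock ILOCK_SHARED while still holding
 * i_flags_lock, set XFS_IFLUSHING, then drop i_flags_lock. Reclaim checks
 * these same flags under i_flags_lock, so once we get that far it can
 * neither free the inode under us nor mistake it for a clean one.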
3594 */ 3595 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 3596 spin_unlock(&ip->i_flags_lock); 3597 continue; 3598 } 3599 __xfs_iflags_set(ip, XFS_IFLUSHING); 3600 spin_unlock(&ip->i_flags_lock); 3601 3602 /* 3603 * Abort flushing this inode if we are shut down because the 3604 * inode may not currently be in the AIL. This can occur when 3605 * log I/O failure unpins the inode without inserting into the 3606 * AIL, leaving a dirty/unpinned inode attached to the buffer 3607 * that otherwise looks like it should be flushed. 3608 */ 3609 if (xlog_is_shutdown(mp->m_log)) { 3610 xfs_iunpin_wait(ip); 3611 xfs_iflush_abort(ip); 3612 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3613 error = -EIO; 3614 continue; 3615 } 3616 3617 /* don't block waiting on a log force to unpin dirty inodes */ 3618 if (xfs_ipincount(ip)) { 3619 xfs_iflags_clear(ip, XFS_IFLUSHING); 3620 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3621 continue; 3622 } 3623 3624 if (!xfs_inode_clean(ip)) 3625 error = xfs_iflush(ip, bp); 3626 else 3627 xfs_iflags_clear(ip, XFS_IFLUSHING); 3628 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3629 if (error) 3630 break; 3631 clcount++; 3632 } 3633 3634 if (error) { 3635 /* 3636 * Shutdown first so we kill the log before we release this 3637 * buffer. If it is an INODE_ALLOC buffer and pins the tail 3638 * of the log, failing it before the _log_ is shut down can 3639 * result in the log tail being moved forward in the journal 3640 * on disk because log writes can still be taking place. Hence 3641 * unpinning the tail will allow the ICREATE intent to be 3642 * removed from the log an recovery will fail with uninitialised 3643 * inode cluster buffers. 3644 */ 3645 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3646 bp->b_flags |= XBF_ASYNC; 3647 xfs_buf_ioend_fail(bp); 3648 return error; 3649 } 3650 3651 if (!clcount) 3652 return -EAGAIN; 3653 3654 XFS_STATS_INC(mp, xs_icluster_flushcnt); 3655 XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); 3656 return 0; 3657 3658 } 3659 3660 /* Release an inode. */ 3661 void 3662 xfs_irele( 3663 struct xfs_inode *ip) 3664 { 3665 trace_xfs_irele(ip, _RET_IP_); 3666 iput(VFS_I(ip)); 3667 } 3668 3669 /* 3670 * Ensure all commited transactions touching the inode are written to the log. 3671 */ 3672 int 3673 xfs_log_force_inode( 3674 struct xfs_inode *ip) 3675 { 3676 xfs_csn_t seq = 0; 3677 3678 xfs_ilock(ip, XFS_ILOCK_SHARED); 3679 if (xfs_ipincount(ip)) 3680 seq = ip->i_itemp->ili_commit_seq; 3681 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3682 3683 if (!seq) 3684 return 0; 3685 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); 3686 } 3687 3688 /* 3689 * Grab the exclusive iolock for a data copy from src to dest, making sure to 3690 * abide vfs locking order (lowest pointer value goes first) and breaking the 3691 * layout leases before proceeding. The loop is needed because we cannot call 3692 * the blocking break_layout() with the iolocks held, and therefore have to 3693 * back out both locks. 3694 */ 3695 static int 3696 xfs_iolock_two_inodes_and_break_layout( 3697 struct inode *src, 3698 struct inode *dest) 3699 { 3700 int error; 3701 3702 if (src > dest) 3703 swap(src, dest); 3704 3705 retry: 3706 /* Wait to break both inodes' layouts before we start locking. */ 3707 error = break_layout(src, true); 3708 if (error) 3709 return error; 3710 if (src != dest) { 3711 error = break_layout(dest, true); 3712 if (error) 3713 return error; 3714 } 3715 3716 /* Lock one inode and make sure nobody got in and leased it. 
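 *
 * The non-blocking break_layout(inode, false) below returns -EWOULDBLOCK
 * if a new layout lease was granted between the blocking break above and
 * taking the lock; in that case we drop the lock(s) and retry from the
 * top rather than blocking on the lease with i_rwsem held.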
/* Release an inode. */
void
xfs_irele(
	struct xfs_inode	*ip)
{
	trace_xfs_irele(ip, _RET_IP_);
	iput(VFS_I(ip));
}

/*
 * Ensure all committed transactions touching the inode are written to the log.
 */
int
xfs_log_force_inode(
	struct xfs_inode	*ip)
{
	xfs_csn_t		seq = 0;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip))
		seq = ip->i_itemp->ili_commit_seq;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (!seq)
		return 0;
	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
}

/*
 * Grab the exclusive iolock for a data copy from src to dest, making sure to
 * abide by the vfs locking order (lowest pointer value goes first) and
 * breaking the layout leases before proceeding. The loop is needed because we
 * cannot call the blocking break_layout() with the iolocks held, and therefore
 * have to back out both locks.
 */
static int
xfs_iolock_two_inodes_and_break_layout(
	struct inode		*src,
	struct inode		*dest)
{
	int			error;

	if (src > dest)
		swap(src, dest);

retry:
	/* Wait to break both inodes' layouts before we start locking. */
	error = break_layout(src, true);
	if (error)
		return error;
	if (src != dest) {
		error = break_layout(dest, true);
		if (error)
			return error;
	}

	/* Lock one inode and make sure nobody got in and leased it. */
	inode_lock(src);
	error = break_layout(src, false);
	if (error) {
		inode_unlock(src);
		if (error == -EWOULDBLOCK)
			goto retry;
		return error;
	}

	if (src == dest)
		return 0;

	/* Lock the other inode and make sure nobody got in and leased it. */
	inode_lock_nested(dest, I_MUTEX_NONDIR2);
	error = break_layout(dest, false);
	if (error) {
		inode_unlock(src);
		inode_unlock(dest);
		if (error == -EWOULDBLOCK)
			goto retry;
		return error;
	}

	return 0;
}

static int
xfs_mmaplock_two_inodes_and_break_dax_layout(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			error;
	bool			retry;
	struct page		*page;

	if (ip1->i_ino > ip2->i_ino)
		swap(ip1, ip2);

again:
	retry = false;
	/* Lock the first inode */
	xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
	error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
	if (error || retry) {
		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
		if (error == 0 && retry)
			goto again;
		return error;
	}

	if (ip1 == ip2)
		return 0;

	/* Nested lock the second inode */
	xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
	/*
	 * We cannot use xfs_break_dax_layouts() directly here because it may
	 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
	 * for this nested lock case.
	 */
	page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
	if (page && page_ref_count(page) != 1) {
		xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
		goto again;
	}

	return 0;
}

/*
 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
 * mmap activity.
 */
int
xfs_ilock2_io_mmap(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			ret;

	ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
	if (ret)
		return ret;

	if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
		ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
		if (ret) {
			inode_unlock(VFS_I(ip2));
			if (ip1 != ip2)
				inode_unlock(VFS_I(ip1));
			return ret;
		}
	} else
		filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
					    VFS_I(ip2)->i_mapping);

	return 0;
}

/* Unlock both inodes to allow IO and mmap activity. */
void
xfs_iunlock2_io_mmap(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
		xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
		if (ip1 != ip2)
			xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
	} else
		filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
					      VFS_I(ip2)->i_mapping);

	inode_unlock(VFS_I(ip2));
	if (ip1 != ip2)
		inode_unlock(VFS_I(ip1));
}

/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
void
xfs_iunlock2_remapping(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	xfs_iflags_clear(ip1, XFS_IREMAPPING);

	if (ip1 != ip2)
		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);

	if (ip1 != ip2)
		inode_unlock_shared(VFS_I(ip1));
	inode_unlock(VFS_I(ip2));
}

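/*
 * Illustrative sketch, not part of the original source: the intended pairing
 * of xfs_ilock2_io_mmap()/xfs_iunlock2_io_mmap() when serialising a two-inode
 * operation against both syscall I/O and page faults. The function name and
 * the work in the middle are placeholders.
 */
static inline int
xfs_two_inode_op_sketch(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			error;

	error = xfs_ilock2_io_mmap(ip1, ip2);
	if (error)
		return error;

	/* ... operate on both inodes' mappings/extents here ... */

	xfs_iunlock2_io_mmap(ip1, ip2);
	return 0;
}
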
/*
 * Reload the incore unlinked list for this inode. Caller should ensure that
 * the link count cannot change, either by taking ILOCK_SHARED or otherwise
 * preventing other threads from executing.
 */
int
xfs_inode_reload_unlinked_bucket(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_buf		*agibp;
	struct xfs_agi		*agi;
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	xfs_agino_t		prev_agino, next_agino;
	unsigned int		bucket;
	bool			foundit = false;
	int			error;

	/* Grab the first inode in the list */
	pag = xfs_perag_get(mp, agno);
	error = xfs_ialloc_read_agi(pag, tp, 0, &agibp);
	xfs_perag_put(pag);
	if (error)
		return error;

	/*
	 * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
	 * incore unlinked list pointers for this inode. Check once more to
	 * see if we raced with anyone else to reload the unlinked list.
	 */
	if (!xfs_inode_unlinked_incomplete(ip)) {
		foundit = true;
		goto out_agibp;
	}

	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
	agi = agibp->b_addr;

	trace_xfs_inode_reload_unlinked_bucket(ip);

	xfs_info_ratelimited(mp,
 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
			agino, agno);

	prev_agino = NULLAGINO;
	next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
	while (next_agino != NULLAGINO) {
		struct xfs_inode	*next_ip = NULL;

		/* Found this caller's inode, set its backlink. */
		if (next_agino == agino) {
			next_ip = ip;
			next_ip->i_prev_unlinked = prev_agino;
			foundit = true;
			goto next_inode;
		}

		/* Try in-memory lookup first. */
		next_ip = xfs_iunlink_lookup(pag, next_agino);
		if (next_ip)
			goto next_inode;

		/* Inode not in memory, try reloading it. */
		error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
				next_agino);
		if (error)
			break;

		/* Grab the reloaded inode. */
		next_ip = xfs_iunlink_lookup(pag, next_agino);
		if (!next_ip) {
			/* No incore inode at all? We reloaded it... */
			ASSERT(next_ip != NULL);
			error = -EFSCORRUPTED;
			break;
		}

next_inode:
		prev_agino = next_agino;
		next_agino = next_ip->i_next_unlinked;
	}

out_agibp:
	xfs_trans_brelse(tp, agibp);
	/* Should have found this inode somewhere in the iunlinked bucket. */
	if (!error && !foundit)
		error = -EFSCORRUPTED;
	return error;
}

/* Decide if this inode is missing its unlinked list and reload it. */
int
xfs_inode_reload_unlinked(
	struct xfs_inode	*ip)
{
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc_empty(ip->i_mount, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_inode_unlinked_incomplete(ip))
		error = xfs_inode_reload_unlinked_bucket(tp, ip);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	xfs_trans_cancel(tp);

	return error;
}

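/*
 * Illustrative sketch, not part of the original source: how a caller about to
 * walk an unlinked list would use the helpers above, reloading the incore
 * list only when the back-pointer has not been populated yet. The function
 * name is made up.
 */
static inline int
xfs_ensure_unlinked_loaded_sketch(
	struct xfs_inode	*ip)
{
	if (!xfs_inode_unlinked_incomplete(ip))
		return 0;
	return xfs_inode_reload_unlinked(ip);
}
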
/* Has this inode fork been zapped by repair? */
bool
xfs_ifork_zapped(
	const struct xfs_inode	*ip,
	int			whichfork)
{
	unsigned int		datamask = 0;

	switch (whichfork) {
	case XFS_DATA_FORK:
		switch (ip->i_vnode.i_mode & S_IFMT) {
		case S_IFDIR:
			datamask = XFS_SICK_INO_DIR_ZAPPED;
			break;
		case S_IFLNK:
			datamask = XFS_SICK_INO_SYMLINK_ZAPPED;
			break;
		}
		return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask);
	case XFS_ATTR_FORK:
		return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED;
	default:
		return false;
	}
}

/* Compute the number of data and realtime blocks used by a file. */
void
xfs_inode_count_blocks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	xfs_filblks_t		*dblocks,
	xfs_filblks_t		*rblocks)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);

	*rblocks = 0;
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_bmap_count_leaves(ifp, rblocks);
	*dblocks = ip->i_nblocks - *rblocks;
}

static void
xfs_wait_dax_page(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	schedule();
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

int
xfs_break_dax_layouts(
	struct inode		*inode,
	bool			*retry)
{
	struct page		*page;

	xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);

	page = dax_layout_busy_page(inode->i_mapping);
	if (!page)
		return 0;

	*retry = true;
	return ___wait_var_event(&page->_refcount,
			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
			0, 0, xfs_wait_dax_page(inode));
}

int
xfs_break_layouts(
	struct inode		*inode,
	uint			*iolock,
	enum layout_break_reason reason)
{
	bool			retry;
	int			error;

	xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);

	do {
		retry = false;
		switch (reason) {
		case BREAK_UNMAP:
			error = xfs_break_dax_layouts(inode, &retry);
			if (error || retry)
				break;
			fallthrough;
		case BREAK_WRITE:
			error = xfs_break_leased_layouts(inode, iolock, &retry);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EINVAL;
		}
	} while (error == 0 && retry);

	return error;
}

/* Returns the size of the fundamental allocation unit for a file, in bytes. */
unsigned int
xfs_inode_alloc_unitsize(
	struct xfs_inode	*ip)
{
	unsigned int		blocks = 1;

	if (XFS_IS_REALTIME_INODE(ip))
		blocks = ip->i_mount->m_sb.sb_rextsize;

	return XFS_FSB_TO_B(ip->i_mount, blocks);
}

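/*
 * Illustrative sketch, not part of the original source: how an extent
 * manipulation path might use xfs_break_layouts() before punching out a
 * range, in the spirit of the fallocate-style callers. The function name and
 * the placeholder in the middle are hypothetical.
 */
static inline int
xfs_break_layouts_caller_sketch(
	struct xfs_inode	*ip)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	int			error;

	xfs_ilock(ip, iolock);

	/* BREAK_UNMAP waits for both busy DAX pages and pNFS layout leases. */
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error) {
		/* ... punch the hole / shift extents here ... */
	}

	xfs_iunlock(ip, iolock);
	return error;
}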