// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/prefetch.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <trace/events/writeback.h>
#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget()
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

static unsigned int i_hash_mask __read_mostly;
static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

/*
 * Empty aops. Can be used for the cases where the user does not
 * define any of the address_space operations.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

/*
 * Statistics gathering..
 */
struct inodes_stat_t inodes_stat;

static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);

static struct kmem_cache *inode_cachep __read_mostly;

static long get_nr_inodes(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_inodes, i);
        return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_unused, i);
        return sum < 0 ? 0 : sum;
}

long get_nr_dirty_inodes(void)
{
        /* not actually dirty inodes, but a wild approximation */
        long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
        return nr_dirty > 0 ? nr_dirty : 0;
}

/*
 * Handle nr_inode sysctl
 */
#ifdef CONFIG_SYSCTL
int proc_nr_inodes(struct ctl_table *table, int write,
                   void __user *buffer, size_t *lenp, loff_t *ppos)
{
        inodes_stat.nr_inodes = get_nr_inodes();
        inodes_stat.nr_unused = get_nr_inodes_unused();
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif

static int no_open(struct inode *inode, struct file *file)
{
        return -ENXIO;
}

/**
 * inode_init_always - perform inode structure initialisation
 * @sb: superblock inode belongs to
 * @inode: inode to initialise
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 */
int inode_init_always(struct super_block *sb, struct inode *inode)
{
        static const struct inode_operations empty_iops;
        static const struct file_operations no_open_fops = {.open = no_open};
        struct address_space *const mapping = &inode->i_data;

        inode->i_sb = sb;
        inode->i_blkbits = sb->s_blocksize_bits;
        inode->i_flags = 0;
        atomic_set(&inode->i_count, 1);
        inode->i_op = &empty_iops;
        inode->i_fop = &no_open_fops;
        inode->__i_nlink = 1;
        inode->i_opflags = 0;
        if (sb->s_xattr)
                inode->i_opflags |= IOP_XATTR;
        i_uid_write(inode, 0);
        i_gid_write(inode, 0);
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_write_hint = WRITE_LIFE_NOT_SET;
        inode->i_blocks = 0;
        inode->i_bytes = 0;
        inode->i_generation = 0;
        inode->i_pipe = NULL;
        inode->i_bdev = NULL;
        inode->i_cdev = NULL;
        inode->i_link = NULL;
        inode->i_dir_seq = 0;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;

#ifdef CONFIG_CGROUP_WRITEBACK
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
#endif

        if (security_inode_alloc(inode))
                goto out;
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);

        atomic_set(&inode->i_dio_count, 0);

        mapping->a_ops = &empty_aops;
        mapping->host = inode;
        mapping->flags = 0;
        mapping->wb_err = 0;
        atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        atomic_set(&mapping->nr_thps, 0);
#endif
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->private_data = NULL;
        mapping->writeback_index = 0;
        inode->i_private = NULL;
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);      /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
        inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
#endif
        inode->i_flctx = NULL;
        this_cpu_inc(nr_inodes);

        return 0;
out:
        return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);

void free_inode_nonrcu(struct inode *inode)
{
        kmem_cache_free(inode_cachep, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);

static void i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);
        if (inode->free_inode)
                inode->free_inode(inode);
        else
                free_inode_nonrcu(inode);
}

static struct inode *alloc_inode(struct super_block *sb)
{
        const struct super_operations *ops = sb->s_op;
        struct inode *inode;

        if (ops->alloc_inode)
                inode = ops->alloc_inode(sb);
        else
                inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

        if (!inode)
                return NULL;

        if (unlikely(inode_init_always(sb, inode))) {
                if (ops->destroy_inode) {
                        ops->destroy_inode(inode);
                        if (!ops->free_inode)
                                return NULL;
                }
                inode->free_inode = ops->free_inode;
                i_callback(&inode->i_rcu);
                return NULL;
        }

        return inode;
}
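/*
 * Illustrative sketch (not part of this file): how a filesystem might wire up
 * ->alloc_inode/->free_inode so that the alloc_inode()/i_callback() paths
 * above work as intended. All "foofs_*" names are hypothetical; a real
 * filesystem embeds struct inode in its own inode structure and frees that
 * container from ->free_inode, which may run from RCU context.
 *
 *      struct foofs_inode {
 *              unsigned long   on_disk_flags;
 *              struct inode    vfs_inode;
 *      };
 *
 *      static struct kmem_cache *foofs_inode_cachep;
 *
 *      static struct inode *foofs_alloc_inode(struct super_block *sb)
 *      {
 *              struct foofs_inode *fi;
 *
 *              fi = kmem_cache_alloc(foofs_inode_cachep, GFP_KERNEL);
 *              if (!fi)
 *                      return NULL;
 *              fi->on_disk_flags = 0;
 *              return &fi->vfs_inode;
 *      }
 *
 *      static void foofs_free_inode(struct inode *inode)
 *      {
 *              kmem_cache_free(foofs_inode_cachep,
 *                      container_of(inode, struct foofs_inode, vfs_inode));
 *      }
 *
 *      static const struct super_operations foofs_sops = {
 *              .alloc_inode    = foofs_alloc_inode,
 *              .free_inode     = foofs_free_inode,
 *      };
 */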
void __destroy_inode(struct inode *inode)
{
        BUG_ON(inode_has_buffers(inode));
        inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode);
        if (!inode->i_nlink) {
                WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

#ifdef CONFIG_FS_POSIX_ACL
        if (inode->i_acl && !is_uncached_acl(inode->i_acl))
                posix_acl_release(inode->i_acl);
        if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
                posix_acl_release(inode->i_default_acl);
#endif
        this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);

static void destroy_inode(struct inode *inode)
{
        const struct super_operations *ops = inode->i_sb->s_op;

        BUG_ON(!list_empty(&inode->i_lru));
        __destroy_inode(inode);
        if (ops->destroy_inode) {
                ops->destroy_inode(inode);
                if (!ops->free_inode)
                        return;
        }
        inode->free_inode = ops->free_inode;
        call_rcu(&inode->i_rcu, i_callback);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink. In cases
 * where we are attempting to track writes to the
 * filesystem, a decrement to zero means an imminent
 * write when the file is truncated and actually unlinked
 * on the filesystem.
 */
void drop_nlink(struct inode *inode)
{
        WARN_ON(inode->i_nlink == 0);
        inode->__i_nlink--;
        if (!inode->i_nlink)
                atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink. See
 * drop_nlink() for why we care about i_nlink hitting zero.
 */
void clear_nlink(struct inode *inode)
{
        if (inode->i_nlink) {
                inode->__i_nlink = 0;
                atomic_long_inc(&inode->i_sb->s_remove_count);
        }
}
EXPORT_SYMBOL(clear_nlink);

/**
 * set_nlink - directly set an inode's link count
 * @inode: inode
 * @nlink: new nlink (should be non-zero)
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.
 */
void set_nlink(struct inode *inode, unsigned int nlink)
{
        if (!nlink) {
                clear_nlink(inode);
        } else {
                /* Yes, some filesystems do change nlink from zero to one */
                if (inode->i_nlink == 0)
                        atomic_long_dec(&inode->i_sb->s_remove_count);

                inode->__i_nlink = nlink;
        }
}
EXPORT_SYMBOL(set_nlink);
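/*
 * Illustrative sketch (not part of this file): typical callers of the nlink
 * helpers above. The "foofs" on-disk structure and field names are
 * hypothetical; the point is that filesystems go through set_nlink(),
 * inc_nlink() and drop_nlink() rather than writing inode->__i_nlink
 * directly, so that s_remove_count stays consistent.
 *
 *      // when reading an inode from disk:
 *      set_nlink(inode, le32_to_cpu(raw_inode->i_links_count));
 *
 *      // in a mkdir(2) implementation, for the new ".." entry:
 *      inc_nlink(dir);
 *
 *      // in an unlink(2) implementation:
 *      drop_nlink(inode);
 *      mark_inode_dirty(inode);
 */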
/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink. Currently,
 * it is only here for parity with dec_nlink().
 */
void inc_nlink(struct inode *inode)
{
        if (unlikely(inode->i_nlink == 0)) {
                WARN_ON(!(inode->i_state & I_LINKABLE));
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

        inode->__i_nlink++;
}
EXPORT_SYMBOL(inc_nlink);

static void __address_space_init_once(struct address_space *mapping)
{
        xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
        init_rwsem(&mapping->i_mmap_rwsem);
        INIT_LIST_HEAD(&mapping->private_list);
        spin_lock_init(&mapping->private_lock);
        mapping->i_mmap = RB_ROOT_CACHED;
}

void address_space_init_once(struct address_space *mapping)
{
        memset(mapping, 0, sizeof(*mapping));
        __address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so let the slab know about that.
 */
void inode_init_once(struct inode *inode)
{
        memset(inode, 0, sizeof(*inode));
        INIT_HLIST_NODE(&inode->i_hash);
        INIT_LIST_HEAD(&inode->i_devices);
        INIT_LIST_HEAD(&inode->i_io_list);
        INIT_LIST_HEAD(&inode->i_wb_list);
        INIT_LIST_HEAD(&inode->i_lru);
        __address_space_init_once(&inode->i_data);
        i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);

static void init_once(void *foo)
{
        struct inode *inode = (struct inode *) foo;

        inode_init_once(inode);
}
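/*
 * Illustrative sketch (not part of this file): filesystems that embed
 * struct inode in their own inode type register a slab constructor of the
 * same shape as init_once() above, and that constructor must call
 * inode_init_once() on the embedded VFS inode. The "foofs_*" names are
 * hypothetical.
 *
 *      static void foofs_init_once(void *p)
 *      {
 *              struct foofs_inode *fi = p;
 *
 *              inode_init_once(&fi->vfs_inode);
 *      }
 *
 *      static int __init foofs_init_inodecache(void)
 *      {
 *              foofs_inode_cachep = kmem_cache_create("foofs_inode_cache",
 *                              sizeof(struct foofs_inode), 0,
 *                              SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
 *                              SLAB_ACCOUNT,
 *                              foofs_init_once);
 *              return foofs_inode_cachep ? 0 : -ENOMEM;
 *      }
 */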
/*
 * inode->i_lock must be held
 */
void __iget(struct inode *inode)
{
        atomic_inc(&inode->i_count);
}

/*
 * get additional reference to inode; caller must already hold one.
 */
void ihold(struct inode *inode)
{
        WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}
EXPORT_SYMBOL(ihold);

static void inode_lru_list_add(struct inode *inode)
{
        if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
                this_cpu_inc(nr_unused);
        else
                inode->i_state |= I_REFERENCED;
}

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * Needs inode->i_lock held.
 */
void inode_add_lru(struct inode *inode)
{
        if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
                                I_FREEING | I_WILL_FREE)) &&
            !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
                inode_lru_list_add(inode);
}

static void inode_lru_list_del(struct inode *inode)
{
        if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
                this_cpu_dec(nr_unused);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 */
void inode_sb_list_add(struct inode *inode)
{
        spin_lock(&inode->i_sb->s_inode_list_lock);
        list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
        spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

static inline void inode_sb_list_del(struct inode *inode)
{
        if (!list_empty(&inode->i_sb_list)) {
                spin_lock(&inode->i_sb->s_inode_list_lock);
                list_del_init(&inode->i_sb_list);
                spin_unlock(&inode->i_sb->s_inode_list_lock);
        }
}

static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
        unsigned long tmp;

        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
                        L1_CACHE_BYTES;
        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
        return tmp & i_hash_mask;
}

/**
 * __insert_inode_hash - hash an inode
 * @inode: unhashed inode
 * @hashval: unsigned long value used to locate this object in the
 *           inode_hashtable.
 *
 * Add an inode to the inode hash for this superblock.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
        struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);

        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_add_head(&inode->i_hash, b);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);

/**
 * __remove_inode_hash - remove an inode from the hash
 * @inode: inode to unhash
 *
 * Remove an inode from the superblock.
 */
void __remove_inode_hash(struct inode *inode)
{
        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_del_init(&inode->i_hash);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

void clear_inode(struct inode *inode)
{
        /*
         * We have to cycle the i_pages lock here because reclaim can be in the
         * process of removing the last page (in __delete_from_page_cache())
         * and we must not free the mapping under it.
         */
        xa_lock_irq(&inode->i_data.i_pages);
        BUG_ON(inode->i_data.nrpages);
        BUG_ON(inode->i_data.nrexceptional);
        xa_unlock_irq(&inode->i_data.i_pages);
        BUG_ON(!list_empty(&inode->i_data.private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
        BUG_ON(!list_empty(&inode->i_wb_list));
        /* don't need i_lock here, no concurrent mods to i_state */
        inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);
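/*
 * Illustrative sketch (not part of this file): a filesystem that provides
 * ->evict_inode is expected to end up doing what the default path in evict()
 * below does, i.e. drop the page cache and then call clear_inode(). The
 * "foofs_*" names are hypothetical, and the on-disk freeing step is only a
 * placeholder for whatever the filesystem needs to do for unlinked inodes.
 *
 *      static void foofs_evict_inode(struct inode *inode)
 *      {
 *              truncate_inode_pages_final(&inode->i_data);
 *              clear_inode(inode);
 *              if (!inode->i_nlink && !is_bad_inode(inode))
 *                      foofs_free_on_disk_inode(inode);  // hypothetical helper
 *      }
 */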
/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to. We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode). The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache. This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 */
static void evict(struct inode *inode)
{
        const struct super_operations *op = inode->i_sb->s_op;

        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(!list_empty(&inode->i_lru));

        if (!list_empty(&inode->i_io_list))
                inode_io_list_del(inode);

        inode_sb_list_del(inode);

        /*
         * Wait for flusher thread to be done with the inode so that filesystem
         * does not start destroying it while writeback is still running. Since
         * the inode has I_FREEING set, flusher thread won't start new work on
         * the inode. We just have to wait for running writeback to finish.
         */
        inode_wait_for_writeback(inode);

        if (op->evict_inode) {
                op->evict_inode(inode);
        } else {
                truncate_inode_pages_final(&inode->i_data);
                clear_inode(inode);
        }
        if (S_ISBLK(inode->i_mode) && inode->i_bdev)
                bd_forget(inode);
        if (S_ISCHR(inode->i_mode) && inode->i_cdev)
                cd_forget(inode);

        remove_inode_hash(inode);

        spin_lock(&inode->i_lock);
        wake_up_bit(&inode->i_state, __I_NEW);
        BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
        spin_unlock(&inode->i_lock);

        destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * Dispose-list gets a local list with local inodes in it, so it doesn't
 * need to worry about list corruption and SMP locks.
 */
static void dispose_list(struct list_head *head)
{
        while (!list_empty(head)) {
                struct inode *inode;

                inode = list_first_entry(head, struct inode, i_lru);
                list_del_init(&inode->i_lru);

                evict(inode);
                cond_resched();
        }
}

/**
 * evict_inodes - evict all evictable inodes for a superblock
 * @sb: superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained. This is
 * called by superblock shutdown after having SB_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
 */
void evict_inodes(struct super_block *sb)
{
        struct inode *inode, *next;
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                if (atomic_read(&inode->i_count))
                        continue;

                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                list_add(&inode->i_lru, &dispose);

                /*
                 * We can have a ton of inodes to evict at unmount time given
                 * enough memory, check to see if we need to go to sleep for a
                 * bit so we don't livelock.
                 */
                if (need_resched()) {
                        spin_unlock(&sb->s_inode_list_lock);
                        cond_resched();
                        dispose_list(&dispose);
                        goto again;
                }
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);

/**
 * invalidate_inodes - attempt to free all inodes on a superblock
 * @sb: superblock to operate on
 * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Attempts to free all inodes for a given superblock. If there were any
 * busy inodes return a non-zero value, else zero.
 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
 * them as busy.
 */
int invalidate_inodes(struct super_block *sb, bool kill_dirty)
{
        int busy = 0;
        struct inode *inode, *next;
        LIST_HEAD(dispose);

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
                        spin_unlock(&inode->i_lock);
                        busy = 1;
                        continue;
                }
                if (atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        busy = 1;
                        continue;
                }

                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                list_add(&inode->i_lru, &dispose);
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);

        return busy;
}

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed. If the inode has metadata buffers attached to
 * mapping->private_list then try to remove them.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
        struct list_head *freeable = arg;
        struct inode    *inode = container_of(item, struct inode, i_lru);

        /*
         * we are inverting the lru lock/inode->i_lock here, so use a trylock.
         * If we fail to get the lock, just skip it.
         */
        if (!spin_trylock(&inode->i_lock))
                return LRU_SKIP;

        /*
         * Referenced or dirty inodes are still in use. Give them another pass
         * through the LRU as we cannot reclaim them now.
         */
        if (atomic_read(&inode->i_count) ||
            (inode->i_state & ~I_REFERENCED)) {
                list_lru_isolate(lru, &inode->i_lru);
                spin_unlock(&inode->i_lock);
                this_cpu_dec(nr_unused);
                return LRU_REMOVED;
        }

        /* recently referenced inodes get one more pass */
        if (inode->i_state & I_REFERENCED) {
                inode->i_state &= ~I_REFERENCED;
                spin_unlock(&inode->i_lock);
                return LRU_ROTATE;
        }

        if (inode_has_buffers(inode) || inode->i_data.nrpages) {
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(lru_lock);
                if (remove_inode_buffers(inode)) {
                        unsigned long reap;
                        reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
                        if (current_is_kswapd())
                                __count_vm_events(KSWAPD_INODESTEAL, reap);
                        else
                                __count_vm_events(PGINODESTEAL, reap);
                        if (current->reclaim_state)
                                current->reclaim_state->reclaimed_slab += reap;
                }
                iput(inode);
                spin_lock(lru_lock);
                return LRU_RETRY;
        }

        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        list_lru_isolate_move(lru, &inode->i_lru, freeable);
        spin_unlock(&inode->i_lock);

        this_cpu_dec(nr_unused);
        return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a number of inodes
 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 * then are freed outside inode_lock by dispose_list().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(freeable);
        long freed;

        freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
                                     inode_lru_isolate, &freeable);
        dispose_list(&freeable);
        return freed;
}

static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Called with the inode lock held.
 */
static struct inode *find_inode(struct super_block *sb,
                                struct hlist_head *head,
                                int (*test)(struct inode *, void *),
                                void *data)
{
        struct inode *inode = NULL;

repeat:
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_sb != sb)
                        continue;
                if (!test(inode, data))
                        continue;
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        return ERR_PTR(-ESTALE);
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                return inode;
        }
        return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 */
static struct inode *find_inode_fast(struct super_block *sb,
                                struct hlist_head *head, unsigned long ino)
{
        struct inode *inode = NULL;

repeat:
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_ino != ino)
                        continue;
                if (inode->i_sb != sb)
                        continue;
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        return ERR_PTR(-ESTALE);
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                return inode;
        }
        return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

unsigned int get_next_ino(void)
{
        unsigned int *p = &get_cpu_var(last_ino);
        unsigned int res = *p;

#ifdef CONFIG_SMP
        if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
                static atomic_t shared_last_ino;
                int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

                res = next - LAST_INO_BATCH;
        }
#endif

        res++;
        /* get_next_ino should not provide a 0 inode number */
        if (unlikely(!res))
                res++;
        *p = res;
        put_cpu_var(last_ino);
        return res;
}
EXPORT_SYMBOL(get_next_ino);

/**
 * new_inode_pseudo - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for given superblock.
 * The inode won't be chained in the superblock's s_inodes list.
 * This means:
 * - the fs can't be unmounted
 * - quotas, fsnotify, writeback can't work
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
        struct inode *inode = alloc_inode(sb);

        if (inode) {
                spin_lock(&inode->i_lock);
                inode->i_state = 0;
                spin_unlock(&inode->i_lock);
                INIT_LIST_HEAD(&inode->i_sb_list);
        }
        return inode;
}
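/*
 * Illustrative sketch (not part of this file): the usual pattern in simple
 * in-memory filesystems that build on new_inode() (below) and get_next_ino().
 * The "foofs_*" names are hypothetical placeholders.
 *
 *      struct inode *foofs_get_inode(struct super_block *sb,
 *                                    const struct inode *dir, umode_t mode)
 *      {
 *              struct inode *inode = new_inode(sb);
 *
 *              if (!inode)
 *                      return NULL;
 *              inode->i_ino = get_next_ino();
 *              inode_init_owner(inode, dir, mode);
 *              inode->i_atime = inode->i_mtime = inode->i_ctime =
 *                                              current_time(inode);
 *              return inode;
 *      }
 */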
/**
 * new_inode - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for given superblock. The default gfp_mask
 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
 * If HIGHMEM pages are unsuitable or it is known that pages allocated
 * for the page cache are not reclaimable or migratable,
 * mapping_set_gfp_mask() must be called with suitable flags on the
 * newly created inode's mapping.
 */
struct inode *new_inode(struct super_block *sb)
{
        struct inode *inode;

        spin_lock_prefetch(&sb->s_inode_list_lock);

        inode = new_inode_pseudo(sb);
        if (inode)
                inode_sb_list_add(inode);
        return inode;
}
EXPORT_SYMBOL(new_inode);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
        if (S_ISDIR(inode->i_mode)) {
                struct file_system_type *type = inode->i_sb->s_type;

                /* Set new key only if filesystem hasn't already changed it */
                if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
                        /*
                         * ensure nobody is actually holding i_mutex
                         */
                        // mutex_destroy(&inode->i_mutex);
                        init_rwsem(&inode->i_rwsem);
                        lockdep_set_class(&inode->i_rwsem,
                                          &type->i_mutex_dir_key);
                }
        }
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif

/**
 * unlock_new_inode - clear the I_NEW state and wake up any waiters
 * @inode:      new inode to unlock
 *
 * Called when the inode is fully initialised to clear the new state of the
 * inode and wake up anyone waiting for the inode to finish initialisation.
 */
void unlock_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW & ~I_CREATING;
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);

void discard_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW;
        smp_mb();
        wake_up_bit(&inode->i_state, __I_NEW);
        spin_unlock(&inode->i_lock);
        iput(inode);
}
EXPORT_SYMBOL(discard_new_inode);

/**
 * lock_two_nondirectories - take two i_mutexes on non-directory objects
 *
 * Lock any non-NULL argument that is not a directory.
 * Zero, one or two objects may be locked by this function.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        if (inode1 > inode2)
                swap(inode1, inode2);

        if (inode1 && !S_ISDIR(inode1->i_mode))
                inode_lock(inode1);
        if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
                inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        if (inode1 && !S_ISDIR(inode1->i_mode))
                inode_unlock(inode1);
        if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
                inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * inode_insert5 - obtain an inode from a mounted file system
 * @inode:      pre-allocated inode to use for insert to cache
 * @hashval:    hash value (usually inode number) to get
 * @test:       callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:       opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is a
 * variant of iget5_locked() for callers that don't want to fail on memory
 * allocation of inode.
 *
 * If the inode is not in cache, insert the pre-allocated inode to cache and
 * return it locked, hashed, and with the I_NEW flag set. The file system gets
 * to fill it in before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                            int (*test)(struct inode *, void *),
                            int (*set)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
        struct inode *old;
        bool creating = inode->i_state & I_CREATING;

again:
        spin_lock(&inode_hash_lock);
        old = find_inode(inode->i_sb, head, test, data);
        if (unlikely(old)) {
                /*
                 * Uhhuh, somebody else created the same inode under us.
                 * Use the old inode instead of the preallocated one.
                 */
                spin_unlock(&inode_hash_lock);
                if (IS_ERR(old))
                        return NULL;
                wait_on_inode(old);
                if (unlikely(inode_unhashed(old))) {
                        iput(old);
                        goto again;
                }
                return old;
        }

        if (set && unlikely(set(inode, data))) {
                inode = NULL;
                goto unlock;
        }

        /*
         * Return the locked inode with I_NEW set, the
         * caller is responsible for filling in the contents
         */
        spin_lock(&inode->i_lock);
        inode->i_state |= I_NEW;
        hlist_add_head(&inode->i_hash, head);
        spin_unlock(&inode->i_lock);
        if (!creating)
                inode_sb_list_add(inode);
unlock:
        spin_unlock(&inode_hash_lock);

        return inode;
}
EXPORT_SYMBOL(inode_insert5);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb:         super block of file system
 * @hashval:    hash value (usually inode number) to get
 * @test:       callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:       opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is a
 * generalized version of iget_locked() for file systems where the inode
 * number is not sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *), void *data)
{
        struct inode *inode = ilookup5(sb, hashval, test, data);

        if (!inode) {
                struct inode *new = alloc_inode(sb);

                if (new) {
                        new->i_state = 0;
                        inode = inode_insert5(new, hashval, test, set, data);
                        if (unlikely(inode != new))
                                destroy_inode(new);
                }
        }
        return inode;
}
EXPORT_SYMBOL(iget5_locked);
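/*
 * Illustrative sketch (not part of this file): a typical iget5_locked()
 * caller in a filesystem whose inode identity is wider than an inode number.
 * The "foofs_*" and "FOOFS_I()" names, and the lookup key, are hypothetical;
 * note that the test/set callbacks run under inode_hash_lock and therefore
 * must not sleep.
 *
 *      struct foofs_iget_args {
 *              u64 objectid;
 *      };
 *
 *      static int foofs_iget_test(struct inode *inode, void *opaque)
 *      {
 *              struct foofs_iget_args *args = opaque;
 *
 *              return FOOFS_I(inode)->objectid == args->objectid;
 *      }
 *
 *      static int foofs_iget_set(struct inode *inode, void *opaque)
 *      {
 *              struct foofs_iget_args *args = opaque;
 *
 *              FOOFS_I(inode)->objectid = args->objectid;
 *              inode->i_ino = (unsigned long)args->objectid;
 *              return 0;
 *      }
 *
 *      struct inode *foofs_iget(struct super_block *sb, u64 objectid)
 *      {
 *              struct foofs_iget_args args = { .objectid = objectid };
 *              struct inode *inode;
 *
 *              inode = iget5_locked(sb, (unsigned long)objectid,
 *                                   foofs_iget_test, foofs_iget_set, &args);
 *              if (!inode)
 *                      return ERR_PTR(-ENOMEM);
 *              if (!(inode->i_state & I_NEW))
 *                      return inode;           // found in the inode cache
 *
 *              if (foofs_read_inode_from_disk(inode)) {  // hypothetical
 *                      iget_failed(inode);
 *                      return ERR_PTR(-EIO);
 *              }
 *              unlock_new_inode(inode);
 *              return inode;
 *      }
 */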
/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:         super block of file system
 * @ino:        inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count. This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;
again:
        spin_lock(&inode_hash_lock);
        inode = find_inode_fast(sb, head, ino);
        spin_unlock(&inode_hash_lock);
        if (inode) {
                if (IS_ERR(inode))
                        return NULL;
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
                return inode;
        }

        inode = alloc_inode(sb);
        if (inode) {
                struct inode *old;

                spin_lock(&inode_hash_lock);
                /* We released the lock, so.. */
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        inode->i_ino = ino;
                        spin_lock(&inode->i_lock);
                        inode->i_state = I_NEW;
                        hlist_add_head(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        inode_sb_list_add(inode);
                        spin_unlock(&inode_hash_lock);

                        /* Return the locked inode with I_NEW set, the
                         * caller is responsible for filling in the contents
                         */
                        return inode;
                }

                /*
                 * Uhhuh, somebody else created the same inode under
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
                spin_unlock(&inode_hash_lock);
                destroy_inode(inode);
                if (IS_ERR(old))
                        return NULL;
                inode = old;
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        return inode;
}
EXPORT_SYMBOL(iget_locked);

/*
 * search the inode cache for a matching inode number.
 * If we find one, then the inode number we are trying to
 * allocate is not unique and so we should not use it.
 *
 * Returns 1 if the inode number is unique, 0 if it is not.
 */
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *b = inode_hashtable + hash(sb, ino);
        struct inode *inode;

        spin_lock(&inode_hash_lock);
        hlist_for_each_entry(inode, b, i_hash) {
                if (inode->i_ino == ino && inode->i_sb == sb) {
                        spin_unlock(&inode_hash_lock);
                        return 0;
                }
        }
        spin_unlock(&inode_hash_lock);

        return 1;
}

/**
 * iunique - get a unique inode number
 * @sb: superblock
 * @max_reserved: highest reserved inode number
 *
 * Obtain an inode number that is unique on the system for a given
 * superblock. This is used by file systems that have no natural
 * permanent inode numbering system. An inode number is returned that
 * is higher than the reserved limit but unique.
 *
 * BUGS:
 * With a large number of inodes live on the file system this function
 * currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
        /*
         * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
         * error if st_ino won't fit in target struct field. Use 32bit counter
         * here to attempt to avoid that.
         */
        static DEFINE_SPINLOCK(iunique_lock);
        static unsigned int counter;
        ino_t res;

        spin_lock(&iunique_lock);
        do {
                if (counter <= max_reserved)
                        counter = max_reserved + 1;
                res = counter++;
        } while (!test_inode_iunique(sb, res));
        spin_unlock(&iunique_lock);

        return res;
}
EXPORT_SYMBOL(iunique);

struct inode *igrab(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
                __iget(inode);
                spin_unlock(&inode->i_lock);
        } else {
                spin_unlock(&inode->i_lock);
                /*
                 * Handle the case where s_op->clear_inode has not been
                 * called yet, and somebody is calling igrab
                 * while the inode is getting freed.
                 */
                inode = NULL;
        }
        return inode;
}
EXPORT_SYMBOL(igrab);

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb:         super block of file system to search
 * @hashval:    hash value (usually inode number) to search for
 * @test:       callback used for comparisons between inodes
 * @data:       opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache.
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode. You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(sb, hashval);
        struct inode *inode;

        spin_lock(&inode_hash_lock);
        inode = find_inode(sb, head, test, data);
        spin_unlock(&inode_hash_lock);

        return IS_ERR(inode) ? NULL : inode;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb:         super block of file system to search
 * @hashval:    hash value (usually inode number) to search for
 * @test:       callback used for comparisons between inodes
 * @data:       opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return the inode with an incremented
 * reference count. Waits on I_NEW before returning the inode.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *inode;
again:
        inode = ilookup5_nowait(sb, hashval, test, data);
        if (inode) {
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb:         super block of file system to search
 * @ino:        inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;
again:
        spin_lock(&inode_hash_lock);
        inode = find_inode_fast(sb, head, ino);
        spin_unlock(&inode_hash_lock);

        if (inode) {
                if (IS_ERR(inode))
                        return NULL;
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        return inode;
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb:         super block of file system to search
 * @hashval:    hash value (usually inode number) to search for
 * @match:      callback used for comparisons between inodes
 * @data:       opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped. The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1. It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is an even more generalized version of ilookup5() when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller can not increment
 * the reference count because the resulting iput() might cause an
 * inode eviction. The tradeoff is that the @match function must be
 * very carefully implemented.
 */
struct inode *find_inode_nowait(struct super_block *sb,
                                unsigned long hashval,
                                int (*match)(struct inode *, unsigned long,
                                             void *),
                                void *data)
{
        struct hlist_head *head = inode_hashtable + hash(sb, hashval);
        struct inode *inode, *ret_inode = NULL;
        int mval;

        spin_lock(&inode_hash_lock);
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_sb != sb)
                        continue;
                mval = match(inode, hashval, data);
                if (mval == 0)
                        continue;
                if (mval == 1)
                        ret_inode = inode;
                goto out;
        }
out:
        spin_unlock(&inode_hash_lock);
        return ret_inode;
}
EXPORT_SYMBOL(find_inode_nowait);

int insert_inode_locked(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        ino_t ino = inode->i_ino;
        struct hlist_head *head = inode_hashtable + hash(sb, ino);

        while (1) {
                struct inode *old = NULL;
                spin_lock(&inode_hash_lock);
                hlist_for_each_entry(old, head, i_hash) {
                        if (old->i_ino != ino)
                                continue;
                        if (old->i_sb != sb)
                                continue;
                        spin_lock(&old->i_lock);
                        if (old->i_state & (I_FREEING|I_WILL_FREE)) {
                                spin_unlock(&old->i_lock);
                                continue;
                        }
                        break;
                }
                if (likely(!old)) {
                        spin_lock(&inode->i_lock);
                        inode->i_state |= I_NEW | I_CREATING;
                        hlist_add_head(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return 0;
                }
                if (unlikely(old->i_state & I_CREATING)) {
                        spin_unlock(&old->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return -EBUSY;
                }
                __iget(old);
                spin_unlock(&old->i_lock);
                spin_unlock(&inode_hash_lock);
                wait_on_inode(old);
                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
                iput(old);
        }
}
EXPORT_SYMBOL(insert_inode_locked);

int insert_inode_locked4(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *old;

        inode->i_state |= I_CREATING;
        old = inode_insert5(inode, hashval, test, NULL, data);

        if (old != inode) {
                iput(old);
                return -EBUSY;
        }
        return 0;
}
EXPORT_SYMBOL(insert_inode_locked4);


int generic_delete_inode(struct inode *inode)
{
        return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour. If it tells
 * us to evict inode, do so. Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 */
static void iput_final(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        const struct super_operations *op = inode->i_sb->s_op;
        int drop;

        WARN_ON(inode->i_state & I_NEW);

        if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);

        if (!drop && (sb->s_flags & SB_ACTIVE)) {
                inode_add_lru(inode);
                spin_unlock(&inode->i_lock);
                return;
        }

        if (!drop) {
                inode->i_state |= I_WILL_FREE;
                spin_unlock(&inode->i_lock);
                write_inode_now(inode, 1);
                spin_lock(&inode->i_lock);
                WARN_ON(inode->i_state & I_NEW);
                inode->i_state &= ~I_WILL_FREE;
        }

        inode->i_state |= I_FREEING;
        if (!list_empty(&inode->i_lru))
                inode_lru_list_del(inode);
        spin_unlock(&inode->i_lock);

        evict(inode);
}

/**
 * iput - put an inode
 * @inode: inode to put
 *
 * Puts an inode, dropping its usage count. If the inode use count hits
 * zero, the inode is then freed and may also be destroyed.
 *
 * Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
        if (!inode)
                return;
        BUG_ON(inode->i_state & I_CLEAR);
retry:
        if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
                if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
                        atomic_inc(&inode->i_count);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_lazytime_iput(inode);
                        mark_inode_dirty_sync(inode);
                        goto retry;
                }
                iput_final(inode);
        }
}
EXPORT_SYMBOL(iput);

/**
 * bmap - find a block number in a file
 * @inode:  inode of file
 * @block: block to find
 *
 * Returns the block number on the device holding the inode that
 * is the disk block number for the block of the file requested.
 * That is, asked for block 4 of inode 1 the function will return the
 * disk block relative to the disk start that holds that block of the
 * file.
 */
sector_t bmap(struct inode *inode, sector_t block)
{
        sector_t res = 0;
        if (inode->i_mapping->a_ops->bmap)
                res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
        return res;
}
EXPORT_SYMBOL(bmap);

/*
 * With relative atime, only update atime if the previous atime is
 * earlier than either the ctime or mtime or if at least a day has
 * passed since the last atime update.
 */
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
                                struct timespec64 now)
{

        if (!(mnt->mnt_flags & MNT_RELATIME))
                return 1;
        /*
         * Is mtime younger than atime? If yes, update atime:
         */
        if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
                return 1;
        /*
         * Is ctime younger than atime? If yes, update atime:
         */
        if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
                return 1;

        /*
         * Is the previous atime value older than a day? If yes,
If yes, 1638 * update atime: 1639 */ 1640 if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) 1641 return 1; 1642 /* 1643 * Good, we can skip the atime update: 1644 */ 1645 return 0; 1646 } 1647 1648 int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) 1649 { 1650 int iflags = I_DIRTY_TIME; 1651 bool dirty = false; 1652 1653 if (flags & S_ATIME) 1654 inode->i_atime = *time; 1655 if (flags & S_VERSION) 1656 dirty = inode_maybe_inc_iversion(inode, false); 1657 if (flags & S_CTIME) 1658 inode->i_ctime = *time; 1659 if (flags & S_MTIME) 1660 inode->i_mtime = *time; 1661 if ((flags & (S_ATIME | S_CTIME | S_MTIME)) && 1662 !(inode->i_sb->s_flags & SB_LAZYTIME)) 1663 dirty = true; 1664 1665 if (dirty) 1666 iflags |= I_DIRTY_SYNC; 1667 __mark_inode_dirty(inode, iflags); 1668 return 0; 1669 } 1670 EXPORT_SYMBOL(generic_update_time); 1671 1672 /* 1673 * This does the actual work of updating an inodes time or version. Must have 1674 * had called mnt_want_write() before calling this. 1675 */ 1676 static int update_time(struct inode *inode, struct timespec64 *time, int flags) 1677 { 1678 int (*update_time)(struct inode *, struct timespec64 *, int); 1679 1680 update_time = inode->i_op->update_time ? inode->i_op->update_time : 1681 generic_update_time; 1682 1683 return update_time(inode, time, flags); 1684 } 1685 1686 /** 1687 * touch_atime - update the access time 1688 * @path: the &struct path to update 1689 * @inode: inode to update 1690 * 1691 * Update the accessed time on an inode and mark it for writeback. 1692 * This function automatically handles read only file systems and media, 1693 * as well as the "noatime" flag and inode specific "noatime" markers. 1694 */ 1695 bool atime_needs_update(const struct path *path, struct inode *inode) 1696 { 1697 struct vfsmount *mnt = path->mnt; 1698 struct timespec64 now; 1699 1700 if (inode->i_flags & S_NOATIME) 1701 return false; 1702 1703 /* Atime updates will likely cause i_uid and i_gid to be written 1704 * back improprely if their true value is unknown to the vfs. 1705 */ 1706 if (HAS_UNMAPPED_ID(inode)) 1707 return false; 1708 1709 if (IS_NOATIME(inode)) 1710 return false; 1711 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) 1712 return false; 1713 1714 if (mnt->mnt_flags & MNT_NOATIME) 1715 return false; 1716 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1717 return false; 1718 1719 now = current_time(inode); 1720 1721 if (!relatime_need_update(mnt, inode, now)) 1722 return false; 1723 1724 if (timespec64_equal(&inode->i_atime, &now)) 1725 return false; 1726 1727 return true; 1728 } 1729 1730 void touch_atime(const struct path *path) 1731 { 1732 struct vfsmount *mnt = path->mnt; 1733 struct inode *inode = d_inode(path->dentry); 1734 struct timespec64 now; 1735 1736 if (!atime_needs_update(path, inode)) 1737 return; 1738 1739 if (!sb_start_write_trylock(inode->i_sb)) 1740 return; 1741 1742 if (__mnt_want_write(mnt) != 0) 1743 goto skip_update; 1744 /* 1745 * File systems can error out when updating inodes if they need to 1746 * allocate new space to modify an inode (such is the case for 1747 * Btrfs), but since we touch atime while walking down the path we 1748 * really don't care if we failed to update the atime of the file, 1749 * so just ignore the return value. 1750 * We may also fail on filesystems that have the ability to make parts 1751 * of the fs read only, e.g. subvolumes in Btrfs. 
        now = current_time(inode);
        update_time(inode, &now, S_ATIME);
        __mnt_drop_write(mnt);
skip_update:
        sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);

/*
 * The logic we want is
 *
 *      if suid or (sgid and xgrp)
 *              remove privs
 */
int should_remove_suid(struct dentry *dentry)
{
        umode_t mode = d_inode(dentry)->i_mode;
        int kill = 0;

        /* suid always must be killed */
        if (unlikely(mode & S_ISUID))
                kill = ATTR_KILL_SUID;

        /*
         * sgid without any exec bits is just a mandatory locking mark; leave
         * it alone. If some exec bits are set, it's a real sgid; kill it.
         */
        if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
                kill |= ATTR_KILL_SGID;

        if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
                return kill;

        return 0;
}
EXPORT_SYMBOL(should_remove_suid);

/*
 * Return mask of changes for notify_change() that need to be done as a
 * response to write or truncate. Return 0 if nothing has to be changed.
 * Negative value on error (change should be denied).
 */
int dentry_needs_remove_privs(struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        int mask = 0;
        int ret;

        if (IS_NOSEC(inode))
                return 0;

        mask = should_remove_suid(dentry);
        ret = security_inode_need_killpriv(dentry);
        if (ret < 0)
                return ret;
        if (ret)
                mask |= ATTR_KILL_PRIV;
        return mask;
}

static int __remove_privs(struct dentry *dentry, int kill)
{
        struct iattr newattrs;

        newattrs.ia_valid = ATTR_FORCE | kill;
        /*
         * Note we call this on write, so notify_change will not
         * encounter any conflicting delegations:
         */
        return notify_change(dentry, &newattrs, NULL);
}

/*
 * Remove special file privileges (suid, capabilities) when file is written
 * to or truncated.
 */
int file_remove_privs(struct file *file)
{
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = file_inode(file);
        int kill;
        int error = 0;

        /*
         * Fast path for nothing security related.
         * As well for non-regular files, e.g. blkdev inodes.
         * For example, blkdev_write_iter() might get here
         * trying to remove privs which it is not allowed to.
         */
        if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
                return 0;

        kill = dentry_needs_remove_privs(dentry);
        if (kill < 0)
                return kill;
        if (kill)
                error = __remove_privs(dentry, kill);
        if (!error)
                inode_has_no_xattr(inode);

        return error;
}
EXPORT_SYMBOL(file_remove_privs);
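/*
 * Illustrative sketch (not part of this file): how a filesystem write path
 * might combine the helpers in this area. file_modified(), defined below,
 * bundles file_remove_privs() with file_update_time(); the "foofs_*" name is
 * hypothetical and error handling is reduced to the essentials.
 *
 *      static ssize_t foofs_write_iter(struct kiocb *iocb, struct iov_iter *from)
 *      {
 *              struct inode *inode = file_inode(iocb->ki_filp);
 *              ssize_t ret;
 *
 *              inode_lock(inode);
 *              ret = generic_write_checks(iocb, from);
 *              if (ret <= 0)
 *                      goto out;
 *              ret = file_modified(iocb->ki_filp);  // strip privs, bump times
 *              if (ret)
 *                      goto out;
 *              ret = __generic_file_write_iter(iocb, from);
 *      out:
 *              inode_unlock(inode);
 *              return ret;
 *      }
 */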
/**
 * file_update_time - update mtime and ctime time
 * @file: file accessed
 *
 * Update the mtime and ctime members of an inode and mark the inode for
 * writeback. Note that this function is meant exclusively for usage in
 * the file write path of filesystems, and filesystems may choose to
 * explicitly ignore updates via this function with the S_NOCMTIME inode
 * flag, e.g. for network filesystems where these timestamps are handled
 * by the server. This can return an error for file systems that need to
 * allocate space in order to update an inode.
 */

int file_update_time(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct timespec64 now;
        int sync_it = 0;
        int ret;

        /* First try to exhaust all avenues to not sync */
        if (IS_NOCMTIME(inode))
                return 0;

        now = current_time(inode);
        if (!timespec64_equal(&inode->i_mtime, &now))
                sync_it = S_MTIME;

        if (!timespec64_equal(&inode->i_ctime, &now))
                sync_it |= S_CTIME;

        if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
                sync_it |= S_VERSION;

        if (!sync_it)
                return 0;

        /* Finally allowed to write? Takes lock. */
        if (__mnt_want_write_file(file))
                return 0;

        ret = update_time(inode, &now, sync_it);
        __mnt_drop_write_file(file);

        return ret;
}
EXPORT_SYMBOL(file_update_time);

/* Caller must hold the file's inode lock */
int file_modified(struct file *file)
{
        int err;

        /*
         * Clear the security bits if the process is not being run by root.
         * This keeps people from modifying setuid and setgid binaries.
         */
        err = file_remove_privs(file);
        if (err)
                return err;

        if (unlikely(file->f_mode & FMODE_NOCMTIME))
                return 0;

        return file_update_time(file);
}
EXPORT_SYMBOL(file_modified);

int inode_needs_sync(struct inode *inode)
{
        if (IS_SYNC(inode))
                return 1;
        if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
                return 1;
        return 0;
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found. This function waits
 * until the deletion _might_ have completed. Callers are responsible
 * for rechecking inode state.
 *
 * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
        wait_queue_head_t *wq;
        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
        wq = bit_waitqueue(&inode->i_state, __I_NEW);
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
        spin_lock(&inode_hash_lock);
}

static __initdata unsigned long ihash_entries;
static int __init set_ihash_entries(char *str)
{
        if (!str)
                return 0;
        ihash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("ihash_entries=", set_ihash_entries);

/*
 * Initialize the waitqueues and inode hash table.
 */
void __init inode_init_early(void)
{
        /* If hashes are distributed across NUMA nodes, defer
         * hash allocation until vmalloc space is available.
         */
int inode_needs_sync(struct inode *inode)
{
	if (IS_SYNC(inode))
		return 1;
	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
		return 1;
	return 0;
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found.  This function waits
 * until the deletion _might_ have completed.  Callers are responsible
 * for rechecking the inode state.
 *
 * It doesn't matter if I_NEW is not set initially; a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
	wq = bit_waitqueue(&inode->i_state, __I_NEW);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
	spin_lock(&inode_hash_lock);
}

static __initdata unsigned long ihash_entries;
static int __init set_ihash_entries(char *str)
{
	if (!str)
		return 0;
	ihash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("ihash_entries=", set_ihash_entries);

/*
 * Initialize the waitqueues and inode hash table.
 */
void __init inode_init_early(void)
{
	/* If hashes are distributed across NUMA nodes, defer
	 * hash allocation until vmalloc space is available.
	 */
	if (hashdist)
		return;

	inode_hashtable =
		alloc_large_system_hash("Inode-cache",
					sizeof(struct hlist_head),
					ihash_entries,
					14,
					HASH_EARLY | HASH_ZERO,
					&i_hash_shift,
					&i_hash_mask,
					0,
					0);
}

void __init inode_init(void)
{
	/* inode slab cache */
	inode_cachep = kmem_cache_create("inode_cache",
					 sizeof(struct inode),
					 0,
					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
					 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
					 init_once);

	/* Hash may have been set up in inode_init_early */
	if (!hashdist)
		return;

	inode_hashtable =
		alloc_large_system_hash("Inode-cache",
					sizeof(struct hlist_head),
					ihash_entries,
					14,
					HASH_ZERO,
					&i_hash_shift,
					&i_hash_mask,
					0,
					0);
}

void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
	inode->i_mode = mode;
	if (S_ISCHR(mode)) {
		inode->i_fop = &def_chr_fops;
		inode->i_rdev = rdev;
	} else if (S_ISBLK(mode)) {
		inode->i_fop = &def_blk_fops;
		inode->i_rdev = rdev;
	} else if (S_ISFIFO(mode))
		inode->i_fop = &pipefifo_fops;
	else if (S_ISSOCK(mode))
		;	/* leave it no_open_fops */
	else
		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
				  inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);

/**
 * inode_init_owner - Init uid, gid and mode for a new inode according to POSIX standards
 * @inode: New inode
 * @dir: Directory inode
 * @mode: mode of the new inode
 */
void inode_init_owner(struct inode *inode, const struct inode *dir,
			umode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;

		/* Directories are special, and always inherit S_ISGID */
		if (S_ISDIR(mode))
			mode |= S_ISGID;
		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
			 !in_group_p(inode->i_gid) &&
			 !capable_wrt_inode_uidgid(dir, CAP_FSETID))
			mode &= ~S_ISGID;
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);
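/*
 * Illustrative sketch (editor's addition; example_fs_mknod() is a
 * hypothetical name, not part of this file): a simple in-memory
 * filesystem's ->mknod() typically allocates an inode, lets
 * inode_init_owner() pick the owner and mode, and hands device special
 * files to init_special_inode():
 *
 *	static int example_fs_mknod(struct inode *dir, struct dentry *dentry,
 *				    umode_t mode, dev_t rdev)
 *	{
 *		struct inode *inode = new_inode(dir->i_sb);
 *
 *		if (!inode)
 *			return -ENOMEM;
 *
 *		inode->i_ino = get_next_ino();
 *		inode_init_owner(inode, dir, mode);
 *		inode->i_atime = inode->i_mtime = inode->i_ctime =
 *							current_time(inode);
 *		init_special_inode(inode, inode->i_mode, rdev);
 *		d_instantiate(dentry, inode);
 *		return 0;
 *	}
 */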
/**
 * inode_owner_or_capable - check current task permissions to inode
 * @inode: inode being checked
 *
 * Return true if current either has CAP_FOWNER in a namespace with the
 * inode owner uid mapped, or owns the file.
 */
bool inode_owner_or_capable(const struct inode *inode)
{
	struct user_namespace *ns;

	if (uid_eq(current_fsuid(), inode->i_uid))
		return true;

	ns = current_user_ns();
	if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
		return true;
	return false;
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */
static void __inode_dio_wait(struct inode *inode)
{
	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

	do {
		prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&inode->i_dio_count))
			schedule();
	} while (atomic_read(&inode->i_dio_count));
	finish_wait(wq, &q.wq_entry);
}

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Waits for all pending direct I/O requests to finish so that we can
 * proceed with a truncate or equivalent operation.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
	if (atomic_read(&inode->i_dio_count))
		__inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);
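/*
 * Illustrative sketch (editor's addition; example_fs_setattr() and
 * example_fs_truncate_blocks() are hypothetical names, not part of this
 * file): a truncate path typically drains direct I/O with inode_dio_wait()
 * from ->setattr(), where notify_change() has already taken the inode lock
 * for us, satisfying the locking requirement noted above:
 *
 *	static int example_fs_setattr(struct dentry *dentry, struct iattr *attr)
 *	{
 *		struct inode *inode = d_inode(dentry);
 *		int err = setattr_prepare(dentry, attr);
 *
 *		if (err)
 *			return err;
 *
 *		if (attr->ia_valid & ATTR_SIZE) {
 *			inode_dio_wait(inode);
 *			truncate_setsize(inode, attr->ia_size);
 *			example_fs_truncate_blocks(inode, attr->ia_size);
 *		}
 *		setattr_copy(inode, attr);
 *		mark_inode_dirty(inode);
 *		return 0;
 *	}
 */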
/*
 * inode_set_flags - atomically set some inode flags
 *
 * Note: the caller should be holding i_mutex, or else be sure that they have
 * exclusive access to the inode structure (i.e., while the inode is being
 * instantiated).  The reason for the cmpxchg() loop (which wouldn't be
 * necessary if all code paths which modify i_flags actually followed this
 * rule) is that there is at least one code path which doesn't today, so we
 * use cmpxchg() out of an abundance of caution.
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
 */
void inode_set_flags(struct inode *inode, unsigned int flags,
		     unsigned int mask)
{
	WARN_ON_ONCE(flags & ~mask);
	set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);

void inode_nohighmem(struct inode *inode)
{
	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);

/**
 * timespec64_trunc - Truncate timespec64 to a granularity
 * @t: Timespec64
 * @gran: Granularity in ns.
 *
 * Truncate a timespec64 to a granularity. Always rounds down. gran must
 * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
 */
struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran)
{
	/* Avoid division in the common cases 1 ns and 1 s. */
	if (gran == 1) {
		/* nothing */
	} else if (gran == NSEC_PER_SEC) {
		t.tv_nsec = 0;
	} else if (gran > 1 && gran < NSEC_PER_SEC) {
		t.tv_nsec -= t.tv_nsec % gran;
	} else {
		WARN(1, "illegal file time granularity: %u", gran);
	}
	return t;
}
EXPORT_SYMBOL(timespec64_trunc);

/**
 * timestamp_truncate - Truncate timespec to a granularity
 * @t: Timespec
 * @inode: inode being updated
 *
 * Truncate a timespec to the granularity supported by the fs
 * containing the inode. Always rounds down. The granularity must
 * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	unsigned int gran = sb->s_time_gran;

	t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
	if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
		t.tv_nsec = 0;

	/* Avoid division in the common cases 1 ns and 1 s. */
	if (gran == 1)
		; /* nothing */
	else if (gran == NSEC_PER_SEC)
		t.tv_nsec = 0;
	else if (gran > 1 && gran < NSEC_PER_SEC)
		t.tv_nsec -= t.tv_nsec % gran;
	else
		WARN(1, "invalid file time granularity: %u", gran);
	return t;
}
EXPORT_SYMBOL(timestamp_truncate);

/**
 * current_time - Return FS time
 * @inode: inode.
 *
 * Return the current time truncated to the time granularity supported by
 * the fs.
 *
 * Note that inode and inode->i_sb cannot be NULL.
 * Otherwise, the function warns and returns the time without truncation.
 */
struct timespec64 current_time(struct inode *inode)
{
	struct timespec64 now;

	ktime_get_coarse_real_ts64(&now);

	if (unlikely(!inode->i_sb)) {
		WARN(1, "current_time() called with uninitialized super_block in the inode");
		return now;
	}

	return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);

/*
 * Generic function to check FS_IOC_SETFLAGS values and reject any invalid
 * configurations.
 *
 * Note: the caller should be holding i_mutex, or else be sure that they have
 * exclusive access to the inode structure.
 */
int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags,
			     unsigned int flags)
{
	/*
	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
	 * a process with the relevant capability.
	 *
	 * This test looks nicer. Thanks to Pauline Middelink
	 */
	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return -EPERM;

	return 0;
}
EXPORT_SYMBOL(vfs_ioc_setflags_prepare);
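/*
 * Illustrative sketch (editor's addition; example_fs_ioc_setflags(),
 * example_fs_flags_to_fsflags() and example_fs_apply_fsflags() are
 * hypothetical names, not part of this file): an FS_IOC_SETFLAGS ioctl
 * handler typically snapshots the old flags under the inode lock, lets
 * vfs_ioc_setflags_prepare() veto privileged changes, and only then
 * commits the new flags:
 *
 *	static int example_fs_ioc_setflags(struct file *file, unsigned int flags)
 *	{
 *		struct inode *inode = file_inode(file);
 *		unsigned int oldflags;
 *		int err;
 *
 *		if (!inode_owner_or_capable(inode))
 *			return -EPERM;
 *
 *		err = mnt_want_write_file(file);
 *		if (err)
 *			return err;
 *
 *		inode_lock(inode);
 *		oldflags = example_fs_flags_to_fsflags(inode);
 *		err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
 *		if (!err)
 *			err = example_fs_apply_fsflags(inode, flags);
 *		inode_unlock(inode);
 *		mnt_drop_write_file(file);
 *		return err;
 *	}
 */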
/*
 * Generic function to check FS_IOC_FSSETXATTR values and reject any invalid
 * configurations.
 *
 * Note: the caller should be holding i_mutex, or else be sure that they have
 * exclusive access to the inode structure.
 */
int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
			     struct fsxattr *fa)
{
	/*
	 * Can't modify an immutable/append-only file unless we have
	 * appropriate permission.
	 */
	if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
			(FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return -EPERM;

	/*
	 * Project Quota ID state is only allowed to change from within the init
	 * namespace. Enforce that restriction only if we are trying to change
	 * the quota ID state. Everything else is allowed in user namespaces.
	 */
	if (current_user_ns() != &init_user_ns) {
		if (old_fa->fsx_projid != fa->fsx_projid)
			return -EINVAL;
		if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
				FS_XFLAG_PROJINHERIT)
			return -EINVAL;
	}

	/* Check extent size hints. */
	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
		return -EINVAL;

	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
			!S_ISDIR(inode->i_mode))
		return -EINVAL;

	if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
	    !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
		return -EINVAL;

	/*
	 * It is only valid to set the DAX flag on regular files and
	 * directories.
	 */
	if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
	    !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
		return -EINVAL;

	/* Extent size hints of zero turn off the flags. */
	if (fa->fsx_extsize == 0)
		fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
	if (fa->fsx_cowextsize == 0)
		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;

	return 0;
}
EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
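/*
 * Illustrative sketch (editor's addition; example_fs_ioc_fssetxattr(),
 * example_fs_fill_fsxattr() and example_fs_apply_fsxattr() are hypothetical
 * names, not part of this file): an FS_IOC_FSSETXATTR handler typically
 * fills in the current fsxattr state, lets vfs_ioc_fssetxattr_check()
 * validate the requested change, and only then applies it:
 *
 *	static int example_fs_ioc_fssetxattr(struct file *file,
 *					     struct fsxattr *fa)
 *	{
 *		struct inode *inode = file_inode(file);
 *		struct fsxattr old_fa;
 *		int err;
 *
 *		if (!inode_owner_or_capable(inode))
 *			return -EPERM;
 *
 *		err = mnt_want_write_file(file);
 *		if (err)
 *			return err;
 *
 *		inode_lock(inode);
 *		example_fs_fill_fsxattr(inode, &old_fa);
 *		err = vfs_ioc_fssetxattr_check(inode, &old_fa, fa);
 *		if (!err)
 *			err = example_fs_apply_fsxattr(inode, fa);
 *		inode_unlock(inode);
 *		mnt_drop_write_file(file);
 *		return err;
 *	}
 */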