/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/bootmem.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/prefetch.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <trace/events/writeback.h>
#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget()
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode_sb_list_lock protects:
 *   sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode_sb_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode_sb_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

static unsigned int i_hash_mask __read_mostly;
static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);

/*
 * Empty aops. Can be used for the cases where the user does not
 * define any of the address_space operations.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

/*
 * Statistics gathering..
 */
struct inodes_stat_t inodes_stat;

static DEFINE_PER_CPU(unsigned long, nr_inodes);
static DEFINE_PER_CPU(unsigned long, nr_unused);

static struct kmem_cache *inode_cachep __read_mostly;

static long get_nr_inodes(void)
{
	int i;
	long sum = 0;
	for_each_possible_cpu(i)
		sum += per_cpu(nr_inodes, i);
	return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
	int i;
	long sum = 0;
	for_each_possible_cpu(i)
		sum += per_cpu(nr_unused, i);
	return sum < 0 ? 0 : sum;
}

long get_nr_dirty_inodes(void)
{
	/* not actually dirty inodes, but a wild approximation */
	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
	return nr_dirty > 0 ? nr_dirty : 0;
}

/*
 * Handle nr_inode sysctl
 */
#ifdef CONFIG_SYSCTL
int proc_nr_inodes(struct ctl_table *table, int write,
		   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	inodes_stat.nr_inodes = get_nr_inodes();
	inodes_stat.nr_unused = get_nr_inodes_unused();
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif

static int no_open(struct inode *inode, struct file *file)
{
	return -ENXIO;
}

/**
 * inode_init_always - perform inode structure initialisation
 * @sb: superblock inode belongs to
 * @inode: inode to initialise
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
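 *
 * Returns 0 on success; -ENOMEM if security_inode_alloc() fails, in which
 * case the caller is responsible for freeing the partially set up inode.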
 */
int inode_init_always(struct super_block *sb, struct inode *inode)
{
	static const struct inode_operations empty_iops;
	static const struct file_operations no_open_fops = {.open = no_open};
	struct address_space *const mapping = &inode->i_data;

	inode->i_sb = sb;
	inode->i_blkbits = sb->s_blocksize_bits;
	inode->i_flags = 0;
	atomic_set(&inode->i_count, 1);
	inode->i_op = &empty_iops;
	inode->i_fop = &no_open_fops;
	inode->__i_nlink = 1;
	inode->i_opflags = 0;
	i_uid_write(inode, 0);
	i_gid_write(inode, 0);
	atomic_set(&inode->i_writecount, 0);
	inode->i_size = 0;
	inode->i_blocks = 0;
	inode->i_bytes = 0;
	inode->i_generation = 0;
	inode->i_pipe = NULL;
	inode->i_bdev = NULL;
	inode->i_cdev = NULL;
	inode->i_rdev = 0;
	inode->dirtied_when = 0;

	if (security_inode_alloc(inode))
		goto out;
	spin_lock_init(&inode->i_lock);
	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

	mutex_init(&inode->i_mutex);
	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);

	atomic_set(&inode->i_dio_count, 0);

	mapping->a_ops = &empty_aops;
	mapping->host = inode;
	mapping->flags = 0;
	atomic_set(&mapping->i_mmap_writable, 0);
	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
	mapping->private_data = NULL;
	mapping->writeback_index = 0;
	inode->i_private = NULL;
	inode->i_mapping = mapping;
	INIT_HLIST_HEAD(&inode->i_dentry);	/* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
	inode->i_fsnotify_mask = 0;
#endif
	inode->i_flctx = NULL;
	this_cpu_inc(nr_inodes);

	return 0;
out:
	return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);

static struct inode *alloc_inode(struct super_block *sb)
{
	struct inode *inode;

	if (sb->s_op->alloc_inode)
		inode = sb->s_op->alloc_inode(sb);
	else
		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);

	if (!inode)
		return NULL;

	if (unlikely(inode_init_always(sb, inode))) {
		if (inode->i_sb->s_op->destroy_inode)
			inode->i_sb->s_op->destroy_inode(inode);
		else
			kmem_cache_free(inode_cachep, inode);
		return NULL;
	}

	return inode;
}

void free_inode_nonrcu(struct inode *inode)
{
	kmem_cache_free(inode_cachep, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);

void __destroy_inode(struct inode *inode)
{
	BUG_ON(inode_has_buffers(inode));
	security_inode_free(inode);
	fsnotify_inode_delete(inode);
	locks_free_lock_context(inode->i_flctx);
	if (!inode->i_nlink) {
		WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
		atomic_long_dec(&inode->i_sb->s_remove_count);
	}

#ifdef CONFIG_FS_POSIX_ACL
	if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
		posix_acl_release(inode->i_acl);
	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
		posix_acl_release(inode->i_default_acl);
#endif
	this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);

static void i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);
	kmem_cache_free(inode_cachep, inode);
}

static void destroy_inode(struct inode *inode)
{
	BUG_ON(!list_empty(&inode->i_lru));
	__destroy_inode(inode);
	if (inode->i_sb->s_op->destroy_inode)
		inode->i_sb->s_op->destroy_inode(inode);
	else
		call_rcu(&inode->i_rcu, i_callback);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  In cases
 * where we are attempting to track writes to the
 * filesystem, a decrement to zero means an imminent
 * write when the file is truncated and actually unlinked
 * on the filesystem.
 */
void drop_nlink(struct inode *inode)
{
	WARN_ON(inode->i_nlink == 0);
	inode->__i_nlink--;
	if (!inode->i_nlink)
		atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  See
 * drop_nlink() for why we care about i_nlink hitting zero.
 */
void clear_nlink(struct inode *inode)
{
	if (inode->i_nlink) {
		inode->__i_nlink = 0;
		atomic_long_inc(&inode->i_sb->s_remove_count);
	}
}
EXPORT_SYMBOL(clear_nlink);

/**
 * set_nlink - directly set an inode's link count
 * @inode: inode
 * @nlink: new nlink (should be non-zero)
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.
 */
void set_nlink(struct inode *inode, unsigned int nlink)
{
	if (!nlink) {
		clear_nlink(inode);
	} else {
		/* Yes, some filesystems do change nlink from zero to one */
		if (inode->i_nlink == 0)
			atomic_long_dec(&inode->i_sb->s_remove_count);

		inode->__i_nlink = nlink;
	}
}
EXPORT_SYMBOL(set_nlink);

/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * This is a low-level filesystem helper to replace any
 * direct filesystem manipulation of i_nlink.  Currently,
 * it is only here for parity with dec_nlink().
 */
void inc_nlink(struct inode *inode)
{
	if (unlikely(inode->i_nlink == 0)) {
		WARN_ON(!(inode->i_state & I_LINKABLE));
		atomic_long_dec(&inode->i_sb->s_remove_count);
	}

	inode->__i_nlink++;
}
EXPORT_SYMBOL(inc_nlink);

void address_space_init_once(struct address_space *mapping)
{
	memset(mapping, 0, sizeof(*mapping));
	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
	spin_lock_init(&mapping->tree_lock);
	init_rwsem(&mapping->i_mmap_rwsem);
	INIT_LIST_HEAD(&mapping->private_list);
	spin_lock_init(&mapping->private_lock);
	mapping->i_mmap = RB_ROOT;
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so let the slab cache be aware of that.
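 * inode_init_once() is wired up as the slab constructor via init_once()
 * below, so these fields are set up when a slab object is first created
 * rather than on every allocation.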
 */
void inode_init_once(struct inode *inode)
{
	memset(inode, 0, sizeof(*inode));
	INIT_HLIST_NODE(&inode->i_hash);
	INIT_LIST_HEAD(&inode->i_devices);
	INIT_LIST_HEAD(&inode->i_wb_list);
	INIT_LIST_HEAD(&inode->i_lru);
	address_space_init_once(&inode->i_data);
	i_size_ordered_init(inode);
#ifdef CONFIG_FSNOTIFY
	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
#endif
}
EXPORT_SYMBOL(inode_init_once);

static void init_once(void *foo)
{
	struct inode *inode = (struct inode *) foo;

	inode_init_once(inode);
}

/*
 * inode->i_lock must be held
 */
void __iget(struct inode *inode)
{
	atomic_inc(&inode->i_count);
}

/*
 * get additional reference to inode; caller must already hold one.
 */
void ihold(struct inode *inode)
{
	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
}
EXPORT_SYMBOL(ihold);

static void inode_lru_list_add(struct inode *inode)
{
	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
		this_cpu_inc(nr_unused);
}

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * Needs inode->i_lock held.
 */
void inode_add_lru(struct inode *inode)
{
	if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
				I_FREEING | I_WILL_FREE)) &&
	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
		inode_lru_list_add(inode);
}

static void inode_lru_list_del(struct inode *inode)
{
	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
		this_cpu_dec(nr_unused);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 */
void inode_sb_list_add(struct inode *inode)
{
	spin_lock(&inode_sb_list_lock);
	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
	spin_unlock(&inode_sb_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

static inline void inode_sb_list_del(struct inode *inode)
{
	if (!list_empty(&inode->i_sb_list)) {
		spin_lock(&inode_sb_list_lock);
		list_del_init(&inode->i_sb_list);
		spin_unlock(&inode_sb_list_lock);
	}
}

static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
	unsigned long tmp;

	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
			L1_CACHE_BYTES;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
	return tmp & i_hash_mask;
}

/**
 * __insert_inode_hash - hash an inode
 * @inode: unhashed inode
 * @hashval: unsigned long value used to locate this object in the
 *	inode_hashtable.
 *
 * Add an inode to the inode hash for this superblock.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);

	spin_lock(&inode_hash_lock);
	spin_lock(&inode->i_lock);
	hlist_add_head(&inode->i_hash, b);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);

/**
 * __remove_inode_hash - remove an inode from the hash
 * @inode: inode to unhash
 *
 * Remove an inode from the superblock.
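 *
 * Both inode_hash_lock and inode->i_lock are taken here, in the order
 * documented in the locking rules at the top of this file.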
 */
void __remove_inode_hash(struct inode *inode)
{
	spin_lock(&inode_hash_lock);
	spin_lock(&inode->i_lock);
	hlist_del_init(&inode->i_hash);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

void clear_inode(struct inode *inode)
{
	might_sleep();
	/*
	 * We have to cycle tree_lock here because reclaim can be still in the
	 * process of removing the last page (in __delete_from_page_cache())
	 * and we must not free mapping under it.
	 */
	spin_lock_irq(&inode->i_data.tree_lock);
	BUG_ON(inode->i_data.nrpages);
	BUG_ON(inode->i_data.nrshadows);
	spin_unlock_irq(&inode->i_data.tree_lock);
	BUG_ON(!list_empty(&inode->i_data.private_list));
	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(inode->i_state & I_CLEAR);
	/* don't need i_lock here, no concurrent mods to i_state */
	inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);

/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to. We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode). The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache. This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 */
static void evict(struct inode *inode)
{
	const struct super_operations *op = inode->i_sb->s_op;

	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(!list_empty(&inode->i_lru));

	if (!list_empty(&inode->i_wb_list))
		inode_wb_list_del(inode);

	inode_sb_list_del(inode);

	/*
	 * Wait for flusher thread to be done with the inode so that filesystem
	 * does not start destroying it while writeback is still running. Since
	 * the inode has I_FREEING set, flusher thread won't start new work on
	 * the inode.  We just have to wait for running writeback to finish.
	 */
	inode_wait_for_writeback(inode);

	if (op->evict_inode) {
		op->evict_inode(inode);
	} else {
		truncate_inode_pages_final(&inode->i_data);
		clear_inode(inode);
	}
	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
		bd_forget(inode);
	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
		cd_forget(inode);

	remove_inode_hash(inode);

	spin_lock(&inode->i_lock);
	wake_up_bit(&inode->i_state, __I_NEW);
	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
	spin_unlock(&inode->i_lock);

	destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * Dispose-list gets a local list with local inodes in it, so it doesn't
 * need to worry about list corruption and SMP locks.
 */
static void dispose_list(struct list_head *head)
{
	while (!list_empty(head)) {
		struct inode *inode;

		inode = list_first_entry(head, struct inode, i_lru);
		list_del_init(&inode->i_lru);

		evict(inode);
	}
}

/**
 * evict_inodes - evict all evictable inodes for a superblock
 * @sb: superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained.  This is
 * called by superblock shutdown after having MS_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
 */
void evict_inodes(struct super_block *sb)
{
	struct inode *inode, *next;
	LIST_HEAD(dispose);

	spin_lock(&inode_sb_list_lock);
	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
		if (atomic_read(&inode->i_count))
			continue;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			continue;
		}

		inode->i_state |= I_FREEING;
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);
		list_add(&inode->i_lru, &dispose);
	}
	spin_unlock(&inode_sb_list_lock);

	dispose_list(&dispose);
}

/**
 * invalidate_inodes - attempt to free all inodes on a superblock
 * @sb: superblock to operate on
 * @kill_dirty: flag to guide handling of dirty inodes
 *
 * Attempts to free all inodes for a given superblock.  If there were any
 * busy inodes return a non-zero value, else zero.
 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
 * them as busy.
 */
int invalidate_inodes(struct super_block *sb, bool kill_dirty)
{
	int busy = 0;
	struct inode *inode, *next;
	LIST_HEAD(dispose);

	spin_lock(&inode_sb_list_lock);
	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
			spin_unlock(&inode->i_lock);
			busy = 1;
			continue;
		}
		if (atomic_read(&inode->i_count)) {
			spin_unlock(&inode->i_lock);
			busy = 1;
			continue;
		}

		inode->i_state |= I_FREEING;
		inode_lru_list_del(inode);
		spin_unlock(&inode->i_lock);
		list_add(&inode->i_lru, &dispose);
	}
	spin_unlock(&inode_sb_list_lock);

	dispose_list(&dispose);

	return busy;
}

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  If the inode has metadata buffers attached to
 * mapping->private_list then try to remove them.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
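 *
 * The return value follows the list_lru walk callback protocol: LRU_SKIP
 * when the trylock on inode->i_lock fails, LRU_REMOVED when an inode has
 * been taken off the LRU (either freeable or still in use), LRU_ROTATE for
 * a recently referenced inode, and LRU_RETRY after the lru lock was dropped
 * to invalidate the inode's page cache.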
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
	struct list_head *freeable = arg;
	struct inode	*inode = container_of(item, struct inode, i_lru);

	/*
	 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&inode->i_lock))
		return LRU_SKIP;

	/*
	 * Referenced or dirty inodes are still in use. Give them another pass
	 * through the LRU as we cannot reclaim them now.
	 */
	if (atomic_read(&inode->i_count) ||
	    (inode->i_state & ~I_REFERENCED)) {
		list_lru_isolate(lru, &inode->i_lru);
		spin_unlock(&inode->i_lock);
		this_cpu_dec(nr_unused);
		return LRU_REMOVED;
	}

	/* recently referenced inodes get one more pass */
	if (inode->i_state & I_REFERENCED) {
		inode->i_state &= ~I_REFERENCED;
		spin_unlock(&inode->i_lock);
		return LRU_ROTATE;
	}

	if (inode_has_buffers(inode) || inode->i_data.nrpages) {
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(lru_lock);
		if (remove_inode_buffers(inode)) {
			unsigned long reap;
			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
			if (current_is_kswapd())
				__count_vm_events(KSWAPD_INODESTEAL, reap);
			else
				__count_vm_events(PGINODESTEAL, reap);
			if (current->reclaim_state)
				current->reclaim_state->reclaimed_slab += reap;
		}
		iput(inode);
		spin_lock(lru_lock);
		return LRU_RETRY;
	}

	WARN_ON(inode->i_state & I_NEW);
	inode->i_state |= I_FREEING;
	list_lru_isolate_move(lru, &inode->i_lru, freeable);
	spin_unlock(&inode->i_lock);

	this_cpu_dec(nr_unused);
	return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a number of inodes
 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
 * then are freed outside inode_lock by dispose_list().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
	LIST_HEAD(freeable);
	long freed;

	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
				     inode_lru_isolate, &freeable);
	dispose_list(&freeable);
	return freed;
}

static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Called with the inode_hash_lock held.
 */
static struct inode *find_inode(struct super_block *sb,
				struct hlist_head *head,
				int (*test)(struct inode *, void *),
				void *data)
{
	struct inode *inode = NULL;

repeat:
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_sb != sb)
			continue;
		if (!test(inode, data))
			continue;
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
			__wait_on_freeing_inode(inode);
			goto repeat;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		return inode;
	}
	return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
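 * Like find_inode(), it is called with inode_hash_lock held and may drop
 * and re-take it in __wait_on_freeing_inode().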
 */
static struct inode *find_inode_fast(struct super_block *sb,
				struct hlist_head *head, unsigned long ino)
{
	struct inode *inode = NULL;

repeat:
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_ino != ino)
			continue;
		if (inode->i_sb != sb)
			continue;
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
			__wait_on_freeing_inode(inode);
			goto repeat;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		return inode;
	}
	return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

unsigned int get_next_ino(void)
{
	unsigned int *p = &get_cpu_var(last_ino);
	unsigned int res = *p;

#ifdef CONFIG_SMP
	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
		static atomic_t shared_last_ino;
		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);

		res = next - LAST_INO_BATCH;
	}
#endif

	*p = ++res;
	put_cpu_var(last_ino);
	return res;
}
EXPORT_SYMBOL(get_next_ino);

/**
 * new_inode_pseudo - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for the given superblock.  The inode won't be
 * chained into the superblock's s_inodes list.  This means:
 * - the fs can't be unmounted
 * - quotas, fsnotify and writeback can't work on it
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
	struct inode *inode = alloc_inode(sb);

	if (inode) {
		spin_lock(&inode->i_lock);
		inode->i_state = 0;
		spin_unlock(&inode->i_lock);
		INIT_LIST_HEAD(&inode->i_sb_list);
	}
	return inode;
}

/**
 * new_inode - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for given superblock. The default gfp_mask
 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
 * If HIGHMEM pages are unsuitable or it is known that pages allocated
 * for the page cache are not reclaimable or migratable,
 * mapping_set_gfp_mask() must be called with suitable flags on the
 * newly created inode's mapping.
 *
 */
struct inode *new_inode(struct super_block *sb)
{
	struct inode *inode;

	spin_lock_prefetch(&inode_sb_list_lock);

	inode = new_inode_pseudo(sb);
	if (inode)
		inode_sb_list_add(inode);
	return inode;
}
EXPORT_SYMBOL(new_inode);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
	if (S_ISDIR(inode->i_mode)) {
		struct file_system_type *type = inode->i_sb->s_type;

		/* Set new key only if filesystem hasn't already changed it */
		if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) {
			/*
			 * ensure nobody is actually holding i_mutex
			 */
			mutex_destroy(&inode->i_mutex);
			mutex_init(&inode->i_mutex);
			lockdep_set_class(&inode->i_mutex,
					  &type->i_mutex_dir_key);
		}
	}
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif

/**
 * unlock_new_inode - clear the I_NEW state and wake up any waiters
 * @inode: new inode to unlock
 *
 * Called when the inode is fully initialised to clear the new state of the
 * inode and wake up anyone waiting for the inode to finish initialisation.
 */
void unlock_new_inode(struct inode *inode)
{
	lockdep_annotate_inode_mutex_key(inode);
	spin_lock(&inode->i_lock);
	WARN_ON(!(inode->i_state & I_NEW));
	inode->i_state &= ~I_NEW;
	smp_mb();
	wake_up_bit(&inode->i_state, __I_NEW);
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);

/**
 * lock_two_nondirectories - take two i_mutexes on non-directory objects
 *
 * Lock any non-NULL argument that is not a directory.
 * Zero, one or two objects may be locked by this function.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
	if (inode1 > inode2)
		swap(inode1, inode2);

	if (inode1 && !S_ISDIR(inode1->i_mode))
		mutex_lock(&inode1->i_mutex);
	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
	if (inode1 && !S_ISDIR(inode1->i_mode))
		mutex_unlock(&inode1->i_mutex);
	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
		mutex_unlock(&inode2->i_mutex);
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb: super block of file system
 * @hashval: hash value (usually inode number) to get
 * @test: callback used for comparisons between inodes
 * @set: callback used to initialize a new struct inode
 * @data: opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count.
 * This is a generalized version of iget_locked() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_hash_lock held, so can't
 * sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *),
		int (*set)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode;

	spin_lock(&inode_hash_lock);
	inode = find_inode(sb, head, test, data);
	spin_unlock(&inode_hash_lock);

	if (inode) {
		wait_on_inode(inode);
		return inode;
	}

	inode = alloc_inode(sb);
	if (inode) {
		struct inode *old;

		spin_lock(&inode_hash_lock);
		/* We released the lock, so.. */
		old = find_inode(sb, head, test, data);
		if (!old) {
			if (set(inode, data))
				goto set_failed;

			spin_lock(&inode->i_lock);
			inode->i_state = I_NEW;
			hlist_add_head(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			inode_sb_list_add(inode);
			spin_unlock(&inode_hash_lock);

			/* Return the locked inode with I_NEW set, the
			 * caller is responsible for filling in the contents
			 */
			return inode;
		}

		/*
		 * Uhhuh, somebody else created the same inode under
		 * us. Use the old inode instead of the one we just
		 * allocated.
		 */
		spin_unlock(&inode_hash_lock);
		destroy_inode(inode);
		inode = old;
		wait_on_inode(inode);
	}
	return inode;

set_failed:
	spin_unlock(&inode_hash_lock);
	destroy_inode(inode);
	return NULL;
}
EXPORT_SYMBOL(iget5_locked);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb: super block of file system
 * @ino: inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count. This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set. The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	spin_lock(&inode_hash_lock);
	inode = find_inode_fast(sb, head, ino);
	spin_unlock(&inode_hash_lock);
	if (inode) {
		wait_on_inode(inode);
		return inode;
	}

	inode = alloc_inode(sb);
	if (inode) {
		struct inode *old;

		spin_lock(&inode_hash_lock);
		/* We released the lock, so.. */
		old = find_inode_fast(sb, head, ino);
		if (!old) {
			inode->i_ino = ino;
			spin_lock(&inode->i_lock);
			inode->i_state = I_NEW;
			hlist_add_head(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			inode_sb_list_add(inode);
			spin_unlock(&inode_hash_lock);

			/* Return the locked inode with I_NEW set, the
			 * caller is responsible for filling in the contents
			 */
			return inode;
		}

		/*
		 * Uhhuh, somebody else created the same inode under
		 * us. Use the old inode instead of the one we just
		 * allocated.
		 */
		spin_unlock(&inode_hash_lock);
		destroy_inode(inode);
		inode = old;
		wait_on_inode(inode);
	}
	return inode;
}
EXPORT_SYMBOL(iget_locked);

/*
 * search the inode cache for a matching inode number.
 * If we find one, then the inode number we are trying to
 * allocate is not unique and so we should not use it.
 *
 * Returns 1 if the inode number is unique, 0 if it is not.
 */
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *b = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	spin_lock(&inode_hash_lock);
	hlist_for_each_entry(inode, b, i_hash) {
		if (inode->i_ino == ino && inode->i_sb == sb) {
			spin_unlock(&inode_hash_lock);
			return 0;
		}
	}
	spin_unlock(&inode_hash_lock);

	return 1;
}

/**
 * iunique - get a unique inode number
 * @sb: superblock
 * @max_reserved: highest reserved inode number
 *
 * Obtain an inode number that is unique on the system for a given
 * superblock. This is used by file systems that have no natural
 * permanent inode numbering system. An inode number is returned that
 * is higher than the reserved limit but unique.
 *
 * BUGS:
 * With a large number of inodes live on the file system this function
 * currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
	/*
	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
	 * error if st_ino won't fit in target struct field. Use 32bit counter
	 * here to attempt to avoid that.
	 */
	static DEFINE_SPINLOCK(iunique_lock);
	static unsigned int counter;
	ino_t res;

	spin_lock(&iunique_lock);
	do {
		if (counter <= max_reserved)
			counter = max_reserved + 1;
		res = counter++;
	} while (!test_inode_iunique(sb, res));
	spin_unlock(&iunique_lock);

	return res;
}
EXPORT_SYMBOL(iunique);

struct inode *igrab(struct inode *inode)
{
	spin_lock(&inode->i_lock);
	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
		__iget(inode);
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		/*
		 * Handle the case where s_op->clear_inode has not been
		 * called yet, and somebody is calling igrab
		 * while the inode is getting freed.
		 */
		inode = NULL;
	}
	return inode;
}
EXPORT_SYMBOL(igrab);

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @hashval: hash value (usually inode number) to search for
 * @test: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache.
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode.  You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode;

	spin_lock(&inode_hash_lock);
	inode = find_inode(sb, head, test, data);
	spin_unlock(&inode_hash_lock);

	return inode;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @hashval: hash value (usually inode number) to search for
 * @test: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return the inode with an incremented
 * reference count.  Waits on I_NEW before returning the inode.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct inode *inode = ilookup5_nowait(sb, hashval, test, data);

	if (inode)
		wait_on_inode(inode);
	return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @ino: inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	spin_lock(&inode_hash_lock);
	inode = find_inode_fast(sb, head, ino);
	spin_unlock(&inode_hash_lock);

	if (inode)
		wait_on_inode(inode);
	return inode;
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb: super block of file system to search
 * @hashval: hash value (usually inode number) to search for
 * @match: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped.  The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1.  It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is an even more generalized version of ilookup5() when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller can not increment
 * the reference count because the resulting iput() might cause an
 * inode eviction.  The tradeoff is that the @match function must be
 * very carefully implemented.
 */
struct inode *find_inode_nowait(struct super_block *sb,
				unsigned long hashval,
				int (*match)(struct inode *, unsigned long,
					     void *),
				void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode, *ret_inode = NULL;
	int mval;

	spin_lock(&inode_hash_lock);
	hlist_for_each_entry(inode, head, i_hash) {
		if (inode->i_sb != sb)
			continue;
		mval = match(inode, hashval, data);
		if (mval == 0)
			continue;
		if (mval == 1)
			ret_inode = inode;
		goto out;
	}
out:
	spin_unlock(&inode_hash_lock);
	return ret_inode;
}
EXPORT_SYMBOL(find_inode_nowait);

int insert_inode_locked(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	ino_t ino = inode->i_ino;
	struct hlist_head *head = inode_hashtable + hash(sb, ino);

	while (1) {
		struct inode *old = NULL;
		spin_lock(&inode_hash_lock);
		hlist_for_each_entry(old, head, i_hash) {
			if (old->i_ino != ino)
				continue;
			if (old->i_sb != sb)
				continue;
			spin_lock(&old->i_lock);
			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
				spin_unlock(&old->i_lock);
				continue;
			}
			break;
		}
		if (likely(!old)) {
			spin_lock(&inode->i_lock);
			inode->i_state |= I_NEW;
			hlist_add_head(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			spin_unlock(&inode_hash_lock);
			return 0;
		}
		__iget(old);
		spin_unlock(&old->i_lock);
		spin_unlock(&inode_hash_lock);
		wait_on_inode(old);
		if (unlikely(!inode_unhashed(old))) {
			iput(old);
			return -EBUSY;
		}
		iput(old);
	}
}
EXPORT_SYMBOL(insert_inode_locked);

int insert_inode_locked4(struct inode *inode, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct super_block *sb = inode->i_sb;
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);

	while (1) {
		struct inode *old = NULL;

		spin_lock(&inode_hash_lock);
		hlist_for_each_entry(old, head, i_hash) {
			if (old->i_sb != sb)
				continue;
			if (!test(old, data))
				continue;
			spin_lock(&old->i_lock);
			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
				spin_unlock(&old->i_lock);
				continue;
			}
			break;
		}
		if (likely(!old)) {
			spin_lock(&inode->i_lock);
			inode->i_state |= I_NEW;
			hlist_add_head(&inode->i_hash, head);
			spin_unlock(&inode->i_lock);
			spin_unlock(&inode_hash_lock);
			return 0;
		}
		__iget(old);
		spin_unlock(&old->i_lock);
		spin_unlock(&inode_hash_lock);
		wait_on_inode(old);
		if (unlikely(!inode_unhashed(old))) {
			iput(old);
			return -EBUSY;
		}
		iput(old);
	}
}
EXPORT_SYMBOL(insert_inode_locked4);

int generic_delete_inode(struct inode *inode)
{
	return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
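 * Expects inode->i_lock to be held; it is dropped before returning.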
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict the inode, do so.  Otherwise, retain the
 * inode in cache if the fs is alive, or sync and evict
 * it if the fs is shutting down.
 */
static void iput_final(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	const struct super_operations *op = inode->i_sb->s_op;
	int drop;

	WARN_ON(inode->i_state & I_NEW);

	if (op->drop_inode)
		drop = op->drop_inode(inode);
	else
		drop = generic_drop_inode(inode);

	if (!drop && (sb->s_flags & MS_ACTIVE)) {
		inode->i_state |= I_REFERENCED;
		inode_add_lru(inode);
		spin_unlock(&inode->i_lock);
		return;
	}

	if (!drop) {
		inode->i_state |= I_WILL_FREE;
		spin_unlock(&inode->i_lock);
		write_inode_now(inode, 1);
		spin_lock(&inode->i_lock);
		WARN_ON(inode->i_state & I_NEW);
		inode->i_state &= ~I_WILL_FREE;
	}

	inode->i_state |= I_FREEING;
	if (!list_empty(&inode->i_lru))
		inode_lru_list_del(inode);
	spin_unlock(&inode->i_lock);

	evict(inode);
}

/**
 * iput - put an inode
 * @inode: inode to put
 *
 * Puts an inode, dropping its usage count. If the inode use count hits
 * zero, the inode is then freed and may also be destroyed.
 *
 * Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
	if (!inode)
		return;
	BUG_ON(inode->i_state & I_CLEAR);
retry:
	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
			atomic_inc(&inode->i_count);
			inode->i_state &= ~I_DIRTY_TIME;
			spin_unlock(&inode->i_lock);
			trace_writeback_lazytime_iput(inode);
			mark_inode_dirty_sync(inode);
			goto retry;
		}
		iput_final(inode);
	}
}
EXPORT_SYMBOL(iput);

/**
 * bmap - find a block number in a file
 * @inode: inode of file
 * @block: block to find
 *
 * Returns the block number on the device holding the inode that
 * is the disk block number for the block of the file requested.
 * That is, asked for block 4 of inode 1 the function will return the
 * disk block relative to the disk start that holds that block of the
 * file.
 */
sector_t bmap(struct inode *inode, sector_t block)
{
	sector_t res = 0;
	if (inode->i_mapping->a_ops->bmap)
		res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
	return res;
}
EXPORT_SYMBOL(bmap);

/*
 * With relative atime, only update atime if the previous atime is
 * earlier than either the ctime or mtime or if at least a day has
 * passed since the last atime update.
 */
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
				struct timespec now)
{
	if (!(mnt->mnt_flags & MNT_RELATIME))
		return 1;
	/*
	 * Is mtime younger than atime? If yes, update atime:
	 */
	if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
		return 1;
	/*
	 * Is ctime younger than atime? If yes, update atime:
	 */
	if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
		return 1;

	/*
	 * Is the previous atime value older than a day?
	 * If yes, update atime:
	 */
	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
		return 1;
	/*
	 * Good, we can skip the atime update:
	 */
	return 0;
}

int generic_update_time(struct inode *inode, struct timespec *time, int flags)
{
	int iflags = I_DIRTY_TIME;

	if (flags & S_ATIME)
		inode->i_atime = *time;
	if (flags & S_VERSION)
		inode_inc_iversion(inode);
	if (flags & S_CTIME)
		inode->i_ctime = *time;
	if (flags & S_MTIME)
		inode->i_mtime = *time;

	if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
		iflags |= I_DIRTY_SYNC;
	__mark_inode_dirty(inode, iflags);
	return 0;
}
EXPORT_SYMBOL(generic_update_time);

/*
 * This does the actual work of updating an inode's time or version.  The
 * caller must have called mnt_want_write() before calling this.
 */
static int update_time(struct inode *inode, struct timespec *time, int flags)
{
	int (*update_time)(struct inode *, struct timespec *, int);

	update_time = inode->i_op->update_time ? inode->i_op->update_time :
						 generic_update_time;

	return update_time(inode, time, flags);
}

/**
 * touch_atime - update the access time
 * @path: the &struct path to update
 *
 * Update the accessed time on an inode and mark it for writeback.
 * This function automatically handles read only file systems and media,
 * as well as the "noatime" flag and inode specific "noatime" markers.
 */
void touch_atime(const struct path *path)
{
	struct vfsmount *mnt = path->mnt;
	struct inode *inode = d_inode(path->dentry);
	struct timespec now;

	if (inode->i_flags & S_NOATIME)
		return;
	if (IS_NOATIME(inode))
		return;
	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
		return;

	if (mnt->mnt_flags & MNT_NOATIME)
		return;
	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
		return;

	now = current_fs_time(inode->i_sb);

	if (!relatime_need_update(mnt, inode, now))
		return;

	if (timespec_equal(&inode->i_atime, &now))
		return;

	if (!sb_start_write_trylock(inode->i_sb))
		return;

	if (__mnt_want_write(mnt))
		goto skip_update;
	/*
	 * File systems can error out when updating inodes if they need to
	 * allocate new space to modify an inode (such is the case for
	 * Btrfs), but since we touch atime while walking down the path we
	 * really don't care if we failed to update the atime of the file,
	 * so just ignore the return value.
	 * We may also fail on filesystems that have the ability to make parts
	 * of the fs read only, e.g. subvolumes in Btrfs.
	 */
	update_time(inode, &now, S_ATIME);
	__mnt_drop_write(mnt);
skip_update:
	sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);

/*
 * The logic we want is
 *
 *	if suid or (sgid and xgrp)
 *		remove privs
 */
int should_remove_suid(struct dentry *dentry)
{
	umode_t mode = d_inode(dentry)->i_mode;
	int kill = 0;

	/* suid always must be killed */
	if (unlikely(mode & S_ISUID))
		kill = ATTR_KILL_SUID;

	/*
	 * sgid without any exec bits is just a mandatory locking mark; leave
	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
	 */
	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
		kill |= ATTR_KILL_SGID;

	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
		return kill;

	return 0;
}
EXPORT_SYMBOL(should_remove_suid);

static int __remove_suid(struct dentry *dentry, int kill)
{
	struct iattr newattrs;

	newattrs.ia_valid = ATTR_FORCE | kill;
	/*
	 * Note we call this on write, so notify_change will not
	 * encounter any conflicting delegations:
	 */
	return notify_change(dentry, &newattrs, NULL);
}

int file_remove_suid(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = d_inode(dentry);
	int killsuid;
	int killpriv;
	int error = 0;

	/* Fast path for nothing security related */
	if (IS_NOSEC(inode))
		return 0;

	killsuid = should_remove_suid(dentry);
	killpriv = security_inode_need_killpriv(dentry);

	if (killpriv < 0)
		return killpriv;
	if (killpriv)
		error = security_inode_killpriv(dentry);
	if (!error && killsuid)
		error = __remove_suid(dentry, killsuid);
	if (!error && (inode->i_sb->s_flags & MS_NOSEC))
		inode->i_flags |= S_NOSEC;

	return error;
}
EXPORT_SYMBOL(file_remove_suid);

/**
 * file_update_time - update mtime and ctime
 * @file: file accessed
 *
 * Update the mtime and ctime members of an inode and mark the inode
 * for writeback.  Note that this function is meant exclusively for
 * usage in the file write path of filesystems, and filesystems may
 * choose to explicitly ignore updates via this function with the
 * S_NOCMTIME inode flag, e.g. for network filesystems where these
 * timestamps are handled by the server.  This can return an error for
 * file systems that need to allocate space in order to update an inode.
 */
int file_update_time(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct timespec now;
	int sync_it = 0;
	int ret;

	/* First try to exhaust all avenues to not sync */
	if (IS_NOCMTIME(inode))
		return 0;

	now = current_fs_time(inode->i_sb);
	if (!timespec_equal(&inode->i_mtime, &now))
		sync_it = S_MTIME;

	if (!timespec_equal(&inode->i_ctime, &now))
		sync_it |= S_CTIME;

	if (IS_I_VERSION(inode))
		sync_it |= S_VERSION;

	if (!sync_it)
		return 0;

	/* Finally allowed to write? Takes lock. */
	if (__mnt_want_write_file(file))
		return 0;

	ret = update_time(inode, &now, sync_it);
	__mnt_drop_write_file(file);

	return ret;
}
EXPORT_SYMBOL(file_update_time);

int inode_needs_sync(struct inode *inode)
{
	if (IS_SYNC(inode))
		return 1;
	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
		return 1;
	return 0;
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found.  This function waits
 * until the deletion _might_ have completed.  Callers are responsible
 * to recheck inode state.
 *
 * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
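 *
 * Note that both inode->i_lock and inode_hash_lock are dropped here and
 * only inode_hash_lock is re-taken before returning, which is why callers
 * restart their hash walk afterwards.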
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
	wq = bit_waitqueue(&inode->i_state, __I_NEW);
	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_hash_lock);
	schedule();
	finish_wait(wq, &wait.wait);
	spin_lock(&inode_hash_lock);
}

static __initdata unsigned long ihash_entries;
static int __init set_ihash_entries(char *str)
{
	if (!str)
		return 0;
	ihash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("ihash_entries=", set_ihash_entries);

/*
 * Initialize the waitqueues and inode hash table.
 */
void __init inode_init_early(void)
{
	unsigned int loop;

	/* If hashes are distributed across NUMA nodes, defer
	 * hash allocation until vmalloc space is available.
	 */
	if (hashdist)
		return;

	inode_hashtable =
		alloc_large_system_hash("Inode-cache",
					sizeof(struct hlist_head),
					ihash_entries,
					14,
					HASH_EARLY,
					&i_hash_shift,
					&i_hash_mask,
					0,
					0);

	for (loop = 0; loop < (1U << i_hash_shift); loop++)
		INIT_HLIST_HEAD(&inode_hashtable[loop]);
}

void __init inode_init(void)
{
	unsigned int loop;

	/* inode slab cache */
	inode_cachep = kmem_cache_create("inode_cache",
					 sizeof(struct inode),
					 0,
					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
					 SLAB_MEM_SPREAD),
					 init_once);

	/* Hash may have been set up in inode_init_early */
	if (!hashdist)
		return;

	inode_hashtable =
		alloc_large_system_hash("Inode-cache",
					sizeof(struct hlist_head),
					ihash_entries,
					14,
					0,
					&i_hash_shift,
					&i_hash_mask,
					0,
					0);

	for (loop = 0; loop < (1U << i_hash_shift); loop++)
		INIT_HLIST_HEAD(&inode_hashtable[loop]);
}

void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
	inode->i_mode = mode;
	if (S_ISCHR(mode)) {
		inode->i_fop = &def_chr_fops;
		inode->i_rdev = rdev;
	} else if (S_ISBLK(mode)) {
		inode->i_fop = &def_blk_fops;
		inode->i_rdev = rdev;
	} else if (S_ISFIFO(mode))
		inode->i_fop = &pipefifo_fops;
	else if (S_ISSOCK(mode))
		;	/* leave it no_open_fops */
	else
		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
				  inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);

/**
 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
 * @inode: New inode
 * @dir: Directory inode
 * @mode: mode of the new inode
 */
void inode_init_owner(struct inode *inode, const struct inode *dir,
			umode_t mode)
{
	inode->i_uid = current_fsuid();
	if (dir && dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			mode |= S_ISGID;
	} else
		inode->i_gid = current_fsgid();
	inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);

/**
 * inode_owner_or_capable - check current task permissions to inode
 * @inode: inode being checked
 *
 * Return true if current either has CAP_FOWNER in a namespace with the
 * inode owner uid mapped, or owns the file.
 */
bool inode_owner_or_capable(const struct inode *inode)
{
	struct user_namespace *ns;

	if (uid_eq(current_fsuid(), inode->i_uid))
		return true;

	ns = current_user_ns();
	if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid))
		return true;
	return false;
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */
static void __inode_dio_wait(struct inode *inode)
{
	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

	do {
		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&inode->i_dio_count))
			schedule();
	} while (atomic_read(&inode->i_dio_count));
	finish_wait(wq, &q.wait);
}

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Waits for all pending direct I/O requests to finish so that we can
 * proceed with a truncate or equivalent operation.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 */
void inode_dio_wait(struct inode *inode)
{
	if (atomic_read(&inode->i_dio_count))
		__inode_dio_wait(inode);
}
EXPORT_SYMBOL(inode_dio_wait);

/*
 * inode_set_flags - atomically set some inode flags
 *
 * Note: the caller should be holding i_mutex, or else be sure that
 * they have exclusive access to the inode structure (i.e., while the
 * inode is being instantiated).  The reason for the cmpxchg() loop
 * --- which wouldn't be necessary if all code paths which modify
 * i_flags actually followed this rule, is that there is at least one
 * code path which doesn't today --- for example,
 * __generic_file_aio_write() calls file_remove_suid() without holding
 * i_mutex --- so we use cmpxchg() out of an abundance of caution.
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
 */
void inode_set_flags(struct inode *inode, unsigned int flags,
		     unsigned int mask)
{
	unsigned int old_flags, new_flags;

	WARN_ON_ONCE(flags & ~mask);
	do {
		old_flags = ACCESS_ONCE(inode->i_flags);
		new_flags = (old_flags & ~mask) | flags;
	} while (unlikely(cmpxchg(&inode->i_flags, old_flags,
				  new_flags) != old_flags));
}
EXPORT_SYMBOL(inode_set_flags);