/*
 * linux/fs/inode.c
 *
 * (C) 1997 Linus Torvalds
 */

#include <linux/config.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/dcache.h>
#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/writeback.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/pagemap.h>
#include <linux/cdev.h>
#include <linux/bootmem.h>

/*
 * This is needed for the following functions:
 *  - inode_has_buffers
 *  - invalidate_inode_buffers
 *  - invalidate_bdev
 *
 * FIXME: remove all knowledge of the buffer layer from this file
 */
#include <linux/buffer_head.h>

/*
 * New inode.c implementation.
 *
 * This implementation has the basic premise of trying
 * to be extremely low-overhead and SMP-safe, yet be
 * simple enough to be "obviously correct".
 *
 * Famous last words.
 */

/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */

/* #define INODE_PARANOIA 1 */
/* #define INODE_DEBUG 1 */

/*
 * Inode lookup is no longer as critical as it used to be:
 * most of the lookups are going to be through the dcache.
 */
#define I_HASHBITS	i_hash_shift
#define I_HASHMASK	i_hash_mask

static unsigned int i_hash_mask;
static unsigned int i_hash_shift;

/*
 * Each inode can be on two separate lists. One is
 * the hash list of the inode, used for lookups. The
 * other linked list is the "type" list:
 *  "in_use" - valid inode, i_count > 0, i_nlink > 0
 *  "dirty"  - as "in_use" but also dirty
 *  "unused" - valid inode, i_count = 0
 *
 * A "dirty" list is maintained for each super block,
 * allowing for low-overhead inode sync() operations.
 */

LIST_HEAD(inode_in_use);
LIST_HEAD(inode_unused);
static struct hlist_head *inode_hashtable;

/*
 * A simple spinlock to protect the list manipulations.
 *
 * NOTE! You also have to own the lock if you change
 * the i_state of an inode while it is in use..
 */
DEFINE_SPINLOCK(inode_lock);

/*
 * iprune_sem provides exclusion between the kswapd or try_to_free_pages
 * icache shrinking path, and the umount path.  Without this exclusion,
 * by the time prune_icache calls iput for the inode whose pages it has
 * been invalidating, or by the time it calls clear_inode & destroy_inode
 * from its final dispose_list, the struct super_block they refer to
 * (for inode->i_sb->s_op) may already have been freed and reused.
 */
DECLARE_MUTEX(iprune_sem);

/*
 * Statistics gathering..
 */
struct inodes_stat_t inodes_stat;

static kmem_cache_t * inode_cachep;

static struct inode *alloc_inode(struct super_block *sb)
{
        static struct address_space_operations empty_aops;
        static struct inode_operations empty_iops;
        static struct file_operations empty_fops;
        struct inode *inode;

        if (sb->s_op->alloc_inode)
                inode = sb->s_op->alloc_inode(sb);
        else
                inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);

        if (inode) {
                struct address_space * const mapping = &inode->i_data;

                inode->i_sb = sb;
                inode->i_blkbits = sb->s_blocksize_bits;
                inode->i_flags = 0;
                atomic_set(&inode->i_count, 1);
                inode->i_op = &empty_iops;
                inode->i_fop = &empty_fops;
                inode->i_nlink = 1;
                atomic_set(&inode->i_writecount, 0);
                inode->i_size = 0;
                inode->i_blocks = 0;
                inode->i_bytes = 0;
                inode->i_generation = 0;
#ifdef CONFIG_QUOTA
                memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
#endif
                inode->i_pipe = NULL;
                inode->i_bdev = NULL;
                inode->i_cdev = NULL;
                inode->i_rdev = 0;
                inode->i_security = NULL;
                inode->dirtied_when = 0;
                if (security_inode_alloc(inode)) {
                        if (inode->i_sb->s_op->destroy_inode)
                                inode->i_sb->s_op->destroy_inode(inode);
                        else
                                kmem_cache_free(inode_cachep, (inode));
                        return NULL;
                }

                mapping->a_ops = &empty_aops;
                mapping->host = inode;
                mapping->flags = 0;
                mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
                mapping->assoc_mapping = NULL;
                mapping->backing_dev_info = &default_backing_dev_info;

                /*
                 * If the block_device provides a backing_dev_info for client
                 * inodes then use that.  Otherwise the inode shares the bdev's
                 * backing_dev_info.
                 */
                if (sb->s_bdev) {
                        struct backing_dev_info *bdi;

                        bdi = sb->s_bdev->bd_inode_backing_dev_info;
                        if (!bdi)
                                bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
                        mapping->backing_dev_info = bdi;
                }
                memset(&inode->u, 0, sizeof(inode->u));
                inode->i_mapping = mapping;
        }
        return inode;
}

void destroy_inode(struct inode *inode)
{
        if (inode_has_buffers(inode))
                BUG();
        security_inode_free(inode);
        if (inode->i_sb->s_op->destroy_inode)
                inode->i_sb->s_op->destroy_inode(inode);
        else
                kmem_cache_free(inode_cachep, (inode));
}
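
/*
 * Example: a minimal sketch of how a filesystem can provide ->alloc_inode
 * and ->destroy_inode so that its private per-inode data lives in one
 * allocation with the VFS inode.  The myfs_* names are hypothetical; a real
 * filesystem would normally back this with its own kmem cache rather than
 * kmalloc:
 *
 *        struct myfs_inode_info {
 *                unsigned long   i_flags;
 *                struct inode    vfs_inode;
 *        };
 *
 *        static struct inode *myfs_alloc_inode(struct super_block *sb)
 *        {
 *                struct myfs_inode_info *mi;
 *
 *                mi = kmalloc(sizeof(*mi), GFP_KERNEL);
 *                if (!mi)
 *                        return NULL;
 *                mi->i_flags = 0;
 *                return &mi->vfs_inode;
 *        }
 *
 *        static void myfs_destroy_inode(struct inode *inode)
 *        {
 *                kfree(container_of(inode, struct myfs_inode_info, vfs_inode));
 *        }
 *
 * With these hooked into the filesystem's super_operations, alloc_inode()
 * above skips the inode_cachep allocation and destroy_inode() calls back
 * into the filesystem instead of kmem_cache_free().
 */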
/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so let the slab cache be aware of that.
 */
void inode_init_once(struct inode *inode)
{
        memset(inode, 0, sizeof(*inode));
        INIT_HLIST_NODE(&inode->i_hash);
        INIT_LIST_HEAD(&inode->i_dentry);
        INIT_LIST_HEAD(&inode->i_devices);
        sema_init(&inode->i_sem, 1);
        init_rwsem(&inode->i_alloc_sem);
        INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
        rwlock_init(&inode->i_data.tree_lock);
        spin_lock_init(&inode->i_data.i_mmap_lock);
        INIT_LIST_HEAD(&inode->i_data.private_list);
        spin_lock_init(&inode->i_data.private_lock);
        INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
        INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
        spin_lock_init(&inode->i_lock);
        i_size_ordered_init(inode);
}

EXPORT_SYMBOL(inode_init_once);

static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
{
        struct inode * inode = (struct inode *) foo;

        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
            SLAB_CTOR_CONSTRUCTOR)
                inode_init_once(inode);
}

/*
 * inode_lock must be held
 */
void __iget(struct inode * inode)
{
        if (atomic_read(&inode->i_count)) {
                atomic_inc(&inode->i_count);
                return;
        }
        atomic_inc(&inode->i_count);
        if (!(inode->i_state & (I_DIRTY|I_LOCK)))
                list_move(&inode->i_list, &inode_in_use);
        inodes_stat.nr_unused--;
}

/**
 * clear_inode - clear an inode
 * @inode: inode to clear
 *
 * This is called by the filesystem to tell us
 * that the inode is no longer useful. We just
 * terminate it with extreme prejudice.
 */
void clear_inode(struct inode *inode)
{
        might_sleep();
        invalidate_inode_buffers(inode);

        if (inode->i_data.nrpages)
                BUG();
        if (!(inode->i_state & I_FREEING))
                BUG();
        if (inode->i_state & I_CLEAR)
                BUG();
        wait_on_inode(inode);
        DQUOT_DROP(inode);
        if (inode->i_sb && inode->i_sb->s_op->clear_inode)
                inode->i_sb->s_op->clear_inode(inode);
        if (inode->i_bdev)
                bd_forget(inode);
        if (inode->i_cdev)
                cd_forget(inode);
        inode->i_state = I_CLEAR;
}

EXPORT_SYMBOL(clear_inode);

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * Dispose-list gets a local list with local inodes in it, so it doesn't
 * need to worry about list corruption and SMP locks.
 */
static void dispose_list(struct list_head *head)
{
        int nr_disposed = 0;

        while (!list_empty(head)) {
                struct inode *inode;

                inode = list_entry(head->next, struct inode, i_list);
                list_del(&inode->i_list);

                if (inode->i_data.nrpages)
                        truncate_inode_pages(&inode->i_data, 0);
                clear_inode(inode);
                destroy_inode(inode);
                nr_disposed++;
        }
        spin_lock(&inode_lock);
        inodes_stat.nr_inodes -= nr_disposed;
        spin_unlock(&inode_lock);
}

/*
 * Invalidate all inodes for a device.
 */
static int invalidate_list(struct list_head *head, struct list_head *dispose)
{
        struct list_head *next;
        int busy = 0, count = 0;

        next = head->next;
        for (;;) {
                struct list_head * tmp = next;
                struct inode * inode;

                /*
                 * We can reschedule here without worrying about the list's
                 * consistency because the per-sb list of inodes must not
                 * change during umount anymore, and because iprune_sem keeps
                 * shrink_icache_memory() away.
                 */
                cond_resched_lock(&inode_lock);

                next = next->next;
                if (tmp == head)
                        break;
                inode = list_entry(tmp, struct inode, i_sb_list);
                invalidate_inode_buffers(inode);
                if (!atomic_read(&inode->i_count)) {
                        hlist_del_init(&inode->i_hash);
                        list_del(&inode->i_sb_list);
                        list_move(&inode->i_list, dispose);
                        inode->i_state |= I_FREEING;
                        count++;
                        continue;
                }
                busy = 1;
        }
        /* only unused inodes may be cached with i_count zero */
        inodes_stat.nr_unused -= count;
        return busy;
}

/**
 * invalidate_inodes - discard the inodes on a device
 * @sb: superblock
 *
 * Discard all of the inodes for a given superblock. If the discard
 * fails because there are busy inodes then a non-zero value is returned.
 * If the discard is successful all the inodes have been discarded.
 */
int invalidate_inodes(struct super_block * sb)
{
        int busy;
        LIST_HEAD(throw_away);

        down(&iprune_sem);
        spin_lock(&inode_lock);
        busy = invalidate_list(&sb->s_inodes, &throw_away);
        spin_unlock(&inode_lock);

        dispose_list(&throw_away);
        up(&iprune_sem);

        return busy;
}

EXPORT_SYMBOL(invalidate_inodes);

int __invalidate_device(struct block_device *bdev)
{
        struct super_block *sb = get_super(bdev);
        int res = 0;

        if (sb) {
                /*
                 * no need to lock the super, get_super holds the
                 * read semaphore so the filesystem cannot go away
                 * under us (->put_super runs with the write lock
                 * held).
                 */
                shrink_dcache_sb(sb);
                res = invalidate_inodes(sb);
                drop_super(sb);
        }
        invalidate_bdev(bdev, 0);
        return res;
}
EXPORT_SYMBOL(__invalidate_device);
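
/*
 * Example: typical call sites for the two helpers above, shown as a hedged
 * sketch.  The umount path discards every cached inode for a superblock and
 * treats a non-zero return as "still busy"; a removable-media driver can use
 * the block_device based wrapper when it detects a media change:
 *
 *        if (invalidate_inodes(sb))
 *                printk(KERN_WARNING "VFS: busy inodes on %s\n", sb->s_id);
 *
 *        if (__invalidate_device(bdev))
 *                printk(KERN_WARNING "bdev: busy inodes after media change\n");
 *
 * Both calls can sleep: they take iprune_sem and dispose of the unused
 * inodes outside inode_lock.
 */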
static int can_unuse(struct inode *inode)
{
        if (inode->i_state)
                return 0;
        if (inode_has_buffers(inode))
                return 0;
        if (atomic_read(&inode->i_count))
                return 0;
        if (inode->i_data.nrpages)
                return 0;
        return 1;
}

/*
 * Scan `nr_to_scan' inodes on the unused list for freeable ones. They are
 * moved to a temporary list and then are freed outside inode_lock by
 * dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  We expect the final iput() on that inode to add it to
 * the front of the inode_unused list.  So look for it there and if the
 * inode is still freeable, proceed.  The right inode is found 99.9% of the
 * time in testing on a 4-way.
 *
 * If the inode has metadata buffers attached to mapping->private_list then
 * try to remove them.
 */
static void prune_icache(int nr_to_scan)
{
        LIST_HEAD(freeable);
        int nr_pruned = 0;
        int nr_scanned;
        unsigned long reap = 0;

        down(&iprune_sem);
        spin_lock(&inode_lock);
        for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
                struct inode *inode;

                if (list_empty(&inode_unused))
                        break;

                inode = list_entry(inode_unused.prev, struct inode, i_list);

                if (inode->i_state || atomic_read(&inode->i_count)) {
                        list_move(&inode->i_list, &inode_unused);
                        continue;
                }
                if (inode_has_buffers(inode) || inode->i_data.nrpages) {
                        __iget(inode);
                        spin_unlock(&inode_lock);
                        if (remove_inode_buffers(inode))
                                reap += invalidate_inode_pages(&inode->i_data);
                        iput(inode);
                        spin_lock(&inode_lock);

                        if (inode != list_entry(inode_unused.next,
                                                struct inode, i_list))
                                continue;       /* wrong inode or list_empty */
                        if (!can_unuse(inode))
                                continue;
                }
                hlist_del_init(&inode->i_hash);
                list_del_init(&inode->i_sb_list);
                list_move(&inode->i_list, &freeable);
                inode->i_state |= I_FREEING;
                nr_pruned++;
        }
        inodes_stat.nr_unused -= nr_pruned;
        spin_unlock(&inode_lock);

        dispose_list(&freeable);
        up(&iprune_sem);

        if (current_is_kswapd())
                mod_page_state(kswapd_inodesteal, reap);
        else
                mod_page_state(pginodesteal, reap);
}

/*
 * shrink_icache_memory() will attempt to reclaim some unused inodes.  Here,
 * "unused" means that no dentries are referring to the inodes: the files are
 * not open and the dcache references to those inodes have already been
 * reclaimed.
 *
 * This function is passed the number of inodes to scan, and it returns the
 * total number of remaining possibly-reclaimable inodes.
 */
static int shrink_icache_memory(int nr, unsigned int gfp_mask)
{
        if (nr) {
                /*
                 * Nasty deadlock avoidance.  We may hold various FS locks,
                 * and we don't want to recurse into the FS that called us
                 * in clear_inode() and friends..
                 */
                if (!(gfp_mask & __GFP_FS))
                        return -1;
                prune_icache(nr);
        }
        return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}

static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Called with the inode lock held.
 * NOTE: we are not increasing the inode-refcount, you must call __iget()
 * by hand after calling find_inode now! This simplifies iunique and won't
 * add any additional branch in the common code.
 */
static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
{
        struct hlist_node *node;
        struct inode * inode = NULL;

repeat:
        hlist_for_each (node, head) {
                inode = hlist_entry(node, struct inode, i_hash);
                if (inode->i_sb != sb)
                        continue;
                if (!test(inode, data))
                        continue;
                if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                break;
        }
        return node ? inode : NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 */
static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
{
        struct hlist_node *node;
        struct inode * inode = NULL;

repeat:
        hlist_for_each (node, head) {
                inode = hlist_entry(node, struct inode, i_hash);
                if (inode->i_ino != ino)
                        continue;
                if (inode->i_sb != sb)
                        continue;
                if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
                        __wait_on_freeing_inode(inode);
                        goto repeat;
                }
                break;
        }
        return node ? inode : NULL;
}

/**
 * new_inode - obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for the given superblock.
 */
struct inode *new_inode(struct super_block *sb)
{
        static unsigned long last_ino;
        struct inode * inode;

        spin_lock_prefetch(&inode_lock);

        inode = alloc_inode(sb);
        if (inode) {
                spin_lock(&inode_lock);
                inodes_stat.nr_inodes++;
                list_add(&inode->i_list, &inode_in_use);
                list_add(&inode->i_sb_list, &sb->s_inodes);
                inode->i_ino = ++last_ino;
                inode->i_state = 0;
                spin_unlock(&inode_lock);
        }
        return inode;
}

EXPORT_SYMBOL(new_inode);

void unlock_new_inode(struct inode *inode)
{
        /*
         * This is special!  We do not need the spinlock
         * when clearing I_LOCK, because we're guaranteed
         * that nobody else tries to do anything about the
         * state of the inode when it is locked, as we
         * just created it (so there can be no old holders
         * that haven't tested I_LOCK).
         */
        inode->i_state &= ~(I_LOCK|I_NEW);
        wake_up_inode(inode);
}

EXPORT_SYMBOL(unlock_new_inode);

/*
 * This is called without the inode lock held.. Be careful.
 *
 * We no longer cache the sb_flags in i_flags - see fs.h
 *	-- rmk@arm.uk.linux.org
 */
static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
{
        struct inode * inode;

        inode = alloc_inode(sb);
        if (inode) {
                struct inode * old;

                spin_lock(&inode_lock);
                /* We released the lock, so.. */
                old = find_inode(sb, head, test, data);
                if (!old) {
                        if (set(inode, data))
                                goto set_failed;

                        inodes_stat.nr_inodes++;
                        list_add(&inode->i_list, &inode_in_use);
                        list_add(&inode->i_sb_list, &sb->s_inodes);
                        hlist_add_head(&inode->i_hash, head);
                        inode->i_state = I_LOCK|I_NEW;
                        spin_unlock(&inode_lock);

                        /* Return the locked inode with I_NEW set, the
                         * caller is responsible for filling in the contents
                         */
                        return inode;
                }

                /*
                 * Uhhuh, somebody else created the same inode under
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
                __iget(old);
                spin_unlock(&inode_lock);
                destroy_inode(inode);
                inode = old;
                wait_on_inode(inode);
        }
        return inode;

set_failed:
        spin_unlock(&inode_lock);
        destroy_inode(inode);
        return NULL;
}

/*
 * get_new_inode_fast is the fast path version of get_new_inode, see the
 * comment at iget_locked for details.
 */
static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
{
        struct inode * inode;

        inode = alloc_inode(sb);
        if (inode) {
                struct inode * old;

                spin_lock(&inode_lock);
                /* We released the lock, so.. */
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        inode->i_ino = ino;
                        inodes_stat.nr_inodes++;
                        list_add(&inode->i_list, &inode_in_use);
                        list_add(&inode->i_sb_list, &sb->s_inodes);
                        hlist_add_head(&inode->i_hash, head);
                        inode->i_state = I_LOCK|I_NEW;
                        spin_unlock(&inode_lock);

                        /* Return the locked inode with I_NEW set, the
                         * caller is responsible for filling in the contents
                         */
                        return inode;
                }

                /*
                 * Uhhuh, somebody else created the same inode under
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
                __iget(old);
                spin_unlock(&inode_lock);
                destroy_inode(inode);
                inode = old;
                wait_on_inode(inode);
        }
        return inode;
}

static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
{
        unsigned long tmp;

        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
                        L1_CACHE_BYTES;
        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
        return tmp & I_HASHMASK;
}

/**
 * iunique - get a unique inode number
 * @sb: superblock
 * @max_reserved: highest reserved inode number
 *
 * Obtain an inode number that is unique on the system for a given
 * superblock. This is used by file systems that have no natural
 * permanent inode numbering system. An inode number is returned that
 * is higher than the reserved limit but unique.
 *
 * BUGS:
 * With a large number of inodes live on the file system this function
 * currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
        static ino_t counter;
        struct inode *inode;
        struct hlist_head * head;
        ino_t res;
        spin_lock(&inode_lock);
retry:
        if (counter > max_reserved) {
                head = inode_hashtable + hash(sb, counter);
                res = counter++;
                inode = find_inode_fast(sb, head, res);
                if (!inode) {
                        spin_unlock(&inode_lock);
                        return res;
                }
        } else {
                counter = max_reserved + 1;
        }
        goto retry;

}

EXPORT_SYMBOL(iunique);

struct inode *igrab(struct inode *inode)
{
        spin_lock(&inode_lock);
        if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
                __iget(inode);
        else
                /*
                 * Handle the case where s_op->clear_inode has not been
                 * called yet, and somebody is calling igrab
                 * while the inode is getting freed.
                 */
                inode = NULL;
        spin_unlock(&inode_lock);
        return inode;
}

EXPORT_SYMBOL(igrab);
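
/*
 * Example: a sketch of how a pseudo-filesystem with no stable on-disk inode
 * numbers might combine new_inode() and iunique().  MYFS_FIRST_INO and
 * myfs_get_inode() are hypothetical:
 *
 *        static struct inode *myfs_get_inode(struct super_block *sb, int mode)
 *        {
 *                struct inode *inode = new_inode(sb);
 *
 *                if (inode) {
 *                        inode->i_ino = iunique(sb, MYFS_FIRST_INO);
 *                        inode->i_mode = mode;
 *                        inode->i_atime = inode->i_mtime = inode->i_ctime =
 *                                                        CURRENT_TIME;
 *                }
 *                return inode;
 *        }
 *
 * new_inode() already assigned a number from its static last_ino counter;
 * iunique() replaces it with one that is guaranteed not to clash with any
 * inode currently hashed on this superblock.
 */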
/**
 * ifind - internal function, you want ilookup5() or iget5().
 * @sb: super block of file system to search
 * @head: the head of the list to search
 * @test: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @test
 *
 * ifind() searches for the inode specified by @data in the inode
 * cache. This is a generalized version of ifind_fast() for file systems where
 * the inode number is not sufficient for unique identification of an inode.
 *
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Otherwise NULL is returned.
 *
 * Note, @test is called with the inode_lock held, so can't sleep.
 */
static inline struct inode *ifind(struct super_block *sb,
                struct hlist_head *head, int (*test)(struct inode *, void *),
                void *data)
{
        struct inode *inode;

        spin_lock(&inode_lock);
        inode = find_inode(sb, head, test, data);
        if (inode) {
                __iget(inode);
                spin_unlock(&inode_lock);
                wait_on_inode(inode);
                return inode;
        }
        spin_unlock(&inode_lock);
        return NULL;
}

/**
 * ifind_fast - internal function, you want ilookup() or iget().
 * @sb: super block of file system to search
 * @head: head of the list to search
 * @ino: inode number to search for
 *
 * ifind_fast() searches for the inode @ino in the inode cache. This is for
 * file systems where the inode number is sufficient for unique identification
 * of an inode.
 *
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Otherwise NULL is returned.
 */
static inline struct inode *ifind_fast(struct super_block *sb,
                struct hlist_head *head, unsigned long ino)
{
        struct inode *inode;

        spin_lock(&inode_lock);
        inode = find_inode_fast(sb, head, ino);
        if (inode) {
                __iget(inode);
                spin_unlock(&inode_lock);
                wait_on_inode(inode);
                return inode;
        }
        spin_unlock(&inode_lock);
        return NULL;
}

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @hashval: hash value (usually inode number) to search for
 * @test: callback used for comparisons between inodes
 * @data: opaque data pointer to pass to @test
 *
 * ilookup5() uses ifind() to search for the inode specified by @hashval and
 * @data in the inode cache. This is a generalized version of ilookup() for
 * file systems where the inode number is not sufficient for unique
 * identification of an inode.
 *
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Otherwise NULL is returned.
 *
 * Note, @test is called with the inode_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(sb, hashval);

        return ifind(sb, head, test, data);
}

EXPORT_SYMBOL(ilookup5);
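
/*
 * Example: an ilookup5() caller.  File systems whose inodes are not uniquely
 * identified by i_ino supply a @test callback that compares fs-private data;
 * MYFS_I() and the fileid field are hypothetical:
 *
 *        static int myfs_test_inode(struct inode *inode, void *data)
 *        {
 *                return MYFS_I(inode)->fileid == *(u64 *)data;
 *        }
 *
 *        inode = ilookup5(sb, hashval, myfs_test_inode, &fileid);
 *        if (inode) {
 *                ... use the cached inode ...
 *                iput(inode);            (drop the reference ifind() took)
 *        }
 *
 * Because @test runs under inode_lock it must not sleep; anything that can
 * block has to happen after ilookup5() returns.
 */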
/**
 * ilookup - search for an inode in the inode cache
 * @sb: super block of file system to search
 * @ino: inode number to search for
 *
 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
 * This is for file systems where the inode number is sufficient for unique
 * identification of an inode.
 *
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Otherwise NULL is returned.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);

        return ifind_fast(sb, head, ino);
}

EXPORT_SYMBOL(ilookup);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb: super block of file system
 * @hashval: hash value (usually inode number) to get
 * @test: callback used for comparisons between inodes
 * @set: callback used to initialize a new struct inode
 * @data: opaque data pointer to pass to @test and @set
 *
 * This is iget() without the read_inode() portion of get_new_inode().
 *
 * iget5_locked() uses ifind() to search for the inode specified by @hashval
 * and @data in the inode cache and if present it is returned with an increased
 * reference count. This is a generalized version of iget_locked() for file
 * systems where the inode number is not sufficient for unique identification
 * of an inode.
 *
 * If the inode is not in cache, get_new_inode() is called to allocate a new
 * inode and this is returned locked, hashed, and with the I_NEW flag set. The
 * file system gets to fill it in before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_lock held, so can't sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(sb, hashval);
        struct inode *inode;

        inode = ifind(sb, head, test, data);
        if (inode)
                return inode;
        /*
         * get_new_inode() will do the right thing, re-trying the search
         * in case it had to block at any point.
         */
        return get_new_inode(sb, head, test, set, data);
}

EXPORT_SYMBOL(iget5_locked);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb: super block of file system
 * @ino: inode number to get
 *
 * This is iget() without the read_inode() portion of get_new_inode_fast().
 *
 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
 * the inode cache and if present it is returned with an increased reference
 * count. This is for file systems where the inode number is sufficient for
 * unique identification of an inode.
 *
 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
 * The file system gets to fill it in before unlocking it via
 * unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;

        inode = ifind_fast(sb, head, ino);
        if (inode)
                return inode;
        /*
         * get_new_inode_fast() will do the right thing, re-trying the search
         * in case it had to block at any point.
         */
        return get_new_inode_fast(sb, head, ino);
}

EXPORT_SYMBOL(iget_locked);
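
/*
 * Example: the usual iget_locked() pattern for a filesystem that fills in
 * inodes itself rather than providing ->read_inode.  The helper
 * myfs_read_inode_from_disk() and this particular error path are
 * hypothetical:
 *
 *        struct inode *inode = iget_locked(sb, ino);
 *
 *        if (!inode)
 *                return ERR_PTR(-ENOMEM);
 *        if (inode->i_state & I_NEW) {
 *                int err = myfs_read_inode_from_disk(inode);
 *
 *                if (err) {
 *                        unlock_new_inode(inode);
 *                        iput(inode);
 *                        return ERR_PTR(err);
 *                }
 *                unlock_new_inode(inode);
 *        }
 *        return inode;
 *
 * On a cache hit the I_NEW branch is skipped entirely: ifind_fast() has
 * already waited for any concurrent initialization and returned a fully set
 * up inode with an extra reference.
 */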
/**
 * __insert_inode_hash - hash an inode
 * @inode: unhashed inode
 * @hashval: unsigned long value used to locate this object in the
 *           inode_hashtable.
 *
 * Add an inode to the inode hash for this superblock.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
        struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
        spin_lock(&inode_lock);
        hlist_add_head(&inode->i_hash, head);
        spin_unlock(&inode_lock);
}

EXPORT_SYMBOL(__insert_inode_hash);

/**
 * remove_inode_hash - remove an inode from the hash
 * @inode: inode to unhash
 *
 * Remove an inode from the superblock's inode hash table.
 */
void remove_inode_hash(struct inode *inode)
{
        spin_lock(&inode_lock);
        hlist_del_init(&inode->i_hash);
        spin_unlock(&inode_lock);
}

EXPORT_SYMBOL(remove_inode_hash);

/*
 * Tell the filesystem that this inode is no longer of any interest and should
 * be completely destroyed.
 *
 * We leave the inode in the inode hash table until *after* the filesystem's
 * ->delete_inode completes.  This ensures that an iget (such as nfsd might
 * instigate) will always find up-to-date information either in the hash or on
 * disk.
 *
 * I_FREEING is set so that no-one will take a new reference to the inode while
 * it is being deleted.
 */
void generic_delete_inode(struct inode *inode)
{
        struct super_operations *op = inode->i_sb->s_op;

        list_del_init(&inode->i_list);
        list_del_init(&inode->i_sb_list);
        inode->i_state |= I_FREEING;
        inodes_stat.nr_inodes--;
        spin_unlock(&inode_lock);

        if (inode->i_data.nrpages)
                truncate_inode_pages(&inode->i_data, 0);

        security_inode_delete(inode);

        if (op->delete_inode) {
                void (*delete)(struct inode *) = op->delete_inode;
                if (!is_bad_inode(inode))
                        DQUOT_INIT(inode);
                /* s_op->delete_inode internally recalls clear_inode() */
                delete(inode);
        } else
                clear_inode(inode);
        spin_lock(&inode_lock);
        hlist_del_init(&inode->i_hash);
        spin_unlock(&inode_lock);
        wake_up_inode(inode);
        if (inode->i_state != I_CLEAR)
                BUG();
        destroy_inode(inode);
}

EXPORT_SYMBOL(generic_delete_inode);

static void generic_forget_inode(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        if (!hlist_unhashed(&inode->i_hash)) {
                if (!(inode->i_state & (I_DIRTY|I_LOCK)))
                        list_move(&inode->i_list, &inode_unused);
                inodes_stat.nr_unused++;
                if (!sb || (sb->s_flags & MS_ACTIVE)) {
                        spin_unlock(&inode_lock);
                        return;
                }
                inode->i_state |= I_WILL_FREE;
                spin_unlock(&inode_lock);
                write_inode_now(inode, 1);
                spin_lock(&inode_lock);
                inode->i_state &= ~I_WILL_FREE;
                inodes_stat.nr_unused--;
                hlist_del_init(&inode->i_hash);
        }
        list_del_init(&inode->i_list);
        list_del_init(&inode->i_sb_list);
        inode->i_state |= I_FREEING;
        inodes_stat.nr_inodes--;
        spin_unlock(&inode_lock);
        if (inode->i_data.nrpages)
                truncate_inode_pages(&inode->i_data, 0);
        clear_inode(inode);
        destroy_inode(inode);
}

/*
 * Normal UNIX filesystem behaviour: delete the
 * inode when the usage count drops to zero, and
 * i_nlink is zero.
 */
void generic_drop_inode(struct inode *inode)
{
        if (!inode->i_nlink)
                generic_delete_inode(inode);
        else
                generic_forget_inode(inode);
}

EXPORT_SYMBOL_GPL(generic_drop_inode);
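
/*
 * Example: ->drop_inode is the hook iput_final() below invokes with
 * inode_lock held.  Most filesystems leave it NULL and get
 * generic_drop_inode(); a purely RAM-backed filesystem that never wants
 * unused inodes parked on inode_unused could instead point it at
 * generic_delete_inode (the myfs_super_ops name is hypothetical):
 *
 *        static struct super_operations myfs_super_ops = {
 *                .statfs         = simple_statfs,
 *                .drop_inode     = generic_delete_inode,
 *        };
 *
 * Any custom drop_inode must honour the same contract: it is entered with
 * inode_lock held and is responsible for releasing it.
 */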
/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop()" function, defaulting to
 * the legacy UNIX filesystem behaviour..
 *
 * NOTE! NOTE! NOTE! We're called with the inode lock
 * held, and the drop function is supposed to release
 * the lock!
 */
static inline void iput_final(struct inode *inode)
{
        struct super_operations *op = inode->i_sb->s_op;
        void (*drop)(struct inode *) = generic_drop_inode;

        if (op && op->drop_inode)
                drop = op->drop_inode;
        drop(inode);
}

/**
 * iput - put an inode
 * @inode: inode to put
 *
 * Puts an inode, dropping its usage count. If the inode use count hits
 * zero, the inode is then freed and may also be destroyed.
 *
 * Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
        if (inode) {
                struct super_operations *op = inode->i_sb->s_op;

                BUG_ON(inode->i_state == I_CLEAR);

                if (op && op->put_inode)
                        op->put_inode(inode);

                if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
                        iput_final(inode);
        }
}

EXPORT_SYMBOL(iput);

/**
 * bmap - find a block number in a file
 * @inode: inode of file
 * @block: block to find
 *
 * Returns the block number on the device holding the inode that
 * is the disk block number for the block of the file requested.
 * That is, if asked for block 4 of inode 1, the function returns the
 * disk block, relative to the start of the disk, that holds that block
 * of the file.
 */
sector_t bmap(struct inode * inode, sector_t block)
{
        sector_t res = 0;
        if (inode->i_mapping->a_ops->bmap)
                res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
        return res;
}

EXPORT_SYMBOL(bmap);

/**
 * update_atime - update the access time
 * @inode: inode accessed
 *
 * Update the accessed time on an inode and mark it for writeback.
 * This function automatically handles read only file systems and media,
 * as well as the "noatime" flag and inode specific "noatime" markers.
 */
void update_atime(struct inode *inode)
{
        struct timespec now;

        if (IS_NOATIME(inode))
                return;
        if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
                return;
        if (IS_RDONLY(inode))
                return;

        now = current_fs_time(inode->i_sb);
        if (!timespec_equal(&inode->i_atime, &now)) {
                inode->i_atime = now;
                mark_inode_dirty_sync(inode);
        }
}

EXPORT_SYMBOL(update_atime);
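
/*
 * Example: how callers are expected to use the time helpers.  Read paths
 * touch only the access time, write paths update mtime (and normally ctime)
 * through inode_update_time() below; the surrounding call sites here are a
 * hypothetical sketch:
 *
 *        update_atime(inode);                    after a successful read
 *
 *        inode_update_time(inode, 1);            after a successful write
 *        if (inode_needs_sync(inode))
 *                err = write_inode_now(inode, 1);
 *
 * update_atime() and inode_update_time() are no-ops on read-only mounts and
 * honour the noatime / nocmtime style flags, so callers need not check those
 * conditions first.
 */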
/**
 * inode_update_time - update mtime and ctime
 * @inode: inode accessed
 * @ctime_too: update ctime too
 *
 * Update the mtime on an inode and mark it for writeback.
 * When ctime_too is specified update the ctime too.
 */
void inode_update_time(struct inode *inode, int ctime_too)
{
        struct timespec now;
        int sync_it = 0;

        if (IS_NOCMTIME(inode))
                return;
        if (IS_RDONLY(inode))
                return;

        now = current_fs_time(inode->i_sb);
        if (!timespec_equal(&inode->i_mtime, &now))
                sync_it = 1;
        inode->i_mtime = now;

        if (ctime_too) {
                if (!timespec_equal(&inode->i_ctime, &now))
                        sync_it = 1;
                inode->i_ctime = now;
        }
        if (sync_it)
                mark_inode_dirty_sync(inode);
}

EXPORT_SYMBOL(inode_update_time);

int inode_needs_sync(struct inode *inode)
{
        if (IS_SYNC(inode))
                return 1;
        if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
                return 1;
        return 0;
}

EXPORT_SYMBOL(inode_needs_sync);

/*
 * Quota functions that want to walk the inode lists..
 */
#ifdef CONFIG_QUOTA

/* Function back in dquot.c */
int remove_inode_dquot_ref(struct inode *, int, struct list_head *);

void remove_dquot_ref(struct super_block *sb, int type,
                        struct list_head *tofree_head)
{
        struct inode *inode;

        if (!sb->dq_op)
                return; /* nothing to do */
        spin_lock(&inode_lock); /* This lock is for inodes code */

        /*
         * We don't have to lock against quota code - the IS_NOQUOTA test is
         * just for speedup...
         */
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
                if (!IS_NOQUOTA(inode))
                        remove_inode_dquot_ref(inode, type, tofree_head);

        spin_unlock(&inode_lock);
}

#endif

int inode_wait(void *word)
{
        schedule();
        return 0;
}

/*
 * If we try to find an inode in the inode hash while it is being deleted, we
 * have to wait until the filesystem completes its deletion before reporting
 * that it isn't found.  This is because iget will immediately call
 * ->read_inode, and we want to be sure that evidence of the deletion is found
 * by ->read_inode.
 * This is called with inode_lock held.
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
        wait_queue_head_t *wq;
        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK);

        /*
         * I_FREEING and I_CLEAR are cleared in process context under
         * inode_lock, so we have to give the tasks who would clear them
         * a chance to run and acquire inode_lock.
         */
        if (!(inode->i_state & I_LOCK)) {
                spin_unlock(&inode_lock);
                yield();
                spin_lock(&inode_lock);
                return;
        }
        wq = bit_waitqueue(&inode->i_state, __I_LOCK);
        prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode_lock);
        schedule();
        finish_wait(wq, &wait.wait);
        spin_lock(&inode_lock);
}

void wake_up_inode(struct inode *inode)
{
        /*
         * Prevent speculative execution through spin_unlock(&inode_lock);
         */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_LOCK);
}

static __initdata unsigned long ihash_entries;
static int __init set_ihash_entries(char *str)
{
        if (!str)
                return 0;
        ihash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("ihash_entries=", set_ihash_entries);
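
/*
 * Example: the inode hash size can be forced from the kernel command line,
 * e.g. booting with
 *
 *        ihash_entries=131072
 *
 * makes alloc_large_system_hash() below size the table from that value
 * (rounded to a power of two) instead of scaling it from the amount of
 * memory.
 */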
/*
 * Initialize the waitqueues and inode hash table.
 */
void __init inode_init_early(void)
{
        int loop;

        /* If hashes are distributed across NUMA nodes, defer
         * hash allocation until vmalloc space is available.
         */
        if (hashdist)
                return;

        inode_hashtable =
                alloc_large_system_hash("Inode-cache",
                                        sizeof(struct hlist_head),
                                        ihash_entries,
                                        14,
                                        HASH_EARLY,
                                        &i_hash_shift,
                                        &i_hash_mask,
                                        0);

        for (loop = 0; loop < (1 << i_hash_shift); loop++)
                INIT_HLIST_HEAD(&inode_hashtable[loop]);
}

void __init inode_init(unsigned long mempages)
{
        int loop;

        /* inode slab cache */
        inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
                                0, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC, init_once,
                                NULL);
        set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);

        /* Hash may have been set up in inode_init_early */
        if (!hashdist)
                return;

        inode_hashtable =
                alloc_large_system_hash("Inode-cache",
                                        sizeof(struct hlist_head),
                                        ihash_entries,
                                        14,
                                        0,
                                        &i_hash_shift,
                                        &i_hash_mask,
                                        0);

        for (loop = 0; loop < (1 << i_hash_shift); loop++)
                INIT_HLIST_HEAD(&inode_hashtable[loop]);
}

void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
        inode->i_mode = mode;
        if (S_ISCHR(mode)) {
                inode->i_fop = &def_chr_fops;
                inode->i_rdev = rdev;
        } else if (S_ISBLK(mode)) {
                inode->i_fop = &def_blk_fops;
                inode->i_rdev = rdev;
        } else if (S_ISFIFO(mode))
                inode->i_fop = &def_fifo_fops;
        else if (S_ISSOCK(mode))
                inode->i_fop = &bad_sock_fops;
        else
                printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
                       mode);
}
EXPORT_SYMBOL(init_special_inode);
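
/*
 * Example: the usual caller of init_special_inode() is a filesystem's
 * ->mknod method.  This sketch is hypothetical ("myfs"), but follows the
 * common in-memory filesystem pattern:
 *
 *        static int myfs_mknod(struct inode *dir, struct dentry *dentry,
 *                              int mode, dev_t dev)
 *        {
 *                struct inode *inode = new_inode(dir->i_sb);
 *
 *                if (!inode)
 *                        return -ENOMEM;
 *                inode->i_mode = mode;
 *                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 *                if (S_ISCHR(mode) || S_ISBLK(mode) ||
 *                    S_ISFIFO(mode) || S_ISSOCK(mode))
 *                        init_special_inode(inode, mode, dev);
 *                d_instantiate(dentry, inode);
 *                return 0;
 *        }
 *
 * Regular files and directories get their i_op/i_fop from the filesystem;
 * init_special_inode() only knows about device nodes, FIFOs and sockets.
 */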