xref: /linux/fs/inode.c (revision b43ab901d671e3e3cad425ea5e9a3c74e266dcdd)
1 /*
2  * (C) 1997 Linus Torvalds
3  * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
4  */
5 #include <linux/fs.h>
6 #include <linux/mm.h>
7 #include <linux/dcache.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/writeback.h>
11 #include <linux/module.h>
12 #include <linux/backing-dev.h>
13 #include <linux/wait.h>
14 #include <linux/rwsem.h>
15 #include <linux/hash.h>
16 #include <linux/swap.h>
17 #include <linux/security.h>
18 #include <linux/pagemap.h>
19 #include <linux/cdev.h>
20 #include <linux/bootmem.h>
21 #include <linux/fsnotify.h>
22 #include <linux/mount.h>
23 #include <linux/async.h>
24 #include <linux/posix_acl.h>
25 #include <linux/prefetch.h>
26 #include <linux/ima.h>
27 #include <linux/cred.h>
28 #include <linux/buffer_head.h> /* for inode_has_buffers */
29 #include <linux/ratelimit.h>
30 #include "internal.h"
31 
32 /*
33  * Inode locking rules:
34  *
35  * inode->i_lock protects:
36  *   inode->i_state, inode->i_hash, __iget()
37  * inode->i_sb->s_inode_lru_lock protects:
38  *   inode->i_sb->s_inode_lru, inode->i_lru
39  * inode_sb_list_lock protects:
40  *   sb->s_inodes, inode->i_sb_list
41  * bdi->wb.list_lock protects:
42  *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
43  * inode_hash_lock protects:
44  *   inode_hashtable, inode->i_hash
45  *
46  * Lock ordering:
47  *
48  * inode_sb_list_lock
49  *   inode->i_lock
50  *     inode->i_sb->s_inode_lru_lock
51  *
52  * bdi->wb.list_lock
53  *   inode->i_lock
54  *
55  * inode_hash_lock
56  *   inode_sb_list_lock
57  *   inode->i_lock
58  *
59  * iunique_lock
60  *   inode_hash_lock
61  */
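
/*
 * Illustrative sketch (not part of the original file): the ordering above
 * means that a walker of the per-sb inode list nests the locks like this:
 *
 *	spin_lock(&inode_sb_list_lock);
 *	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 *		spin_lock(&inode->i_lock);
 *		...
 *		spin_unlock(&inode->i_lock);
 *	}
 *	spin_unlock(&inode_sb_list_lock);
 *
 * evict_inodes() and invalidate_inodes() below follow this pattern.
 */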
62 
63 static unsigned int i_hash_mask __read_mostly;
64 static unsigned int i_hash_shift __read_mostly;
65 static struct hlist_head *inode_hashtable __read_mostly;
66 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
67 
68 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
69 
70 /*
71  * Empty aops. Can be used for the cases where the user does not
72  * define any of the address_space operations.
73  */
74 const struct address_space_operations empty_aops = {
75 };
76 EXPORT_SYMBOL(empty_aops);
77 
78 /*
79  * Statistics gathering..
80  */
81 struct inodes_stat_t inodes_stat;
82 
83 static DEFINE_PER_CPU(unsigned int, nr_inodes);
84 static DEFINE_PER_CPU(unsigned int, nr_unused);
85 
86 static struct kmem_cache *inode_cachep __read_mostly;
87 
88 static int get_nr_inodes(void)
89 {
90 	int i;
91 	int sum = 0;
92 	for_each_possible_cpu(i)
93 		sum += per_cpu(nr_inodes, i);
94 	return sum < 0 ? 0 : sum;
95 }
96 
97 static inline int get_nr_inodes_unused(void)
98 {
99 	int i;
100 	int sum = 0;
101 	for_each_possible_cpu(i)
102 		sum += per_cpu(nr_unused, i);
103 	return sum < 0 ? 0 : sum;
104 }
105 
106 int get_nr_dirty_inodes(void)
107 {
108 	/* not actually dirty inodes, but a wild approximation */
109 	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
110 	return nr_dirty > 0 ? nr_dirty : 0;
111 }
112 
113 /*
114  * Handle nr_inode sysctl
115  */
116 #ifdef CONFIG_SYSCTL
117 int proc_nr_inodes(ctl_table *table, int write,
118 		   void __user *buffer, size_t *lenp, loff_t *ppos)
119 {
120 	inodes_stat.nr_inodes = get_nr_inodes();
121 	inodes_stat.nr_unused = get_nr_inodes_unused();
122 	return proc_dointvec(table, write, buffer, lenp, ppos);
123 }
124 #endif
125 
126 /**
127  * inode_init_always - perform inode structure initialisation
128  * @sb: superblock inode belongs to
129  * @inode: inode to initialise
130  *
131  * These are initializations that need to be done on every inode
132  * allocation as the fields are not initialised by slab allocation.
133  */
134 int inode_init_always(struct super_block *sb, struct inode *inode)
135 {
136 	static const struct inode_operations empty_iops;
137 	static const struct file_operations empty_fops;
138 	struct address_space *const mapping = &inode->i_data;
139 
140 	inode->i_sb = sb;
141 	inode->i_blkbits = sb->s_blocksize_bits;
142 	inode->i_flags = 0;
143 	atomic_set(&inode->i_count, 1);
144 	inode->i_op = &empty_iops;
145 	inode->i_fop = &empty_fops;
146 	inode->__i_nlink = 1;
147 	inode->i_opflags = 0;
148 	inode->i_uid = 0;
149 	inode->i_gid = 0;
150 	atomic_set(&inode->i_writecount, 0);
151 	inode->i_size = 0;
152 	inode->i_blocks = 0;
153 	inode->i_bytes = 0;
154 	inode->i_generation = 0;
155 #ifdef CONFIG_QUOTA
156 	memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
157 #endif
158 	inode->i_pipe = NULL;
159 	inode->i_bdev = NULL;
160 	inode->i_cdev = NULL;
161 	inode->i_rdev = 0;
162 	inode->dirtied_when = 0;
163 
164 	if (security_inode_alloc(inode))
165 		goto out;
166 	spin_lock_init(&inode->i_lock);
167 	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
168 
169 	mutex_init(&inode->i_mutex);
170 	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
171 
172 	atomic_set(&inode->i_dio_count, 0);
173 
174 	mapping->a_ops = &empty_aops;
175 	mapping->host = inode;
176 	mapping->flags = 0;
177 	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
178 	mapping->assoc_mapping = NULL;
179 	mapping->backing_dev_info = &default_backing_dev_info;
180 	mapping->writeback_index = 0;
181 
182 	/*
183 	 * If the block_device provides a backing_dev_info for client
184 	 * inodes then use that.  Otherwise the inode shares the bdev's
185 	 * backing_dev_info.
186 	 */
187 	if (sb->s_bdev) {
188 		struct backing_dev_info *bdi;
189 
190 		bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
191 		mapping->backing_dev_info = bdi;
192 	}
193 	inode->i_private = NULL;
194 	inode->i_mapping = mapping;
195 	INIT_LIST_HEAD(&inode->i_dentry);	/* buggered by rcu freeing */
196 #ifdef CONFIG_FS_POSIX_ACL
197 	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
198 #endif
199 
200 #ifdef CONFIG_FSNOTIFY
201 	inode->i_fsnotify_mask = 0;
202 #endif
203 
204 	this_cpu_inc(nr_inodes);
205 
206 	return 0;
207 out:
208 	return -ENOMEM;
209 }
210 EXPORT_SYMBOL(inode_init_always);
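
/*
 * Example (hedged sketch, not part of the original file): inode_init_always()
 * is exported so that a filesystem which manages its own in-core inodes can
 * reinitialise one before reuse.  "myfs_reuse_inode" is hypothetical:
 *
 *	static int myfs_reuse_inode(struct super_block *sb, struct inode *inode)
 *	{
 *		if (inode_init_always(sb, inode))
 *			return -ENOMEM;
 *		... fill in filesystem-specific fields here ...
 *		return 0;
 *	}
 */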
211 
212 static struct inode *alloc_inode(struct super_block *sb)
213 {
214 	struct inode *inode;
215 
216 	if (sb->s_op->alloc_inode)
217 		inode = sb->s_op->alloc_inode(sb);
218 	else
219 		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
220 
221 	if (!inode)
222 		return NULL;
223 
224 	if (unlikely(inode_init_always(sb, inode))) {
225 		if (inode->i_sb->s_op->destroy_inode)
226 			inode->i_sb->s_op->destroy_inode(inode);
227 		else
228 			kmem_cache_free(inode_cachep, inode);
229 		return NULL;
230 	}
231 
232 	return inode;
233 }
234 
235 void free_inode_nonrcu(struct inode *inode)
236 {
237 	kmem_cache_free(inode_cachep, inode);
238 }
239 EXPORT_SYMBOL(free_inode_nonrcu);
240 
241 void __destroy_inode(struct inode *inode)
242 {
243 	BUG_ON(inode_has_buffers(inode));
244 	security_inode_free(inode);
245 	fsnotify_inode_delete(inode);
246 	if (!inode->i_nlink) {
247 		WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
248 		atomic_long_dec(&inode->i_sb->s_remove_count);
249 	}
250 
251 #ifdef CONFIG_FS_POSIX_ACL
252 	if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
253 		posix_acl_release(inode->i_acl);
254 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
255 		posix_acl_release(inode->i_default_acl);
256 #endif
257 	this_cpu_dec(nr_inodes);
258 }
259 EXPORT_SYMBOL(__destroy_inode);
260 
261 static void i_callback(struct rcu_head *head)
262 {
263 	struct inode *inode = container_of(head, struct inode, i_rcu);
264 	kmem_cache_free(inode_cachep, inode);
265 }
266 
267 static void destroy_inode(struct inode *inode)
268 {
269 	BUG_ON(!list_empty(&inode->i_lru));
270 	__destroy_inode(inode);
271 	if (inode->i_sb->s_op->destroy_inode)
272 		inode->i_sb->s_op->destroy_inode(inode);
273 	else
274 		call_rcu(&inode->i_rcu, i_callback);
275 }
276 
277 /**
278  * drop_nlink - directly drop an inode's link count
279  * @inode: inode
280  *
281  * This is a low-level filesystem helper to replace any
282  * direct filesystem manipulation of i_nlink.  In cases
283  * where we are attempting to track writes to the
284  * filesystem, a decrement to zero means an imminent
285  * write when the file is truncated and actually unlinked
286  * on the filesystem.
287  */
288 void drop_nlink(struct inode *inode)
289 {
290 	WARN_ON(inode->i_nlink == 0);
291 	inode->__i_nlink--;
292 	if (!inode->i_nlink)
293 		atomic_long_inc(&inode->i_sb->s_remove_count);
294 }
295 EXPORT_SYMBOL(drop_nlink);
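
/*
 * Example (hedged sketch): a filesystem's ->unlink() typically removes the
 * directory entry and then drops the link count through this helper instead
 * of touching i_nlink directly.  "myfs_remove_dirent" is hypothetical:
 *
 *	static int myfs_unlink(struct inode *dir, struct dentry *dentry)
 *	{
 *		struct inode *inode = dentry->d_inode;
 *		int err = myfs_remove_dirent(dir, dentry);
 *
 *		if (err)
 *			return err;
 *		inode->i_ctime = dir->i_ctime;
 *		drop_nlink(inode);
 *		mark_inode_dirty(inode);
 *		return 0;
 *	}
 */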
296 
297 /**
298  * clear_nlink - directly zero an inode's link count
299  * @inode: inode
300  *
301  * This is a low-level filesystem helper to replace any
302  * direct filesystem manipulation of i_nlink.  See
303  * drop_nlink() for why we care about i_nlink hitting zero.
304  */
305 void clear_nlink(struct inode *inode)
306 {
307 	if (inode->i_nlink) {
308 		inode->__i_nlink = 0;
309 		atomic_long_inc(&inode->i_sb->s_remove_count);
310 	}
311 }
312 EXPORT_SYMBOL(clear_nlink);
313 
314 /**
315  * set_nlink - directly set an inode's link count
316  * @inode: inode
317  * @nlink: new nlink (should be non-zero)
318  *
319  * This is a low-level filesystem helper to replace any
320  * direct filesystem manipulation of i_nlink.
321  */
322 void set_nlink(struct inode *inode, unsigned int nlink)
323 {
324 	if (!nlink) {
325 		clear_nlink(inode);
326 	} else {
327 		/* Yes, some filesystems do change nlink from zero to one */
328 		if (inode->i_nlink == 0)
329 			atomic_long_dec(&inode->i_sb->s_remove_count);
330 
331 		inode->__i_nlink = nlink;
332 	}
333 }
334 EXPORT_SYMBOL(set_nlink);
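
/*
 * Example (illustrative only): a filesystem reading an inode from disk sets
 * the link count through this helper rather than assigning i_nlink, e.g.
 * with a hypothetical on-disk inode "raw":
 *
 *	set_nlink(inode, le16_to_cpu(raw->i_links_count));
 */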
335 
336 /**
337  * inc_nlink - directly increment an inode's link count
338  * @inode: inode
339  *
340  * This is a low-level filesystem helper to replace any
341  * direct filesystem manipulation of i_nlink.  Currently,
342  * it is only here for parity with dec_nlink().
343  */
344 void inc_nlink(struct inode *inode)
345 {
346 	if (WARN_ON(inode->i_nlink == 0))
347 		atomic_long_dec(&inode->i_sb->s_remove_count);
348 
349 	inode->__i_nlink++;
350 }
351 EXPORT_SYMBOL(inc_nlink);
352 
353 void address_space_init_once(struct address_space *mapping)
354 {
355 	memset(mapping, 0, sizeof(*mapping));
356 	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
357 	spin_lock_init(&mapping->tree_lock);
358 	mutex_init(&mapping->i_mmap_mutex);
359 	INIT_LIST_HEAD(&mapping->private_list);
360 	spin_lock_init(&mapping->private_lock);
361 	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
362 	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
363 }
364 EXPORT_SYMBOL(address_space_init_once);
365 
366 /*
367  * These are initializations that only need to be done
368  * once, because the fields are idempotent across use
369  * of the inode, so let the slab cache be aware of that.
370  */
371 void inode_init_once(struct inode *inode)
372 {
373 	memset(inode, 0, sizeof(*inode));
374 	INIT_HLIST_NODE(&inode->i_hash);
375 	INIT_LIST_HEAD(&inode->i_devices);
376 	INIT_LIST_HEAD(&inode->i_wb_list);
377 	INIT_LIST_HEAD(&inode->i_lru);
378 	address_space_init_once(&inode->i_data);
379 	i_size_ordered_init(inode);
380 #ifdef CONFIG_FSNOTIFY
381 	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
382 #endif
383 }
384 EXPORT_SYMBOL(inode_init_once);
385 
386 static void init_once(void *foo)
387 {
388 	struct inode *inode = (struct inode *) foo;
389 
390 	inode_init_once(inode);
391 }
392 
393 /*
394  * inode->i_lock must be held
395  */
396 void __iget(struct inode *inode)
397 {
398 	atomic_inc(&inode->i_count);
399 }
400 
401 /*
402  * get additional reference to inode; caller must already hold one.
403  */
404 void ihold(struct inode *inode)
405 {
406 	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
407 }
408 EXPORT_SYMBOL(ihold);
409 
410 static void inode_lru_list_add(struct inode *inode)
411 {
412 	spin_lock(&inode->i_sb->s_inode_lru_lock);
413 	if (list_empty(&inode->i_lru)) {
414 		list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
415 		inode->i_sb->s_nr_inodes_unused++;
416 		this_cpu_inc(nr_unused);
417 	}
418 	spin_unlock(&inode->i_sb->s_inode_lru_lock);
419 }
420 
421 static void inode_lru_list_del(struct inode *inode)
422 {
423 	spin_lock(&inode->i_sb->s_inode_lru_lock);
424 	if (!list_empty(&inode->i_lru)) {
425 		list_del_init(&inode->i_lru);
426 		inode->i_sb->s_nr_inodes_unused--;
427 		this_cpu_dec(nr_unused);
428 	}
429 	spin_unlock(&inode->i_sb->s_inode_lru_lock);
430 }
431 
432 /**
433  * inode_sb_list_add - add inode to the superblock list of inodes
434  * @inode: inode to add
435  */
436 void inode_sb_list_add(struct inode *inode)
437 {
438 	spin_lock(&inode_sb_list_lock);
439 	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
440 	spin_unlock(&inode_sb_list_lock);
441 }
442 EXPORT_SYMBOL_GPL(inode_sb_list_add);
443 
444 static inline void inode_sb_list_del(struct inode *inode)
445 {
446 	if (!list_empty(&inode->i_sb_list)) {
447 		spin_lock(&inode_sb_list_lock);
448 		list_del_init(&inode->i_sb_list);
449 		spin_unlock(&inode_sb_list_lock);
450 	}
451 }
452 
453 static unsigned long hash(struct super_block *sb, unsigned long hashval)
454 {
455 	unsigned long tmp;
456 
457 	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
458 			L1_CACHE_BYTES;
459 	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
460 	return tmp & i_hash_mask;
461 }
462 
463 /**
464  *	__insert_inode_hash - hash an inode
465  *	@inode: unhashed inode
466  *	@hashval: unsigned long value used to locate this object in the
467  *		inode_hashtable.
468  *
469  *	Add an inode to the inode hash for this superblock.
470  */
471 void __insert_inode_hash(struct inode *inode, unsigned long hashval)
472 {
473 	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
474 
475 	spin_lock(&inode_hash_lock);
476 	spin_lock(&inode->i_lock);
477 	hlist_add_head(&inode->i_hash, b);
478 	spin_unlock(&inode->i_lock);
479 	spin_unlock(&inode_hash_lock);
480 }
481 EXPORT_SYMBOL(__insert_inode_hash);
482 
483 /**
484  *	__remove_inode_hash - remove an inode from the hash
485  *	@inode: inode to unhash
486  *
487  *	Remove an inode from the inode hash of its superblock.
488  */
489 void __remove_inode_hash(struct inode *inode)
490 {
491 	spin_lock(&inode_hash_lock);
492 	spin_lock(&inode->i_lock);
493 	hlist_del_init(&inode->i_hash);
494 	spin_unlock(&inode->i_lock);
495 	spin_unlock(&inode_hash_lock);
496 }
497 EXPORT_SYMBOL(__remove_inode_hash);
498 
499 void end_writeback(struct inode *inode)
500 {
501 	might_sleep();
502 	/*
503 	 * We have to cycle tree_lock here because reclaim can be still in the
504 	 * process of removing the last page (in __delete_from_page_cache())
505 	 * and we must not free mapping under it.
506 	 */
507 	spin_lock_irq(&inode->i_data.tree_lock);
508 	BUG_ON(inode->i_data.nrpages);
509 	spin_unlock_irq(&inode->i_data.tree_lock);
510 	BUG_ON(!list_empty(&inode->i_data.private_list));
511 	BUG_ON(!(inode->i_state & I_FREEING));
512 	BUG_ON(inode->i_state & I_CLEAR);
513 	inode_sync_wait(inode);
514 	/* don't need i_lock here, no concurrent mods to i_state */
515 	inode->i_state = I_FREEING | I_CLEAR;
516 }
517 EXPORT_SYMBOL(end_writeback);
518 
519 /*
520  * Free the inode passed in, removing it from the lists it is still connected
521  * to. We remove any pages still attached to the inode and wait for any IO that
522  * is still in progress before finally destroying the inode.
523  *
524  * An inode must already be marked I_FREEING so that we avoid the inode being
525  * moved back onto lists if we race with other code that manipulates the lists
526  * (e.g. writeback_single_inode). The caller is responsible for setting this.
527  *
528  * An inode must already be removed from the LRU list before being evicted from
529  * the cache. This should occur atomically with setting the I_FREEING state
530  * flag, so no inodes here should ever be on the LRU when being evicted.
531  */
532 static void evict(struct inode *inode)
533 {
534 	const struct super_operations *op = inode->i_sb->s_op;
535 
536 	BUG_ON(!(inode->i_state & I_FREEING));
537 	BUG_ON(!list_empty(&inode->i_lru));
538 
539 	if (!list_empty(&inode->i_wb_list))
540 		inode_wb_list_del(inode);
541 
542 	inode_sb_list_del(inode);
543 
544 	if (op->evict_inode) {
545 		op->evict_inode(inode);
546 	} else {
547 		if (inode->i_data.nrpages)
548 			truncate_inode_pages(&inode->i_data, 0);
549 		end_writeback(inode);
550 	}
551 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
552 		bd_forget(inode);
553 	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
554 		cd_forget(inode);
555 
556 	remove_inode_hash(inode);
557 
558 	spin_lock(&inode->i_lock);
559 	wake_up_bit(&inode->i_state, __I_NEW);
560 	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
561 	spin_unlock(&inode->i_lock);
562 
563 	destroy_inode(inode);
564 }
565 
566 /*
567  * dispose_list - dispose of the contents of a local list
568  * @head: the head of the list to free
569  *
570  * Dispose-list gets a local list with local inodes in it, so it doesn't
571  * need to worry about list corruption and SMP locks.
572  */
573 static void dispose_list(struct list_head *head)
574 {
575 	while (!list_empty(head)) {
576 		struct inode *inode;
577 
578 		inode = list_first_entry(head, struct inode, i_lru);
579 		list_del_init(&inode->i_lru);
580 
581 		evict(inode);
582 	}
583 }
584 
585 /**
586  * evict_inodes	- evict all evictable inodes for a superblock
587  * @sb:		superblock to operate on
588  *
589  * Make sure that no inodes with zero refcount are retained.  This is
590  * called by superblock shutdown after having the MS_ACTIVE flag removed,
591  * so any inode reaching zero refcount during or after that call will
592  * be immediately evicted.
593  */
594 void evict_inodes(struct super_block *sb)
595 {
596 	struct inode *inode, *next;
597 	LIST_HEAD(dispose);
598 
599 	spin_lock(&inode_sb_list_lock);
600 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
601 		if (atomic_read(&inode->i_count))
602 			continue;
603 
604 		spin_lock(&inode->i_lock);
605 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
606 			spin_unlock(&inode->i_lock);
607 			continue;
608 		}
609 
610 		inode->i_state |= I_FREEING;
611 		inode_lru_list_del(inode);
612 		spin_unlock(&inode->i_lock);
613 		list_add(&inode->i_lru, &dispose);
614 	}
615 	spin_unlock(&inode_sb_list_lock);
616 
617 	dispose_list(&dispose);
618 }
619 
620 /**
621  * invalidate_inodes	- attempt to free all inodes on a superblock
622  * @sb:		superblock to operate on
623  * @kill_dirty: flag to guide handling of dirty inodes
624  *
625  * Attempts to free all inodes for a given superblock.  If there were any
626  * busy inodes return a non-zero value, else zero.
627  * If @kill_dirty is set, discard dirty inodes too, otherwise treat
628  * them as busy.
629  */
630 int invalidate_inodes(struct super_block *sb, bool kill_dirty)
631 {
632 	int busy = 0;
633 	struct inode *inode, *next;
634 	LIST_HEAD(dispose);
635 
636 	spin_lock(&inode_sb_list_lock);
637 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
638 		spin_lock(&inode->i_lock);
639 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
640 			spin_unlock(&inode->i_lock);
641 			continue;
642 		}
643 		if (inode->i_state & I_DIRTY && !kill_dirty) {
644 			spin_unlock(&inode->i_lock);
645 			busy = 1;
646 			continue;
647 		}
648 		if (atomic_read(&inode->i_count)) {
649 			spin_unlock(&inode->i_lock);
650 			busy = 1;
651 			continue;
652 		}
653 
654 		inode->i_state |= I_FREEING;
655 		inode_lru_list_del(inode);
656 		spin_unlock(&inode->i_lock);
657 		list_add(&inode->i_lru, &dispose);
658 	}
659 	spin_unlock(&inode_sb_list_lock);
660 
661 	dispose_list(&dispose);
662 
663 	return busy;
664 }
665 
666 static int can_unuse(struct inode *inode)
667 {
668 	if (inode->i_state & ~I_REFERENCED)
669 		return 0;
670 	if (inode_has_buffers(inode))
671 		return 0;
672 	if (atomic_read(&inode->i_count))
673 		return 0;
674 	if (inode->i_data.nrpages)
675 		return 0;
676 	return 1;
677 }
678 
679 /*
680  * Walk the superblock inode LRU for freeable inodes and attempt to free them.
681  * This is called from the superblock shrinker function with a number of inodes
682  * to trim from the LRU. Inodes to be freed are moved to a temporary list and
683  * then are freed outside inode_lock by dispose_list().
684  *
685  * Any inodes which are pinned purely because of attached pagecache have their
686  * pagecache removed.  If the inode has metadata buffers attached to
687  * mapping->private_list then try to remove them.
688  *
689  * If the inode has the I_REFERENCED flag set, then it means that it has been
690  * used recently - the flag is set in iput_final(). When we encounter such an
691  * inode, clear the flag and move it to the back of the LRU so it gets another
692  * pass through the LRU before it gets reclaimed. This is necessary because we
693  * do lazy LRU updates to minimise lock contention, so the LRU does not have
694  * strict ordering. Hence we don't want to reclaim inodes
695  * with this flag set because they are the inodes that are out of order.
696  */
697 void prune_icache_sb(struct super_block *sb, int nr_to_scan)
698 {
699 	LIST_HEAD(freeable);
700 	int nr_scanned;
701 	unsigned long reap = 0;
702 
703 	spin_lock(&sb->s_inode_lru_lock);
704 	for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
705 		struct inode *inode;
706 
707 		if (list_empty(&sb->s_inode_lru))
708 			break;
709 
710 		inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
711 
712 		/*
713 		 * we are inverting the sb->s_inode_lru_lock/inode->i_lock order here,
714 		 * so use a trylock. If we fail to get the lock, just move the
715 		 * inode to the back of the list so we don't spin on it.
716 		 */
717 		if (!spin_trylock(&inode->i_lock)) {
718 			list_move_tail(&inode->i_lru, &sb->s_inode_lru);
719 			continue;
720 		}
721 
722 		/*
723 		 * Referenced or dirty inodes are still in use. Give them
724 		 * another pass through the LRU as we cannot reclaim them now.
725 		 */
726 		if (atomic_read(&inode->i_count) ||
727 		    (inode->i_state & ~I_REFERENCED)) {
728 			list_del_init(&inode->i_lru);
729 			spin_unlock(&inode->i_lock);
730 			sb->s_nr_inodes_unused--;
731 			this_cpu_dec(nr_unused);
732 			continue;
733 		}
734 
735 		/* recently referenced inodes get one more pass */
736 		if (inode->i_state & I_REFERENCED) {
737 			inode->i_state &= ~I_REFERENCED;
738 			list_move(&inode->i_lru, &sb->s_inode_lru);
739 			spin_unlock(&inode->i_lock);
740 			continue;
741 		}
742 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
743 			__iget(inode);
744 			spin_unlock(&inode->i_lock);
745 			spin_unlock(&sb->s_inode_lru_lock);
746 			if (remove_inode_buffers(inode))
747 				reap += invalidate_mapping_pages(&inode->i_data,
748 								0, -1);
749 			iput(inode);
750 			spin_lock(&sb->s_inode_lru_lock);
751 
752 			if (inode != list_entry(sb->s_inode_lru.next,
753 						struct inode, i_lru))
754 				continue;	/* wrong inode or list_empty */
755 			/* avoid lock inversions with trylock */
756 			if (!spin_trylock(&inode->i_lock))
757 				continue;
758 			if (!can_unuse(inode)) {
759 				spin_unlock(&inode->i_lock);
760 				continue;
761 			}
762 		}
763 		WARN_ON(inode->i_state & I_NEW);
764 		inode->i_state |= I_FREEING;
765 		spin_unlock(&inode->i_lock);
766 
767 		list_move(&inode->i_lru, &freeable);
768 		sb->s_nr_inodes_unused--;
769 		this_cpu_dec(nr_unused);
770 	}
771 	if (current_is_kswapd())
772 		__count_vm_events(KSWAPD_INODESTEAL, reap);
773 	else
774 		__count_vm_events(PGINODESTEAL, reap);
775 	spin_unlock(&sb->s_inode_lru_lock);
776 	if (current->reclaim_state)
777 		current->reclaim_state->reclaimed_slab += reap;
778 
779 	dispose_list(&freeable);
780 }
781 
782 static void __wait_on_freeing_inode(struct inode *inode);
783 /*
784  * Called with the inode_hash_lock held.
785  */
786 static struct inode *find_inode(struct super_block *sb,
787 				struct hlist_head *head,
788 				int (*test)(struct inode *, void *),
789 				void *data)
790 {
791 	struct hlist_node *node;
792 	struct inode *inode = NULL;
793 
794 repeat:
795 	hlist_for_each_entry(inode, node, head, i_hash) {
796 		spin_lock(&inode->i_lock);
797 		if (inode->i_sb != sb) {
798 			spin_unlock(&inode->i_lock);
799 			continue;
800 		}
801 		if (!test(inode, data)) {
802 			spin_unlock(&inode->i_lock);
803 			continue;
804 		}
805 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
806 			__wait_on_freeing_inode(inode);
807 			goto repeat;
808 		}
809 		__iget(inode);
810 		spin_unlock(&inode->i_lock);
811 		return inode;
812 	}
813 	return NULL;
814 }
815 
816 /*
817  * find_inode_fast is the fast path version of find_inode, see the comment at
818  * iget_locked for details.
819  */
820 static struct inode *find_inode_fast(struct super_block *sb,
821 				struct hlist_head *head, unsigned long ino)
822 {
823 	struct hlist_node *node;
824 	struct inode *inode = NULL;
825 
826 repeat:
827 	hlist_for_each_entry(inode, node, head, i_hash) {
828 		spin_lock(&inode->i_lock);
829 		if (inode->i_ino != ino) {
830 			spin_unlock(&inode->i_lock);
831 			continue;
832 		}
833 		if (inode->i_sb != sb) {
834 			spin_unlock(&inode->i_lock);
835 			continue;
836 		}
837 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
838 			__wait_on_freeing_inode(inode);
839 			goto repeat;
840 		}
841 		__iget(inode);
842 		spin_unlock(&inode->i_lock);
843 		return inode;
844 	}
845 	return NULL;
846 }
847 
848 /*
849  * Each cpu owns a range of LAST_INO_BATCH numbers.
850  * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
851  * to renew the exhausted range.
852  *
853  * This does not significantly increase overflow rate because every CPU can
854  * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
855  * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
856  * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
857  * overflow rate by 2x, which does not seem too significant.
858  *
859  * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
860  * error if st_ino won't fit in target struct field. Use 32bit counter
861  * here to attempt to avoid that.
862  */
863 #define LAST_INO_BATCH 1024
864 static DEFINE_PER_CPU(unsigned int, last_ino);
865 
866 unsigned int get_next_ino(void)
867 {
868 	unsigned int *p = &get_cpu_var(last_ino);
869 	unsigned int res = *p;
870 
871 #ifdef CONFIG_SMP
872 	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
873 		static atomic_t shared_last_ino;
874 		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
875 
876 		res = next - LAST_INO_BATCH;
877 	}
878 #endif
879 
880 	*p = ++res;
881 	put_cpu_var(last_ino);
882 	return res;
883 }
884 EXPORT_SYMBOL(get_next_ino);
885 
886 /**
887  *	new_inode_pseudo 	- obtain an inode
888  *	@sb: superblock
889  *
890  *	Allocates a new inode for the given superblock.
891  *	The inode won't be chained into the superblock's s_inodes list.
892  *	This means:
893  *	- the fs can't be unmounted
894  *	- quotas, fsnotify and writeback can't work
895  */
896 struct inode *new_inode_pseudo(struct super_block *sb)
897 {
898 	struct inode *inode = alloc_inode(sb);
899 
900 	if (inode) {
901 		spin_lock(&inode->i_lock);
902 		inode->i_state = 0;
903 		spin_unlock(&inode->i_lock);
904 		INIT_LIST_HEAD(&inode->i_sb_list);
905 	}
906 	return inode;
907 }
908 
909 /**
910  *	new_inode 	- obtain an inode
911  *	@sb: superblock
912  *
913  *	Allocates a new inode for the given superblock. The default gfp_mask
914  *	for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
915  *	If HIGHMEM pages are unsuitable or it is known that pages allocated
916  *	for the page cache are not reclaimable or migratable,
917  *	mapping_set_gfp_mask() must be called with suitable flags on the
918  *	newly created inode's mapping.
919  *
920  */
921 struct inode *new_inode(struct super_block *sb)
922 {
923 	struct inode *inode;
924 
925 	spin_lock_prefetch(&inode_sb_list_lock);
926 
927 	inode = new_inode_pseudo(sb);
928 	if (inode)
929 		inode_sb_list_add(inode);
930 	return inode;
931 }
932 EXPORT_SYMBOL(new_inode);
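
/*
 * Example (hedged sketch): a simple in-memory filesystem might combine
 * new_inode() with get_next_ino() above; "examplefs_get_inode" is
 * hypothetical:
 *
 *	struct inode *examplefs_get_inode(struct super_block *sb, umode_t mode)
 *	{
 *		struct inode *inode = new_inode(sb);
 *
 *		if (!inode)
 *			return NULL;
 *		inode->i_ino = get_next_ino();
 *		inode->i_mode = mode;
 *		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 *		return inode;
 *	}
 */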
933 
934 #ifdef CONFIG_DEBUG_LOCK_ALLOC
935 void lockdep_annotate_inode_mutex_key(struct inode *inode)
936 {
937 	if (S_ISDIR(inode->i_mode)) {
938 		struct file_system_type *type = inode->i_sb->s_type;
939 
940 		/* Set new key only if filesystem hasn't already changed it */
941 		if (!lockdep_match_class(&inode->i_mutex,
942 		    &type->i_mutex_key)) {
943 			/*
944 			 * ensure nobody is actually holding i_mutex
945 			 */
946 			mutex_destroy(&inode->i_mutex);
947 			mutex_init(&inode->i_mutex);
948 			lockdep_set_class(&inode->i_mutex,
949 					  &type->i_mutex_dir_key);
950 		}
951 	}
952 }
953 EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
954 #endif
955 
956 /**
957  * unlock_new_inode - clear the I_NEW state and wake up any waiters
958  * @inode:	new inode to unlock
959  *
960  * Called when the inode is fully initialised to clear the new state of the
961  * inode and wake up anyone waiting for the inode to finish initialisation.
962  */
963 void unlock_new_inode(struct inode *inode)
964 {
965 	lockdep_annotate_inode_mutex_key(inode);
966 	spin_lock(&inode->i_lock);
967 	WARN_ON(!(inode->i_state & I_NEW));
968 	inode->i_state &= ~I_NEW;
969 	wake_up_bit(&inode->i_state, __I_NEW);
970 	spin_unlock(&inode->i_lock);
971 }
972 EXPORT_SYMBOL(unlock_new_inode);
973 
974 /**
975  * iget5_locked - obtain an inode from a mounted file system
976  * @sb:		super block of file system
977  * @hashval:	hash value (usually inode number) to get
978  * @test:	callback used for comparisons between inodes
979  * @set:	callback used to initialize a new struct inode
980  * @data:	opaque data pointer to pass to @test and @set
981  *
982  * Search for the inode specified by @hashval and @data in the inode cache,
983  * and if present return it with an increased reference count. This is
984  * a generalized version of iget_locked() for file systems where the inode
985  * number is not sufficient for unique identification of an inode.
986  *
987  * If the inode is not in cache, allocate a new inode and return it locked,
988  * hashed, and with the I_NEW flag set. The file system gets to fill it in
989  * before unlocking it via unlock_new_inode().
990  *
991  * Note both @test and @set are called with the inode_hash_lock held, so can't
992  * sleep.
993  */
994 struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
995 		int (*test)(struct inode *, void *),
996 		int (*set)(struct inode *, void *), void *data)
997 {
998 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
999 	struct inode *inode;
1000 
1001 	spin_lock(&inode_hash_lock);
1002 	inode = find_inode(sb, head, test, data);
1003 	spin_unlock(&inode_hash_lock);
1004 
1005 	if (inode) {
1006 		wait_on_inode(inode);
1007 		return inode;
1008 	}
1009 
1010 	inode = alloc_inode(sb);
1011 	if (inode) {
1012 		struct inode *old;
1013 
1014 		spin_lock(&inode_hash_lock);
1015 		/* We released the lock, so.. */
1016 		old = find_inode(sb, head, test, data);
1017 		if (!old) {
1018 			if (set(inode, data))
1019 				goto set_failed;
1020 
1021 			spin_lock(&inode->i_lock);
1022 			inode->i_state = I_NEW;
1023 			hlist_add_head(&inode->i_hash, head);
1024 			spin_unlock(&inode->i_lock);
1025 			inode_sb_list_add(inode);
1026 			spin_unlock(&inode_hash_lock);
1027 
1028 			/* Return the locked inode with I_NEW set, the
1029 			 * caller is responsible for filling in the contents
1030 			 */
1031 			return inode;
1032 		}
1033 
1034 		/*
1035 		 * Uhhuh, somebody else created the same inode under
1036 		 * us. Use the old inode instead of the one we just
1037 		 * allocated.
1038 		 */
1039 		spin_unlock(&inode_hash_lock);
1040 		destroy_inode(inode);
1041 		inode = old;
1042 		wait_on_inode(inode);
1043 	}
1044 	return inode;
1045 
1046 set_failed:
1047 	spin_unlock(&inode_hash_lock);
1048 	destroy_inode(inode);
1049 	return NULL;
1050 }
1051 EXPORT_SYMBOL(iget5_locked);
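
/*
 * Example (hedged sketch): a filesystem whose inodes are keyed by more than
 * an inode number might use iget5_locked() like this.  "struct myfs_key",
 * "MYFS_I()" and the callbacks are hypothetical:
 *
 *	struct myfs_key { u64 objectid; };
 *
 *	static int myfs_test(struct inode *inode, void *data)
 *	{
 *		return MYFS_I(inode)->objectid == ((struct myfs_key *)data)->objectid;
 *	}
 *
 *	static int myfs_set(struct inode *inode, void *data)
 *	{
 *		MYFS_I(inode)->objectid = ((struct myfs_key *)data)->objectid;
 *		return 0;
 *	}
 *
 *	inode = iget5_locked(sb, key.objectid, myfs_test, myfs_set, &key);
 *	if (inode && (inode->i_state & I_NEW)) {
 *		... read the inode from disk ...
 *		unlock_new_inode(inode);
 *	}
 */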
1052 
1053 /**
1054  * iget_locked - obtain an inode from a mounted file system
1055  * @sb:		super block of file system
1056  * @ino:	inode number to get
1057  *
1058  * Search for the inode specified by @ino in the inode cache and if present
1059  * return it with an increased reference count. This is for file systems
1060  * where the inode number is sufficient for unique identification of an inode.
1061  *
1062  * If the inode is not in cache, allocate a new inode and return it locked,
1063  * hashed, and with the I_NEW flag set.  The file system gets to fill it in
1064  * before unlocking it via unlock_new_inode().
1065  */
1066 struct inode *iget_locked(struct super_block *sb, unsigned long ino)
1067 {
1068 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
1069 	struct inode *inode;
1070 
1071 	spin_lock(&inode_hash_lock);
1072 	inode = find_inode_fast(sb, head, ino);
1073 	spin_unlock(&inode_hash_lock);
1074 	if (inode) {
1075 		wait_on_inode(inode);
1076 		return inode;
1077 	}
1078 
1079 	inode = alloc_inode(sb);
1080 	if (inode) {
1081 		struct inode *old;
1082 
1083 		spin_lock(&inode_hash_lock);
1084 		/* We released the lock, so.. */
1085 		old = find_inode_fast(sb, head, ino);
1086 		if (!old) {
1087 			inode->i_ino = ino;
1088 			spin_lock(&inode->i_lock);
1089 			inode->i_state = I_NEW;
1090 			hlist_add_head(&inode->i_hash, head);
1091 			spin_unlock(&inode->i_lock);
1092 			inode_sb_list_add(inode);
1093 			spin_unlock(&inode_hash_lock);
1094 
1095 			/* Return the locked inode with I_NEW set, the
1096 			 * caller is responsible for filling in the contents
1097 			 */
1098 			return inode;
1099 		}
1100 
1101 		/*
1102 		 * Uhhuh, somebody else created the same inode under
1103 		 * us. Use the old inode instead of the one we just
1104 		 * allocated.
1105 		 */
1106 		spin_unlock(&inode_hash_lock);
1107 		destroy_inode(inode);
1108 		inode = old;
1109 		wait_on_inode(inode);
1110 	}
1111 	return inode;
1112 }
1113 EXPORT_SYMBOL(iget_locked);
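
/*
 * Example (hedged sketch): the usual "read inode" pattern for a disk
 * filesystem; "myfs_read_inode" is a hypothetical helper that fills the
 * inode from its on-disk representation:
 *
 *	struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
 *	{
 *		struct inode *inode = iget_locked(sb, ino);
 *
 *		if (!inode)
 *			return ERR_PTR(-ENOMEM);
 *		if (!(inode->i_state & I_NEW))
 *			return inode;
 *		if (myfs_read_inode(inode)) {
 *			iget_failed(inode);
 *			return ERR_PTR(-EIO);
 *		}
 *		unlock_new_inode(inode);
 *		return inode;
 *	}
 */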
1114 
1115 /*
1116  * search the inode cache for a matching inode number.
1117  * If we find one, then the inode number we are trying to
1118  * allocate is not unique and so we should not use it.
1119  *
1120  * Returns 1 if the inode number is unique, 0 if it is not.
1121  */
1122 static int test_inode_iunique(struct super_block *sb, unsigned long ino)
1123 {
1124 	struct hlist_head *b = inode_hashtable + hash(sb, ino);
1125 	struct hlist_node *node;
1126 	struct inode *inode;
1127 
1128 	spin_lock(&inode_hash_lock);
1129 	hlist_for_each_entry(inode, node, b, i_hash) {
1130 		if (inode->i_ino == ino && inode->i_sb == sb) {
1131 			spin_unlock(&inode_hash_lock);
1132 			return 0;
1133 		}
1134 	}
1135 	spin_unlock(&inode_hash_lock);
1136 
1137 	return 1;
1138 }
1139 
1140 /**
1141  *	iunique - get a unique inode number
1142  *	@sb: superblock
1143  *	@max_reserved: highest reserved inode number
1144  *
1145  *	Obtain an inode number that is unique on the system for a given
1146  *	superblock. This is used by file systems that have no natural
1147  *	permanent inode numbering system. An inode number is returned that
1148  *	is higher than the reserved limit but unique.
1149  *
1150  *	BUGS:
1151  *	With a large number of inodes live on the file system this function
1152  *	currently becomes quite slow.
1153  */
1154 ino_t iunique(struct super_block *sb, ino_t max_reserved)
1155 {
1156 	/*
1157 	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
1158 	 * error if st_ino won't fit in target struct field. Use 32bit counter
1159 	 * here to attempt to avoid that.
1160 	 */
1161 	static DEFINE_SPINLOCK(iunique_lock);
1162 	static unsigned int counter;
1163 	ino_t res;
1164 
1165 	spin_lock(&iunique_lock);
1166 	do {
1167 		if (counter <= max_reserved)
1168 			counter = max_reserved + 1;
1169 		res = counter++;
1170 	} while (!test_inode_iunique(sb, res));
1171 	spin_unlock(&iunique_lock);
1172 
1173 	return res;
1174 }
1175 EXPORT_SYMBOL(iunique);
1176 
1177 struct inode *igrab(struct inode *inode)
1178 {
1179 	spin_lock(&inode->i_lock);
1180 	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1181 		__iget(inode);
1182 		spin_unlock(&inode->i_lock);
1183 	} else {
1184 		spin_unlock(&inode->i_lock);
1185 		/*
1186 		 * Handle the case where s_op->clear_inode has not been
1187 		 * called yet, and somebody is calling igrab
1188 		 * while the inode is getting freed.
1189 		 */
1190 		inode = NULL;
1191 	}
1192 	return inode;
1193 }
1194 EXPORT_SYMBOL(igrab);
1195 
1196 /**
1197  * ilookup5_nowait - search for an inode in the inode cache
1198  * @sb:		super block of file system to search
1199  * @hashval:	hash value (usually inode number) to search for
1200  * @test:	callback used for comparisons between inodes
1201  * @data:	opaque data pointer to pass to @test
1202  *
1203  * Search for the inode specified by @hashval and @data in the inode cache.
1204  * If the inode is in the cache, the inode is returned with an incremented
1205  * reference count.
1206  *
1207  * Note: I_NEW is not waited upon so you have to be very careful what you do
1208  * with the returned inode.  You probably should be using ilookup5() instead.
1209  *
1210  * Note2: @test is called with the inode_hash_lock held, so can't sleep.
1211  */
1212 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1213 		int (*test)(struct inode *, void *), void *data)
1214 {
1215 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1216 	struct inode *inode;
1217 
1218 	spin_lock(&inode_hash_lock);
1219 	inode = find_inode(sb, head, test, data);
1220 	spin_unlock(&inode_hash_lock);
1221 
1222 	return inode;
1223 }
1224 EXPORT_SYMBOL(ilookup5_nowait);
1225 
1226 /**
1227  * ilookup5 - search for an inode in the inode cache
1228  * @sb:		super block of file system to search
1229  * @hashval:	hash value (usually inode number) to search for
1230  * @test:	callback used for comparisons between inodes
1231  * @data:	opaque data pointer to pass to @test
1232  *
1233  * Search for the inode specified by @hashval and @data in the inode cache,
1234  * and if the inode is in the cache, return the inode with an incremented
1235  * reference count.  Waits on I_NEW before returning the inode.
1237  *
1238  * This is a generalized version of ilookup() for file systems where the
1239  * inode number is not sufficient for unique identification of an inode.
1240  *
1241  * Note: @test is called with the inode_hash_lock held, so can't sleep.
1242  */
1243 struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1244 		int (*test)(struct inode *, void *), void *data)
1245 {
1246 	struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
1247 
1248 	if (inode)
1249 		wait_on_inode(inode);
1250 	return inode;
1251 }
1252 EXPORT_SYMBOL(ilookup5);
1253 
1254 /**
1255  * ilookup - search for an inode in the inode cache
1256  * @sb:		super block of file system to search
1257  * @ino:	inode number to search for
1258  *
1259  * Search for the inode @ino in the inode cache, and if the inode is in the
1260  * cache, the inode is returned with an incremented reference count.
1261  */
1262 struct inode *ilookup(struct super_block *sb, unsigned long ino)
1263 {
1264 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
1265 	struct inode *inode;
1266 
1267 	spin_lock(&inode_hash_lock);
1268 	inode = find_inode_fast(sb, head, ino);
1269 	spin_unlock(&inode_hash_lock);
1270 
1271 	if (inode)
1272 		wait_on_inode(inode);
1273 	return inode;
1274 }
1275 EXPORT_SYMBOL(ilookup);
1276 
1277 int insert_inode_locked(struct inode *inode)
1278 {
1279 	struct super_block *sb = inode->i_sb;
1280 	ino_t ino = inode->i_ino;
1281 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
1282 
1283 	while (1) {
1284 		struct hlist_node *node;
1285 		struct inode *old = NULL;
1286 		spin_lock(&inode_hash_lock);
1287 		hlist_for_each_entry(old, node, head, i_hash) {
1288 			if (old->i_ino != ino)
1289 				continue;
1290 			if (old->i_sb != sb)
1291 				continue;
1292 			spin_lock(&old->i_lock);
1293 			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1294 				spin_unlock(&old->i_lock);
1295 				continue;
1296 			}
1297 			break;
1298 		}
1299 		if (likely(!node)) {
1300 			spin_lock(&inode->i_lock);
1301 			inode->i_state |= I_NEW;
1302 			hlist_add_head(&inode->i_hash, head);
1303 			spin_unlock(&inode->i_lock);
1304 			spin_unlock(&inode_hash_lock);
1305 			return 0;
1306 		}
1307 		__iget(old);
1308 		spin_unlock(&old->i_lock);
1309 		spin_unlock(&inode_hash_lock);
1310 		wait_on_inode(old);
1311 		if (unlikely(!inode_unhashed(old))) {
1312 			iput(old);
1313 			return -EBUSY;
1314 		}
1315 		iput(old);
1316 	}
1317 }
1318 EXPORT_SYMBOL(insert_inode_locked);
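
/*
 * Example (hedged sketch): during inode creation a filesystem can hash the
 * new inode with insert_inode_locked() before it becomes visible, so a
 * concurrent iget_locked() on the same number waits rather than racing.
 * The inode number source is hypothetical:
 *
 *	inode->i_ino = myfs_alloc_ino(sb);
 *	err = insert_inode_locked(inode);
 *	if (err)
 *		goto fail_put;		(another inode already uses this number)
 *	... initialise and write the inode, then ...
 *	unlock_new_inode(inode);
 */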
1319 
1320 int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1321 		int (*test)(struct inode *, void *), void *data)
1322 {
1323 	struct super_block *sb = inode->i_sb;
1324 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1325 
1326 	while (1) {
1327 		struct hlist_node *node;
1328 		struct inode *old = NULL;
1329 
1330 		spin_lock(&inode_hash_lock);
1331 		hlist_for_each_entry(old, node, head, i_hash) {
1332 			if (old->i_sb != sb)
1333 				continue;
1334 			if (!test(old, data))
1335 				continue;
1336 			spin_lock(&old->i_lock);
1337 			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1338 				spin_unlock(&old->i_lock);
1339 				continue;
1340 			}
1341 			break;
1342 		}
1343 		if (likely(!node)) {
1344 			spin_lock(&inode->i_lock);
1345 			inode->i_state |= I_NEW;
1346 			hlist_add_head(&inode->i_hash, head);
1347 			spin_unlock(&inode->i_lock);
1348 			spin_unlock(&inode_hash_lock);
1349 			return 0;
1350 		}
1351 		__iget(old);
1352 		spin_unlock(&old->i_lock);
1353 		spin_unlock(&inode_hash_lock);
1354 		wait_on_inode(old);
1355 		if (unlikely(!inode_unhashed(old))) {
1356 			iput(old);
1357 			return -EBUSY;
1358 		}
1359 		iput(old);
1360 	}
1361 }
1362 EXPORT_SYMBOL(insert_inode_locked4);
1363 
1364 
1365 int generic_delete_inode(struct inode *inode)
1366 {
1367 	return 1;
1368 }
1369 EXPORT_SYMBOL(generic_delete_inode);
1370 
1371 /*
1372  * Normal UNIX filesystem behaviour: delete the
1373  * inode when the usage count drops to zero, and
1374  * i_nlink is zero.
1375  */
1376 int generic_drop_inode(struct inode *inode)
1377 {
1378 	return !inode->i_nlink || inode_unhashed(inode);
1379 }
1380 EXPORT_SYMBOL_GPL(generic_drop_inode);
1381 
1382 /*
1383  * Called when we're dropping the last reference
1384  * to an inode.
1385  *
1386  * Call the FS "drop_inode()" function, defaulting to
1387  * the legacy UNIX filesystem behaviour.  If it tells
1388  * us to evict inode, do so.  Otherwise, retain inode
1389  * in cache if fs is alive, sync and evict if fs is
1390  * shutting down.
1391  */
1392 static void iput_final(struct inode *inode)
1393 {
1394 	struct super_block *sb = inode->i_sb;
1395 	const struct super_operations *op = inode->i_sb->s_op;
1396 	int drop;
1397 
1398 	WARN_ON(inode->i_state & I_NEW);
1399 
1400 	if (op->drop_inode)
1401 		drop = op->drop_inode(inode);
1402 	else
1403 		drop = generic_drop_inode(inode);
1404 
1405 	if (!drop && (sb->s_flags & MS_ACTIVE)) {
1406 		inode->i_state |= I_REFERENCED;
1407 		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1408 			inode_lru_list_add(inode);
1409 		spin_unlock(&inode->i_lock);
1410 		return;
1411 	}
1412 
1413 	if (!drop) {
1414 		inode->i_state |= I_WILL_FREE;
1415 		spin_unlock(&inode->i_lock);
1416 		write_inode_now(inode, 1);
1417 		spin_lock(&inode->i_lock);
1418 		WARN_ON(inode->i_state & I_NEW);
1419 		inode->i_state &= ~I_WILL_FREE;
1420 	}
1421 
1422 	inode->i_state |= I_FREEING;
1423 	if (!list_empty(&inode->i_lru))
1424 		inode_lru_list_del(inode);
1425 	spin_unlock(&inode->i_lock);
1426 
1427 	evict(inode);
1428 }
1429 
1430 /**
1431  *	iput	- put an inode
1432  *	@inode: inode to put
1433  *
1434  *	Puts an inode, dropping its usage count. If the inode use count hits
1435  *	zero, the inode is then freed and may also be destroyed.
1436  *
1437  *	Consequently, iput() can sleep.
1438  */
1439 void iput(struct inode *inode)
1440 {
1441 	if (inode) {
1442 		BUG_ON(inode->i_state & I_CLEAR);
1443 
1444 		if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1445 			iput_final(inode);
1446 	}
1447 }
1448 EXPORT_SYMBOL(iput);
1449 
1450 /**
1451  *	bmap	- find a block number in a file
1452  *	@inode: inode of file
1453  *	@block: block to find
1454  *
1455  *	Returns the block number on the device holding the inode that
1456  *	is the disk block number for the block of the file requested.
1457  *	That is, if asked for block 4 of inode 1, the function will return the
1458  *	disk block relative to the disk start that holds that block of the
1459  *	file.
1460  */
1461 sector_t bmap(struct inode *inode, sector_t block)
1462 {
1463 	sector_t res = 0;
1464 	if (inode->i_mapping->a_ops->bmap)
1465 		res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
1466 	return res;
1467 }
1468 EXPORT_SYMBOL(bmap);
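
/*
 * Example (illustrative only): mapping logical block 4 of an open file to a
 * block number on the underlying device; a result of 0 means a hole or a
 * filesystem without ->bmap support:
 *
 *	sector_t phys = bmap(filp->f_path.dentry->d_inode, 4);
 */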
1469 
1470 /*
1471  * With relative atime, only update atime if the previous atime is
1472  * earlier than either the ctime or mtime or if at least a day has
1473  * passed since the last atime update.
1474  */
1475 static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1476 			     struct timespec now)
1477 {
1478 
1479 	if (!(mnt->mnt_flags & MNT_RELATIME))
1480 		return 1;
1481 	/*
1482 	 * Is mtime younger than atime? If yes, update atime:
1483 	 */
1484 	if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
1485 		return 1;
1486 	/*
1487 	 * Is ctime younger than atime? If yes, update atime:
1488 	 */
1489 	if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
1490 		return 1;
1491 
1492 	/*
1493 	 * Is the previous atime value older than a day? If yes,
1494 	 * update atime:
1495 	 */
1496 	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
1497 		return 1;
1498 	/*
1499 	 * Good, we can skip the atime update:
1500 	 */
1501 	return 0;
1502 }
1503 
1504 /**
1505  *	touch_atime	-	update the access time
1506  *	@mnt: mount the inode is accessed on
1507  *	@dentry: dentry accessed
1508  *
1509  *	Update the accessed time on an inode and mark it for writeback.
1510  *	This function automatically handles read only file systems and media,
1511  *	as well as the "noatime" flag and inode specific "noatime" markers.
1512  */
1513 void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1514 {
1515 	struct inode *inode = dentry->d_inode;
1516 	struct timespec now;
1517 
1518 	if (inode->i_flags & S_NOATIME)
1519 		return;
1520 	if (IS_NOATIME(inode))
1521 		return;
1522 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1523 		return;
1524 
1525 	if (mnt->mnt_flags & MNT_NOATIME)
1526 		return;
1527 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1528 		return;
1529 
1530 	now = current_fs_time(inode->i_sb);
1531 
1532 	if (!relatime_need_update(mnt, inode, now))
1533 		return;
1534 
1535 	if (timespec_equal(&inode->i_atime, &now))
1536 		return;
1537 
1538 	if (mnt_want_write(mnt))
1539 		return;
1540 
1541 	inode->i_atime = now;
1542 	mark_inode_dirty_sync(inode);
1543 	mnt_drop_write(mnt);
1544 }
1545 EXPORT_SYMBOL(touch_atime);
1546 
1547 /**
1548  *	file_update_time	-	update mtime and ctime time
1549  *	@file: file accessed
1550  *
1551  *	Update the mtime and ctime members of an inode and mark the inode
1552  *	for writeback.  Note that this function is meant exclusively for
1553  *	usage in the file write path of filesystems, and filesystems may
1554  *	choose to explicitly ignore updates via this function with the
1555  *	S_NOCMTIME inode flag, e.g. for network filesystems where these
1556  *	timestamps are handled by the server.
1557  */
1558 
1559 void file_update_time(struct file *file)
1560 {
1561 	struct inode *inode = file->f_path.dentry->d_inode;
1562 	struct timespec now;
1563 	enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
1564 
1565 	/* First try to exhaust all avenues to not sync */
1566 	if (IS_NOCMTIME(inode))
1567 		return;
1568 
1569 	now = current_fs_time(inode->i_sb);
1570 	if (!timespec_equal(&inode->i_mtime, &now))
1571 		sync_it = S_MTIME;
1572 
1573 	if (!timespec_equal(&inode->i_ctime, &now))
1574 		sync_it |= S_CTIME;
1575 
1576 	if (IS_I_VERSION(inode))
1577 		sync_it |= S_VERSION;
1578 
1579 	if (!sync_it)
1580 		return;
1581 
1582 	/* Finally allowed to write? Takes lock. */
1583 	if (mnt_want_write_file(file))
1584 		return;
1585 
1586 	/* Only change inode inside the lock region */
1587 	if (sync_it & S_VERSION)
1588 		inode_inc_iversion(inode);
1589 	if (sync_it & S_CTIME)
1590 		inode->i_ctime = now;
1591 	if (sync_it & S_MTIME)
1592 		inode->i_mtime = now;
1593 	mark_inode_dirty_sync(inode);
1594 	mnt_drop_write_file(file);
1595 }
1596 EXPORT_SYMBOL(file_update_time);
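
/*
 * Example (hedged sketch): a filesystem's ->page_mkwrite() typically calls
 * file_update_time() before dirtying the page so that mtime/ctime reflect
 * the modification:
 *
 *	static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 *	{
 *		file_update_time(vma->vm_file);
 *		... lock the page, mark it dirty, etc. ...
 *	}
 */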
1597 
1598 int inode_needs_sync(struct inode *inode)
1599 {
1600 	if (IS_SYNC(inode))
1601 		return 1;
1602 	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
1603 		return 1;
1604 	return 0;
1605 }
1606 EXPORT_SYMBOL(inode_needs_sync);
1607 
1608 int inode_wait(void *word)
1609 {
1610 	schedule();
1611 	return 0;
1612 }
1613 EXPORT_SYMBOL(inode_wait);
1614 
1615 /*
1616  * If we try to find an inode in the inode hash while it is being
1617  * deleted, we have to wait until the filesystem completes its
1618  * deletion before reporting that it isn't found.  This function waits
1619  * until the deletion _might_ have completed.  Callers are responsible
1620  * to recheck inode state.
1621  *
1622  * It doesn't matter if I_NEW is not set initially, a call to
1623  * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
1624  * will DTRT.
1625  */
1626 static void __wait_on_freeing_inode(struct inode *inode)
1627 {
1628 	wait_queue_head_t *wq;
1629 	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1630 	wq = bit_waitqueue(&inode->i_state, __I_NEW);
1631 	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1632 	spin_unlock(&inode->i_lock);
1633 	spin_unlock(&inode_hash_lock);
1634 	schedule();
1635 	finish_wait(wq, &wait.wait);
1636 	spin_lock(&inode_hash_lock);
1637 }
1638 
1639 static __initdata unsigned long ihash_entries;
1640 static int __init set_ihash_entries(char *str)
1641 {
1642 	if (!str)
1643 		return 0;
1644 	ihash_entries = simple_strtoul(str, &str, 0);
1645 	return 1;
1646 }
1647 __setup("ihash_entries=", set_ihash_entries);
1648 
1649 /*
1650  * Initialize the waitqueues and inode hash table.
1651  */
1652 void __init inode_init_early(void)
1653 {
1654 	int loop;
1655 
1656 	/* If hashes are distributed across NUMA nodes, defer
1657 	 * hash allocation until vmalloc space is available.
1658 	 */
1659 	if (hashdist)
1660 		return;
1661 
1662 	inode_hashtable =
1663 		alloc_large_system_hash("Inode-cache",
1664 					sizeof(struct hlist_head),
1665 					ihash_entries,
1666 					14,
1667 					HASH_EARLY,
1668 					&i_hash_shift,
1669 					&i_hash_mask,
1670 					0);
1671 
1672 	for (loop = 0; loop < (1 << i_hash_shift); loop++)
1673 		INIT_HLIST_HEAD(&inode_hashtable[loop]);
1674 }
1675 
1676 void __init inode_init(void)
1677 {
1678 	int loop;
1679 
1680 	/* inode slab cache */
1681 	inode_cachep = kmem_cache_create("inode_cache",
1682 					 sizeof(struct inode),
1683 					 0,
1684 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
1685 					 SLAB_MEM_SPREAD),
1686 					 init_once);
1687 
1688 	/* Hash may have been set up in inode_init_early */
1689 	if (!hashdist)
1690 		return;
1691 
1692 	inode_hashtable =
1693 		alloc_large_system_hash("Inode-cache",
1694 					sizeof(struct hlist_head),
1695 					ihash_entries,
1696 					14,
1697 					0,
1698 					&i_hash_shift,
1699 					&i_hash_mask,
1700 					0);
1701 
1702 	for (loop = 0; loop < (1 << i_hash_shift); loop++)
1703 		INIT_HLIST_HEAD(&inode_hashtable[loop]);
1704 }
1705 
1706 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1707 {
1708 	inode->i_mode = mode;
1709 	if (S_ISCHR(mode)) {
1710 		inode->i_fop = &def_chr_fops;
1711 		inode->i_rdev = rdev;
1712 	} else if (S_ISBLK(mode)) {
1713 		inode->i_fop = &def_blk_fops;
1714 		inode->i_rdev = rdev;
1715 	} else if (S_ISFIFO(mode))
1716 		inode->i_fop = &def_fifo_fops;
1717 	else if (S_ISSOCK(mode))
1718 		inode->i_fop = &bad_sock_fops;
1719 	else
1720 		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
1721 				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
1722 				  inode->i_ino);
1723 }
1724 EXPORT_SYMBOL(init_special_inode);
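
/*
 * Example (hedged sketch): a filesystem's ->mknod() usually funnels device,
 * FIFO and socket inodes through this helper; "myfs_new_inode" is
 * hypothetical:
 *
 *	static int myfs_mknod(struct inode *dir, struct dentry *dentry,
 *			      umode_t mode, dev_t rdev)
 *	{
 *		struct inode *inode = myfs_new_inode(dir, mode);
 *
 *		if (!inode)
 *			return -ENOSPC;
 *		init_special_inode(inode, inode->i_mode, rdev);
 *		mark_inode_dirty(inode);
 *		d_instantiate(dentry, inode);
 *		return 0;
 *	}
 */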
1725 
1726 /**
1727  * inode_init_owner - Init uid, gid and mode for a new inode according to POSIX standards
1728  * @inode: New inode
1729  * @dir: Directory inode
1730  * @mode: mode of the new inode
1731  */
1732 void inode_init_owner(struct inode *inode, const struct inode *dir,
1733 			umode_t mode)
1734 {
1735 	inode->i_uid = current_fsuid();
1736 	if (dir && dir->i_mode & S_ISGID) {
1737 		inode->i_gid = dir->i_gid;
1738 		if (S_ISDIR(mode))
1739 			mode |= S_ISGID;
1740 	} else
1741 		inode->i_gid = current_fsgid();
1742 	inode->i_mode = mode;
1743 }
1744 EXPORT_SYMBOL(inode_init_owner);
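
/*
 * Example (illustrative only): a create path applies the POSIX ownership
 * rules, including gid inheritance from setgid directories, simply by
 * calling the helper right after allocating the inode:
 *
 *	inode = new_inode(sb);
 *	if (inode)
 *		inode_init_owner(inode, dir, S_IFREG | 0644);
 */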
1745 
1746 /**
1747  * inode_owner_or_capable - check current task permissions to inode
1748  * @inode: inode being checked
1749  *
1750  * Return true if current either has CAP_FOWNER in the inode's user
1751  * namespace, or owns the file.
1752  */
1753 bool inode_owner_or_capable(const struct inode *inode)
1754 {
1755 	struct user_namespace *ns = inode_userns(inode);
1756 
1757 	if (current_user_ns() == ns && current_fsuid() == inode->i_uid)
1758 		return true;
1759 	if (ns_capable(ns, CAP_FOWNER))
1760 		return true;
1761 	return false;
1762 }
1763 EXPORT_SYMBOL(inode_owner_or_capable);
1764