xref: /linux/fs/inode.c (revision f2ff7147c6834f244b8ce636b12e71a3bd044629)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * (C) 1997 Linus Torvalds
4   * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
5   */
6  #include <linux/export.h>
7  #include <linux/fs.h>
8  #include <linux/mm.h>
9  #include <linux/backing-dev.h>
10  #include <linux/hash.h>
11  #include <linux/swap.h>
12  #include <linux/security.h>
13  #include <linux/cdev.h>
14  #include <linux/memblock.h>
15  #include <linux/fsnotify.h>
16  #include <linux/mount.h>
17  #include <linux/posix_acl.h>
18  #include <linux/prefetch.h>
19  #include <linux/buffer_head.h> /* for inode_has_buffers */
20  #include <linux/ratelimit.h>
21  #include <linux/list_lru.h>
22  #include <linux/iversion.h>
23  #include <trace/events/writeback.h>
24  #include "internal.h"
25  
26  /*
27   * Inode locking rules:
28   *
29   * inode->i_lock protects:
30   *   inode->i_state, inode->i_hash, __iget()
31   * Inode LRU list locks protect:
32   *   inode->i_sb->s_inode_lru, inode->i_lru
33   * inode->i_sb->s_inode_list_lock protects:
34   *   inode->i_sb->s_inodes, inode->i_sb_list
35   * bdi->wb.list_lock protects:
36   *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
37   * inode_hash_lock protects:
38   *   inode_hashtable, inode->i_hash
39   *
40   * Lock ordering:
41   *
42   * inode->i_sb->s_inode_list_lock
43   *   inode->i_lock
44   *     Inode LRU list locks
45   *
46   * bdi->wb.list_lock
47   *   inode->i_lock
48   *
49   * inode_hash_lock
50   *   inode->i_sb->s_inode_list_lock
51   *   inode->i_lock
52   *
53   * iunique_lock
54   *   inode_hash_lock
55   */
56  
57  static unsigned int i_hash_mask __read_mostly;
58  static unsigned int i_hash_shift __read_mostly;
59  static struct hlist_head *inode_hashtable __read_mostly;
60  static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
61  
62  /*
63   * Empty aops. Can be used for the cases where the user does not
64   * define any of the address_space operations.
65   */
66  const struct address_space_operations empty_aops = {
67  };
68  EXPORT_SYMBOL(empty_aops);
69  
70  /*
71   * Statistics gathering..
72   */
73  struct inodes_stat_t inodes_stat;
74  
75  static DEFINE_PER_CPU(unsigned long, nr_inodes);
76  static DEFINE_PER_CPU(unsigned long, nr_unused);
77  
78  static struct kmem_cache *inode_cachep __read_mostly;
79  
80  static long get_nr_inodes(void)
81  {
82  	int i;
83  	long sum = 0;
84  	for_each_possible_cpu(i)
85  		sum += per_cpu(nr_inodes, i);
86  	return sum < 0 ? 0 : sum;
87  }
88  
89  static inline long get_nr_inodes_unused(void)
90  {
91  	int i;
92  	long sum = 0;
93  	for_each_possible_cpu(i)
94  		sum += per_cpu(nr_unused, i);
95  	return sum < 0 ? 0 : sum;
96  }
97  
98  long get_nr_dirty_inodes(void)
99  {
100  	/* not actually dirty inodes, but a wild approximation */
101  	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
102  	return nr_dirty > 0 ? nr_dirty : 0;
103  }
104  
105  /*
106   * Handle nr_inode sysctl
107   */
108  #ifdef CONFIG_SYSCTL
109  int proc_nr_inodes(struct ctl_table *table, int write,
110  		   void *buffer, size_t *lenp, loff_t *ppos)
111  {
112  	inodes_stat.nr_inodes = get_nr_inodes();
113  	inodes_stat.nr_unused = get_nr_inodes_unused();
114  	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
115  }
116  #endif
117  
118  static int no_open(struct inode *inode, struct file *file)
119  {
120  	return -ENXIO;
121  }
122  
123  /**
124   * inode_init_always - perform inode structure initialisation
125   * @sb: superblock inode belongs to
126   * @inode: inode to initialise
127   *
128   * These are initializations that need to be done on every inode
129   * allocation as the fields are not initialised by slab allocation.
130   */
131  int inode_init_always(struct super_block *sb, struct inode *inode)
132  {
133  	static const struct inode_operations empty_iops;
134  	static const struct file_operations no_open_fops = {.open = no_open};
135  	struct address_space *const mapping = &inode->i_data;
136  
137  	inode->i_sb = sb;
138  	inode->i_blkbits = sb->s_blocksize_bits;
139  	inode->i_flags = 0;
140  	atomic64_set(&inode->i_sequence, 0);
141  	atomic_set(&inode->i_count, 1);
142  	inode->i_op = &empty_iops;
143  	inode->i_fop = &no_open_fops;
144  	inode->i_ino = 0;
145  	inode->__i_nlink = 1;
146  	inode->i_opflags = 0;
147  	if (sb->s_xattr)
148  		inode->i_opflags |= IOP_XATTR;
149  	i_uid_write(inode, 0);
150  	i_gid_write(inode, 0);
151  	atomic_set(&inode->i_writecount, 0);
152  	inode->i_size = 0;
153  	inode->i_write_hint = WRITE_LIFE_NOT_SET;
154  	inode->i_blocks = 0;
155  	inode->i_bytes = 0;
156  	inode->i_generation = 0;
157  	inode->i_pipe = NULL;
158  	inode->i_cdev = NULL;
159  	inode->i_link = NULL;
160  	inode->i_dir_seq = 0;
161  	inode->i_rdev = 0;
162  	inode->dirtied_when = 0;
163  
164  #ifdef CONFIG_CGROUP_WRITEBACK
165  	inode->i_wb_frn_winner = 0;
166  	inode->i_wb_frn_avg_time = 0;
167  	inode->i_wb_frn_history = 0;
168  #endif
169  
170  	if (security_inode_alloc(inode))
171  		goto out;
172  	spin_lock_init(&inode->i_lock);
173  	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
174  
175  	init_rwsem(&inode->i_rwsem);
176  	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
177  
178  	atomic_set(&inode->i_dio_count, 0);
179  
180  	mapping->a_ops = &empty_aops;
181  	mapping->host = inode;
182  	mapping->flags = 0;
183  	if (sb->s_type->fs_flags & FS_THP_SUPPORT)
184  		__set_bit(AS_THP_SUPPORT, &mapping->flags);
185  	mapping->wb_err = 0;
186  	atomic_set(&mapping->i_mmap_writable, 0);
187  #ifdef CONFIG_READ_ONLY_THP_FOR_FS
188  	atomic_set(&mapping->nr_thps, 0);
189  #endif
190  	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
191  	mapping->private_data = NULL;
192  	mapping->writeback_index = 0;
193  	__init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock",
194  		     &sb->s_type->invalidate_lock_key);
195  	inode->i_private = NULL;
196  	inode->i_mapping = mapping;
197  	INIT_HLIST_HEAD(&inode->i_dentry);	/* buggered by rcu freeing */
198  #ifdef CONFIG_FS_POSIX_ACL
199  	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
200  #endif
201  
202  #ifdef CONFIG_FSNOTIFY
203  	inode->i_fsnotify_mask = 0;
204  #endif
205  	inode->i_flctx = NULL;
206  	this_cpu_inc(nr_inodes);
207  
208  	return 0;
209  out:
210  	return -ENOMEM;
211  }
212  EXPORT_SYMBOL(inode_init_always);
213  
214  void free_inode_nonrcu(struct inode *inode)
215  {
216  	kmem_cache_free(inode_cachep, inode);
217  }
218  EXPORT_SYMBOL(free_inode_nonrcu);
219  
220  static void i_callback(struct rcu_head *head)
221  {
222  	struct inode *inode = container_of(head, struct inode, i_rcu);
223  	if (inode->free_inode)
224  		inode->free_inode(inode);
225  	else
226  		free_inode_nonrcu(inode);
227  }
228  
229  static struct inode *alloc_inode(struct super_block *sb)
230  {
231  	const struct super_operations *ops = sb->s_op;
232  	struct inode *inode;
233  
234  	if (ops->alloc_inode)
235  		inode = ops->alloc_inode(sb);
236  	else
237  		inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
238  
239  	if (!inode)
240  		return NULL;
241  
242  	if (unlikely(inode_init_always(sb, inode))) {
243  		if (ops->destroy_inode) {
244  			ops->destroy_inode(inode);
245  			if (!ops->free_inode)
246  				return NULL;
247  		}
248  		inode->free_inode = ops->free_inode;
249  		i_callback(&inode->i_rcu);
250  		return NULL;
251  	}
252  
253  	return inode;
254  }
255  
256  void __destroy_inode(struct inode *inode)
257  {
258  	BUG_ON(inode_has_buffers(inode));
259  	inode_detach_wb(inode);
260  	security_inode_free(inode);
261  	fsnotify_inode_delete(inode);
262  	locks_free_lock_context(inode);
263  	if (!inode->i_nlink) {
264  		WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
265  		atomic_long_dec(&inode->i_sb->s_remove_count);
266  	}
267  
268  #ifdef CONFIG_FS_POSIX_ACL
269  	if (inode->i_acl && !is_uncached_acl(inode->i_acl))
270  		posix_acl_release(inode->i_acl);
271  	if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
272  		posix_acl_release(inode->i_default_acl);
273  #endif
274  	this_cpu_dec(nr_inodes);
275  }
276  EXPORT_SYMBOL(__destroy_inode);
277  
278  static void destroy_inode(struct inode *inode)
279  {
280  	const struct super_operations *ops = inode->i_sb->s_op;
281  
282  	BUG_ON(!list_empty(&inode->i_lru));
283  	__destroy_inode(inode);
284  	if (ops->destroy_inode) {
285  		ops->destroy_inode(inode);
286  		if (!ops->free_inode)
287  			return;
288  	}
289  	inode->free_inode = ops->free_inode;
290  	call_rcu(&inode->i_rcu, i_callback);
291  }
292  
293  /**
294   * drop_nlink - directly drop an inode's link count
295   * @inode: inode
296   *
297   * This is a low-level filesystem helper to replace any
298   * direct filesystem manipulation of i_nlink.  In cases
299   * where we are attempting to track writes to the
300   * filesystem, a decrement to zero means an imminent
301   * write when the file is truncated and actually unlinked
302   * on the filesystem.
303   */
304  void drop_nlink(struct inode *inode)
305  {
306  	WARN_ON(inode->i_nlink == 0);
307  	inode->__i_nlink--;
308  	if (!inode->i_nlink)
309  		atomic_long_inc(&inode->i_sb->s_remove_count);
310  }
311  EXPORT_SYMBOL(drop_nlink);
312  
313  /**
314   * clear_nlink - directly zero an inode's link count
315   * @inode: inode
316   *
317   * This is a low-level filesystem helper to replace any
318   * direct filesystem manipulation of i_nlink.  See
319   * drop_nlink() for why we care about i_nlink hitting zero.
320   */
321  void clear_nlink(struct inode *inode)
322  {
323  	if (inode->i_nlink) {
324  		inode->__i_nlink = 0;
325  		atomic_long_inc(&inode->i_sb->s_remove_count);
326  	}
327  }
328  EXPORT_SYMBOL(clear_nlink);
329  
330  /**
331   * set_nlink - directly set an inode's link count
332   * @inode: inode
333   * @nlink: new nlink (should be non-zero)
334   *
335   * This is a low-level filesystem helper to replace any
336   * direct filesystem manipulation of i_nlink.
337   */
338  void set_nlink(struct inode *inode, unsigned int nlink)
339  {
340  	if (!nlink) {
341  		clear_nlink(inode);
342  	} else {
343  		/* Yes, some filesystems do change nlink from zero to one */
344  		if (inode->i_nlink == 0)
345  			atomic_long_dec(&inode->i_sb->s_remove_count);
346  
347  		inode->__i_nlink = nlink;
348  	}
349  }
350  EXPORT_SYMBOL(set_nlink);
351  
352  /**
353   * inc_nlink - directly increment an inode's link count
354   * @inode: inode
355   *
356   * This is a low-level filesystem helper to replace any
357   * direct filesystem manipulation of i_nlink.  Currently,
358   * it is only here for parity with dec_nlink().
359   */
360  void inc_nlink(struct inode *inode)
361  {
362  	if (unlikely(inode->i_nlink == 0)) {
363  		WARN_ON(!(inode->i_state & I_LINKABLE));
364  		atomic_long_dec(&inode->i_sb->s_remove_count);
365  	}
366  
367  	inode->__i_nlink++;
368  }
369  EXPORT_SYMBOL(inc_nlink);
370  
371  static void __address_space_init_once(struct address_space *mapping)
372  {
373  	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
374  	init_rwsem(&mapping->i_mmap_rwsem);
375  	INIT_LIST_HEAD(&mapping->private_list);
376  	spin_lock_init(&mapping->private_lock);
377  	mapping->i_mmap = RB_ROOT_CACHED;
378  }
379  
380  void address_space_init_once(struct address_space *mapping)
381  {
382  	memset(mapping, 0, sizeof(*mapping));
383  	__address_space_init_once(mapping);
384  }
385  EXPORT_SYMBOL(address_space_init_once);
386  
387  /*
388   * These are initializations that only need to be done
389   * once, because the fields are idempotent across use
390   * of the inode, so let the slab aware of that.
391   */
392  void inode_init_once(struct inode *inode)
393  {
394  	memset(inode, 0, sizeof(*inode));
395  	INIT_HLIST_NODE(&inode->i_hash);
396  	INIT_LIST_HEAD(&inode->i_devices);
397  	INIT_LIST_HEAD(&inode->i_io_list);
398  	INIT_LIST_HEAD(&inode->i_wb_list);
399  	INIT_LIST_HEAD(&inode->i_lru);
400  	__address_space_init_once(&inode->i_data);
401  	i_size_ordered_init(inode);
402  }
403  EXPORT_SYMBOL(inode_init_once);
404  
405  static void init_once(void *foo)
406  {
407  	struct inode *inode = (struct inode *) foo;
408  
409  	inode_init_once(inode);
410  }
411  
412  /*
413   * inode->i_lock must be held
414   */
415  void __iget(struct inode *inode)
416  {
417  	atomic_inc(&inode->i_count);
418  }
419  
420  /*
421   * get additional reference to inode; caller must already hold one.
422   */
423  void ihold(struct inode *inode)
424  {
425  	WARN_ON(atomic_inc_return(&inode->i_count) < 2);
426  }
427  EXPORT_SYMBOL(ihold);
428  
429  static void inode_lru_list_add(struct inode *inode)
430  {
431  	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
432  		this_cpu_inc(nr_unused);
433  	else
434  		inode->i_state |= I_REFERENCED;
435  }
436  
437  /*
438   * Add inode to LRU if needed (inode is unused and clean).
439   *
440   * Needs inode->i_lock held.
441   */
442  void inode_add_lru(struct inode *inode)
443  {
444  	if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
445  				I_FREEING | I_WILL_FREE)) &&
446  	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE)
447  		inode_lru_list_add(inode);
448  }
449  
450  
451  static void inode_lru_list_del(struct inode *inode)
452  {
453  
454  	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
455  		this_cpu_dec(nr_unused);
456  }
457  
458  /**
459   * inode_sb_list_add - add inode to the superblock list of inodes
460   * @inode: inode to add
461   */
462  void inode_sb_list_add(struct inode *inode)
463  {
464  	spin_lock(&inode->i_sb->s_inode_list_lock);
465  	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
466  	spin_unlock(&inode->i_sb->s_inode_list_lock);
467  }
468  EXPORT_SYMBOL_GPL(inode_sb_list_add);
469  
470  static inline void inode_sb_list_del(struct inode *inode)
471  {
472  	if (!list_empty(&inode->i_sb_list)) {
473  		spin_lock(&inode->i_sb->s_inode_list_lock);
474  		list_del_init(&inode->i_sb_list);
475  		spin_unlock(&inode->i_sb->s_inode_list_lock);
476  	}
477  }
478  
479  static unsigned long hash(struct super_block *sb, unsigned long hashval)
480  {
481  	unsigned long tmp;
482  
483  	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
484  			L1_CACHE_BYTES;
485  	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
486  	return tmp & i_hash_mask;
487  }
488  
489  /**
490   *	__insert_inode_hash - hash an inode
491   *	@inode: unhashed inode
492   *	@hashval: unsigned long value used to locate this object in the
493   *		inode_hashtable.
494   *
495   *	Add an inode to the inode hash for this superblock.
496   */
497  void __insert_inode_hash(struct inode *inode, unsigned long hashval)
498  {
499  	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
500  
501  	spin_lock(&inode_hash_lock);
502  	spin_lock(&inode->i_lock);
503  	hlist_add_head_rcu(&inode->i_hash, b);
504  	spin_unlock(&inode->i_lock);
505  	spin_unlock(&inode_hash_lock);
506  }
507  EXPORT_SYMBOL(__insert_inode_hash);
508  
509  /**
510   *	__remove_inode_hash - remove an inode from the hash
511   *	@inode: inode to unhash
512   *
513   *	Remove an inode from the superblock.
514   */
515  void __remove_inode_hash(struct inode *inode)
516  {
517  	spin_lock(&inode_hash_lock);
518  	spin_lock(&inode->i_lock);
519  	hlist_del_init_rcu(&inode->i_hash);
520  	spin_unlock(&inode->i_lock);
521  	spin_unlock(&inode_hash_lock);
522  }
523  EXPORT_SYMBOL(__remove_inode_hash);
524  
525  void clear_inode(struct inode *inode)
526  {
527  	/*
528  	 * We have to cycle the i_pages lock here because reclaim can be in the
529  	 * process of removing the last page (in __delete_from_page_cache())
530  	 * and we must not free the mapping under it.
531  	 */
532  	xa_lock_irq(&inode->i_data.i_pages);
533  	BUG_ON(inode->i_data.nrpages);
534  	/*
535  	 * Almost always, mapping_empty(&inode->i_data) here; but there are
536  	 * two known and long-standing ways in which nodes may get left behind
537  	 * (when deep radix-tree node allocation failed partway; or when THP
538  	 * collapse_file() failed). Until those two known cases are cleaned up,
539  	 * or a cleanup function is called here, do not BUG_ON(!mapping_empty),
540  	 * nor even WARN_ON(!mapping_empty).
541  	 */
542  	xa_unlock_irq(&inode->i_data.i_pages);
543  	BUG_ON(!list_empty(&inode->i_data.private_list));
544  	BUG_ON(!(inode->i_state & I_FREEING));
545  	BUG_ON(inode->i_state & I_CLEAR);
546  	BUG_ON(!list_empty(&inode->i_wb_list));
547  	/* don't need i_lock here, no concurrent mods to i_state */
548  	inode->i_state = I_FREEING | I_CLEAR;
549  }
550  EXPORT_SYMBOL(clear_inode);
551  
552  /*
553   * Free the inode passed in, removing it from the lists it is still connected
554   * to. We remove any pages still attached to the inode and wait for any IO that
555   * is still in progress before finally destroying the inode.
556   *
557   * An inode must already be marked I_FREEING so that we avoid the inode being
558   * moved back onto lists if we race with other code that manipulates the lists
559   * (e.g. writeback_single_inode). The caller is responsible for setting this.
560   *
561   * An inode must already be removed from the LRU list before being evicted from
562   * the cache. This should occur atomically with setting the I_FREEING state
563   * flag, so no inodes here should ever be on the LRU when being evicted.
564   */
565  static void evict(struct inode *inode)
566  {
567  	const struct super_operations *op = inode->i_sb->s_op;
568  
569  	BUG_ON(!(inode->i_state & I_FREEING));
570  	BUG_ON(!list_empty(&inode->i_lru));
571  
572  	if (!list_empty(&inode->i_io_list))
573  		inode_io_list_del(inode);
574  
575  	inode_sb_list_del(inode);
576  
577  	/*
578  	 * Wait for flusher thread to be done with the inode so that filesystem
579  	 * does not start destroying it while writeback is still running. Since
580  	 * the inode has I_FREEING set, flusher thread won't start new work on
581  	 * the inode.  We just have to wait for running writeback to finish.
582  	 */
583  	inode_wait_for_writeback(inode);
584  
585  	if (op->evict_inode) {
586  		op->evict_inode(inode);
587  	} else {
588  		truncate_inode_pages_final(&inode->i_data);
589  		clear_inode(inode);
590  	}
591  	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
592  		cd_forget(inode);
593  
594  	remove_inode_hash(inode);
595  
596  	spin_lock(&inode->i_lock);
597  	wake_up_bit(&inode->i_state, __I_NEW);
598  	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
599  	spin_unlock(&inode->i_lock);
600  
601  	destroy_inode(inode);
602  }
603  
604  /*
605   * dispose_list - dispose of the contents of a local list
606   * @head: the head of the list to free
607   *
608   * Dispose-list gets a local list with local inodes in it, so it doesn't
609   * need to worry about list corruption and SMP locks.
610   */
611  static void dispose_list(struct list_head *head)
612  {
613  	while (!list_empty(head)) {
614  		struct inode *inode;
615  
616  		inode = list_first_entry(head, struct inode, i_lru);
617  		list_del_init(&inode->i_lru);
618  
619  		evict(inode);
620  		cond_resched();
621  	}
622  }
623  
624  /**
625   * evict_inodes	- evict all evictable inodes for a superblock
626   * @sb:		superblock to operate on
627   *
628   * Make sure that no inodes with zero refcount are retained.  This is
629   * called by superblock shutdown after having SB_ACTIVE flag removed,
630   * so any inode reaching zero refcount during or after that call will
631   * be immediately evicted.
632   */
633  void evict_inodes(struct super_block *sb)
634  {
635  	struct inode *inode, *next;
636  	LIST_HEAD(dispose);
637  
638  again:
639  	spin_lock(&sb->s_inode_list_lock);
640  	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
641  		if (atomic_read(&inode->i_count))
642  			continue;
643  
644  		spin_lock(&inode->i_lock);
645  		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
646  			spin_unlock(&inode->i_lock);
647  			continue;
648  		}
649  
650  		inode->i_state |= I_FREEING;
651  		inode_lru_list_del(inode);
652  		spin_unlock(&inode->i_lock);
653  		list_add(&inode->i_lru, &dispose);
654  
655  		/*
656  		 * We can have a ton of inodes to evict at unmount time given
657  		 * enough memory, check to see if we need to go to sleep for a
658  		 * bit so we don't livelock.
659  		 */
660  		if (need_resched()) {
661  			spin_unlock(&sb->s_inode_list_lock);
662  			cond_resched();
663  			dispose_list(&dispose);
664  			goto again;
665  		}
666  	}
667  	spin_unlock(&sb->s_inode_list_lock);
668  
669  	dispose_list(&dispose);
670  }
671  EXPORT_SYMBOL_GPL(evict_inodes);
672  
673  /**
674   * invalidate_inodes	- attempt to free all inodes on a superblock
675   * @sb:		superblock to operate on
676   * @kill_dirty: flag to guide handling of dirty inodes
677   *
678   * Attempts to free all inodes for a given superblock.  If there were any
679   * busy inodes return a non-zero value, else zero.
680   * If @kill_dirty is set, discard dirty inodes too, otherwise treat
681   * them as busy.
682   */
683  int invalidate_inodes(struct super_block *sb, bool kill_dirty)
684  {
685  	int busy = 0;
686  	struct inode *inode, *next;
687  	LIST_HEAD(dispose);
688  
689  again:
690  	spin_lock(&sb->s_inode_list_lock);
691  	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
692  		spin_lock(&inode->i_lock);
693  		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
694  			spin_unlock(&inode->i_lock);
695  			continue;
696  		}
697  		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
698  			spin_unlock(&inode->i_lock);
699  			busy = 1;
700  			continue;
701  		}
702  		if (atomic_read(&inode->i_count)) {
703  			spin_unlock(&inode->i_lock);
704  			busy = 1;
705  			continue;
706  		}
707  
708  		inode->i_state |= I_FREEING;
709  		inode_lru_list_del(inode);
710  		spin_unlock(&inode->i_lock);
711  		list_add(&inode->i_lru, &dispose);
712  		if (need_resched()) {
713  			spin_unlock(&sb->s_inode_list_lock);
714  			cond_resched();
715  			dispose_list(&dispose);
716  			goto again;
717  		}
718  	}
719  	spin_unlock(&sb->s_inode_list_lock);
720  
721  	dispose_list(&dispose);
722  
723  	return busy;
724  }
725  
726  /*
727   * Isolate the inode from the LRU in preparation for freeing it.
728   *
729   * Any inodes which are pinned purely because of attached pagecache have their
730   * pagecache removed.  If the inode has metadata buffers attached to
731   * mapping->private_list then try to remove them.
732   *
733   * If the inode has the I_REFERENCED flag set, then it means that it has been
734   * used recently - the flag is set in iput_final(). When we encounter such an
735   * inode, clear the flag and move it to the back of the LRU so it gets another
736   * pass through the LRU before it gets reclaimed. This is necessary because of
737   * the fact we are doing lazy LRU updates to minimise lock contention so the
738   * LRU does not have strict ordering. Hence we don't want to reclaim inodes
739   * with this flag set because they are the inodes that are out of order.
740   */
741  static enum lru_status inode_lru_isolate(struct list_head *item,
742  		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
743  {
744  	struct list_head *freeable = arg;
745  	struct inode	*inode = container_of(item, struct inode, i_lru);
746  
747  	/*
748  	 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
749  	 * If we fail to get the lock, just skip it.
750  	 */
751  	if (!spin_trylock(&inode->i_lock))
752  		return LRU_SKIP;
753  
754  	/*
755  	 * Referenced or dirty inodes are still in use. Give them another pass
756  	 * through the LRU as we canot reclaim them now.
757  	 */
758  	if (atomic_read(&inode->i_count) ||
759  	    (inode->i_state & ~I_REFERENCED)) {
760  		list_lru_isolate(lru, &inode->i_lru);
761  		spin_unlock(&inode->i_lock);
762  		this_cpu_dec(nr_unused);
763  		return LRU_REMOVED;
764  	}
765  
766  	/* recently referenced inodes get one more pass */
767  	if (inode->i_state & I_REFERENCED) {
768  		inode->i_state &= ~I_REFERENCED;
769  		spin_unlock(&inode->i_lock);
770  		return LRU_ROTATE;
771  	}
772  
773  	if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
774  		__iget(inode);
775  		spin_unlock(&inode->i_lock);
776  		spin_unlock(lru_lock);
777  		if (remove_inode_buffers(inode)) {
778  			unsigned long reap;
779  			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
780  			if (current_is_kswapd())
781  				__count_vm_events(KSWAPD_INODESTEAL, reap);
782  			else
783  				__count_vm_events(PGINODESTEAL, reap);
784  			if (current->reclaim_state)
785  				current->reclaim_state->reclaimed_slab += reap;
786  		}
787  		iput(inode);
788  		spin_lock(lru_lock);
789  		return LRU_RETRY;
790  	}
791  
792  	WARN_ON(inode->i_state & I_NEW);
793  	inode->i_state |= I_FREEING;
794  	list_lru_isolate_move(lru, &inode->i_lru, freeable);
795  	spin_unlock(&inode->i_lock);
796  
797  	this_cpu_dec(nr_unused);
798  	return LRU_REMOVED;
799  }
800  
801  /*
802   * Walk the superblock inode LRU for freeable inodes and attempt to free them.
803   * This is called from the superblock shrinker function with a number of inodes
804   * to trim from the LRU. Inodes to be freed are moved to a temporary list and
805   * then are freed outside inode_lock by dispose_list().
806   */
807  long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
808  {
809  	LIST_HEAD(freeable);
810  	long freed;
811  
812  	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
813  				     inode_lru_isolate, &freeable);
814  	dispose_list(&freeable);
815  	return freed;
816  }
817  
818  static void __wait_on_freeing_inode(struct inode *inode);
819  /*
820   * Called with the inode lock held.
821   */
822  static struct inode *find_inode(struct super_block *sb,
823  				struct hlist_head *head,
824  				int (*test)(struct inode *, void *),
825  				void *data)
826  {
827  	struct inode *inode = NULL;
828  
829  repeat:
830  	hlist_for_each_entry(inode, head, i_hash) {
831  		if (inode->i_sb != sb)
832  			continue;
833  		if (!test(inode, data))
834  			continue;
835  		spin_lock(&inode->i_lock);
836  		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
837  			__wait_on_freeing_inode(inode);
838  			goto repeat;
839  		}
840  		if (unlikely(inode->i_state & I_CREATING)) {
841  			spin_unlock(&inode->i_lock);
842  			return ERR_PTR(-ESTALE);
843  		}
844  		__iget(inode);
845  		spin_unlock(&inode->i_lock);
846  		return inode;
847  	}
848  	return NULL;
849  }
850  
851  /*
852   * find_inode_fast is the fast path version of find_inode, see the comment at
853   * iget_locked for details.
854   */
855  static struct inode *find_inode_fast(struct super_block *sb,
856  				struct hlist_head *head, unsigned long ino)
857  {
858  	struct inode *inode = NULL;
859  
860  repeat:
861  	hlist_for_each_entry(inode, head, i_hash) {
862  		if (inode->i_ino != ino)
863  			continue;
864  		if (inode->i_sb != sb)
865  			continue;
866  		spin_lock(&inode->i_lock);
867  		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
868  			__wait_on_freeing_inode(inode);
869  			goto repeat;
870  		}
871  		if (unlikely(inode->i_state & I_CREATING)) {
872  			spin_unlock(&inode->i_lock);
873  			return ERR_PTR(-ESTALE);
874  		}
875  		__iget(inode);
876  		spin_unlock(&inode->i_lock);
877  		return inode;
878  	}
879  	return NULL;
880  }
881  
882  /*
883   * Each cpu owns a range of LAST_INO_BATCH numbers.
884   * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
885   * to renew the exhausted range.
886   *
887   * This does not significantly increase overflow rate because every CPU can
888   * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
889   * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
890   * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
891   * overflow rate by 2x, which does not seem too significant.
892   *
893   * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
894   * error if st_ino won't fit in target struct field. Use 32bit counter
895   * here to attempt to avoid that.
896   */
897  #define LAST_INO_BATCH 1024
898  static DEFINE_PER_CPU(unsigned int, last_ino);
899  
900  unsigned int get_next_ino(void)
901  {
902  	unsigned int *p = &get_cpu_var(last_ino);
903  	unsigned int res = *p;
904  
905  #ifdef CONFIG_SMP
906  	if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
907  		static atomic_t shared_last_ino;
908  		int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
909  
910  		res = next - LAST_INO_BATCH;
911  	}
912  #endif
913  
914  	res++;
915  	/* get_next_ino should not provide a 0 inode number */
916  	if (unlikely(!res))
917  		res++;
918  	*p = res;
919  	put_cpu_var(last_ino);
920  	return res;
921  }
922  EXPORT_SYMBOL(get_next_ino);
923  
924  /**
925   *	new_inode_pseudo 	- obtain an inode
926   *	@sb: superblock
927   *
928   *	Allocates a new inode for given superblock.
929   *	Inode wont be chained in superblock s_inodes list
930   *	This means :
931   *	- fs can't be unmount
932   *	- quotas, fsnotify, writeback can't work
933   */
934  struct inode *new_inode_pseudo(struct super_block *sb)
935  {
936  	struct inode *inode = alloc_inode(sb);
937  
938  	if (inode) {
939  		spin_lock(&inode->i_lock);
940  		inode->i_state = 0;
941  		spin_unlock(&inode->i_lock);
942  		INIT_LIST_HEAD(&inode->i_sb_list);
943  	}
944  	return inode;
945  }
946  
947  /**
948   *	new_inode 	- obtain an inode
949   *	@sb: superblock
950   *
951   *	Allocates a new inode for given superblock. The default gfp_mask
952   *	for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
953   *	If HIGHMEM pages are unsuitable or it is known that pages allocated
954   *	for the page cache are not reclaimable or migratable,
955   *	mapping_set_gfp_mask() must be called with suitable flags on the
956   *	newly created inode's mapping
957   *
958   */
959  struct inode *new_inode(struct super_block *sb)
960  {
961  	struct inode *inode;
962  
963  	spin_lock_prefetch(&sb->s_inode_list_lock);
964  
965  	inode = new_inode_pseudo(sb);
966  	if (inode)
967  		inode_sb_list_add(inode);
968  	return inode;
969  }
970  EXPORT_SYMBOL(new_inode);
971  
972  #ifdef CONFIG_DEBUG_LOCK_ALLOC
973  void lockdep_annotate_inode_mutex_key(struct inode *inode)
974  {
975  	if (S_ISDIR(inode->i_mode)) {
976  		struct file_system_type *type = inode->i_sb->s_type;
977  
978  		/* Set new key only if filesystem hasn't already changed it */
979  		if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
980  			/*
981  			 * ensure nobody is actually holding i_mutex
982  			 */
983  			// mutex_destroy(&inode->i_mutex);
984  			init_rwsem(&inode->i_rwsem);
985  			lockdep_set_class(&inode->i_rwsem,
986  					  &type->i_mutex_dir_key);
987  		}
988  	}
989  }
990  EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
991  #endif
992  
993  /**
994   * unlock_new_inode - clear the I_NEW state and wake up any waiters
995   * @inode:	new inode to unlock
996   *
997   * Called when the inode is fully initialised to clear the new state of the
998   * inode and wake up anyone waiting for the inode to finish initialisation.
999   */
1000  void unlock_new_inode(struct inode *inode)
1001  {
1002  	lockdep_annotate_inode_mutex_key(inode);
1003  	spin_lock(&inode->i_lock);
1004  	WARN_ON(!(inode->i_state & I_NEW));
1005  	inode->i_state &= ~I_NEW & ~I_CREATING;
1006  	smp_mb();
1007  	wake_up_bit(&inode->i_state, __I_NEW);
1008  	spin_unlock(&inode->i_lock);
1009  }
1010  EXPORT_SYMBOL(unlock_new_inode);
1011  
1012  void discard_new_inode(struct inode *inode)
1013  {
1014  	lockdep_annotate_inode_mutex_key(inode);
1015  	spin_lock(&inode->i_lock);
1016  	WARN_ON(!(inode->i_state & I_NEW));
1017  	inode->i_state &= ~I_NEW;
1018  	smp_mb();
1019  	wake_up_bit(&inode->i_state, __I_NEW);
1020  	spin_unlock(&inode->i_lock);
1021  	iput(inode);
1022  }
1023  EXPORT_SYMBOL(discard_new_inode);
1024  
1025  /**
1026   * lock_two_nondirectories - take two i_mutexes on non-directory objects
1027   *
1028   * Lock any non-NULL argument that is not a directory.
1029   * Zero, one or two objects may be locked by this function.
1030   *
1031   * @inode1: first inode to lock
1032   * @inode2: second inode to lock
1033   */
1034  void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
1035  {
1036  	if (inode1 > inode2)
1037  		swap(inode1, inode2);
1038  
1039  	if (inode1 && !S_ISDIR(inode1->i_mode))
1040  		inode_lock(inode1);
1041  	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
1042  		inode_lock_nested(inode2, I_MUTEX_NONDIR2);
1043  }
1044  EXPORT_SYMBOL(lock_two_nondirectories);
1045  
1046  /**
1047   * unlock_two_nondirectories - release locks from lock_two_nondirectories()
1048   * @inode1: first inode to unlock
1049   * @inode2: second inode to unlock
1050   */
1051  void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
1052  {
1053  	if (inode1 && !S_ISDIR(inode1->i_mode))
1054  		inode_unlock(inode1);
1055  	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
1056  		inode_unlock(inode2);
1057  }
1058  EXPORT_SYMBOL(unlock_two_nondirectories);
1059  
1060  /**
1061   * inode_insert5 - obtain an inode from a mounted file system
1062   * @inode:	pre-allocated inode to use for insert to cache
1063   * @hashval:	hash value (usually inode number) to get
1064   * @test:	callback used for comparisons between inodes
1065   * @set:	callback used to initialize a new struct inode
1066   * @data:	opaque data pointer to pass to @test and @set
1067   *
1068   * Search for the inode specified by @hashval and @data in the inode cache,
1069   * and if present it is return it with an increased reference count. This is
1070   * a variant of iget5_locked() for callers that don't want to fail on memory
1071   * allocation of inode.
1072   *
1073   * If the inode is not in cache, insert the pre-allocated inode to cache and
1074   * return it locked, hashed, and with the I_NEW flag set. The file system gets
1075   * to fill it in before unlocking it via unlock_new_inode().
1076   *
1077   * Note both @test and @set are called with the inode_hash_lock held, so can't
1078   * sleep.
1079   */
1080  struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
1081  			    int (*test)(struct inode *, void *),
1082  			    int (*set)(struct inode *, void *), void *data)
1083  {
1084  	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
1085  	struct inode *old;
1086  	bool creating = inode->i_state & I_CREATING;
1087  
1088  again:
1089  	spin_lock(&inode_hash_lock);
1090  	old = find_inode(inode->i_sb, head, test, data);
1091  	if (unlikely(old)) {
1092  		/*
1093  		 * Uhhuh, somebody else created the same inode under us.
1094  		 * Use the old inode instead of the preallocated one.
1095  		 */
1096  		spin_unlock(&inode_hash_lock);
1097  		if (IS_ERR(old))
1098  			return NULL;
1099  		wait_on_inode(old);
1100  		if (unlikely(inode_unhashed(old))) {
1101  			iput(old);
1102  			goto again;
1103  		}
1104  		return old;
1105  	}
1106  
1107  	if (set && unlikely(set(inode, data))) {
1108  		inode = NULL;
1109  		goto unlock;
1110  	}
1111  
1112  	/*
1113  	 * Return the locked inode with I_NEW set, the
1114  	 * caller is responsible for filling in the contents
1115  	 */
1116  	spin_lock(&inode->i_lock);
1117  	inode->i_state |= I_NEW;
1118  	hlist_add_head_rcu(&inode->i_hash, head);
1119  	spin_unlock(&inode->i_lock);
1120  	if (!creating)
1121  		inode_sb_list_add(inode);
1122  unlock:
1123  	spin_unlock(&inode_hash_lock);
1124  
1125  	return inode;
1126  }
1127  EXPORT_SYMBOL(inode_insert5);
1128  
1129  /**
1130   * iget5_locked - obtain an inode from a mounted file system
1131   * @sb:		super block of file system
1132   * @hashval:	hash value (usually inode number) to get
1133   * @test:	callback used for comparisons between inodes
1134   * @set:	callback used to initialize a new struct inode
1135   * @data:	opaque data pointer to pass to @test and @set
1136   *
1137   * Search for the inode specified by @hashval and @data in the inode cache,
1138   * and if present it is return it with an increased reference count. This is
1139   * a generalized version of iget_locked() for file systems where the inode
1140   * number is not sufficient for unique identification of an inode.
1141   *
1142   * If the inode is not in cache, allocate a new inode and return it locked,
1143   * hashed, and with the I_NEW flag set. The file system gets to fill it in
1144   * before unlocking it via unlock_new_inode().
1145   *
1146   * Note both @test and @set are called with the inode_hash_lock held, so can't
1147   * sleep.
1148   */
1149  struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
1150  		int (*test)(struct inode *, void *),
1151  		int (*set)(struct inode *, void *), void *data)
1152  {
1153  	struct inode *inode = ilookup5(sb, hashval, test, data);
1154  
1155  	if (!inode) {
1156  		struct inode *new = alloc_inode(sb);
1157  
1158  		if (new) {
1159  			new->i_state = 0;
1160  			inode = inode_insert5(new, hashval, test, set, data);
1161  			if (unlikely(inode != new))
1162  				destroy_inode(new);
1163  		}
1164  	}
1165  	return inode;
1166  }
1167  EXPORT_SYMBOL(iget5_locked);
1168  
1169  /**
1170   * iget_locked - obtain an inode from a mounted file system
1171   * @sb:		super block of file system
1172   * @ino:	inode number to get
1173   *
1174   * Search for the inode specified by @ino in the inode cache and if present
1175   * return it with an increased reference count. This is for file systems
1176   * where the inode number is sufficient for unique identification of an inode.
1177   *
1178   * If the inode is not in cache, allocate a new inode and return it locked,
1179   * hashed, and with the I_NEW flag set.  The file system gets to fill it in
1180   * before unlocking it via unlock_new_inode().
1181   */
1182  struct inode *iget_locked(struct super_block *sb, unsigned long ino)
1183  {
1184  	struct hlist_head *head = inode_hashtable + hash(sb, ino);
1185  	struct inode *inode;
1186  again:
1187  	spin_lock(&inode_hash_lock);
1188  	inode = find_inode_fast(sb, head, ino);
1189  	spin_unlock(&inode_hash_lock);
1190  	if (inode) {
1191  		if (IS_ERR(inode))
1192  			return NULL;
1193  		wait_on_inode(inode);
1194  		if (unlikely(inode_unhashed(inode))) {
1195  			iput(inode);
1196  			goto again;
1197  		}
1198  		return inode;
1199  	}
1200  
1201  	inode = alloc_inode(sb);
1202  	if (inode) {
1203  		struct inode *old;
1204  
1205  		spin_lock(&inode_hash_lock);
1206  		/* We released the lock, so.. */
1207  		old = find_inode_fast(sb, head, ino);
1208  		if (!old) {
1209  			inode->i_ino = ino;
1210  			spin_lock(&inode->i_lock);
1211  			inode->i_state = I_NEW;
1212  			hlist_add_head_rcu(&inode->i_hash, head);
1213  			spin_unlock(&inode->i_lock);
1214  			inode_sb_list_add(inode);
1215  			spin_unlock(&inode_hash_lock);
1216  
1217  			/* Return the locked inode with I_NEW set, the
1218  			 * caller is responsible for filling in the contents
1219  			 */
1220  			return inode;
1221  		}
1222  
1223  		/*
1224  		 * Uhhuh, somebody else created the same inode under
1225  		 * us. Use the old inode instead of the one we just
1226  		 * allocated.
1227  		 */
1228  		spin_unlock(&inode_hash_lock);
1229  		destroy_inode(inode);
1230  		if (IS_ERR(old))
1231  			return NULL;
1232  		inode = old;
1233  		wait_on_inode(inode);
1234  		if (unlikely(inode_unhashed(inode))) {
1235  			iput(inode);
1236  			goto again;
1237  		}
1238  	}
1239  	return inode;
1240  }
1241  EXPORT_SYMBOL(iget_locked);
1242  
1243  /*
1244   * search the inode cache for a matching inode number.
1245   * If we find one, then the inode number we are trying to
1246   * allocate is not unique and so we should not use it.
1247   *
1248   * Returns 1 if the inode number is unique, 0 if it is not.
1249   */
1250  static int test_inode_iunique(struct super_block *sb, unsigned long ino)
1251  {
1252  	struct hlist_head *b = inode_hashtable + hash(sb, ino);
1253  	struct inode *inode;
1254  
1255  	hlist_for_each_entry_rcu(inode, b, i_hash) {
1256  		if (inode->i_ino == ino && inode->i_sb == sb)
1257  			return 0;
1258  	}
1259  	return 1;
1260  }
1261  
1262  /**
1263   *	iunique - get a unique inode number
1264   *	@sb: superblock
1265   *	@max_reserved: highest reserved inode number
1266   *
1267   *	Obtain an inode number that is unique on the system for a given
1268   *	superblock. This is used by file systems that have no natural
1269   *	permanent inode numbering system. An inode number is returned that
1270   *	is higher than the reserved limit but unique.
1271   *
1272   *	BUGS:
1273   *	With a large number of inodes live on the file system this function
1274   *	currently becomes quite slow.
1275   */
1276  ino_t iunique(struct super_block *sb, ino_t max_reserved)
1277  {
1278  	/*
1279  	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
1280  	 * error if st_ino won't fit in target struct field. Use 32bit counter
1281  	 * here to attempt to avoid that.
1282  	 */
1283  	static DEFINE_SPINLOCK(iunique_lock);
1284  	static unsigned int counter;
1285  	ino_t res;
1286  
1287  	rcu_read_lock();
1288  	spin_lock(&iunique_lock);
1289  	do {
1290  		if (counter <= max_reserved)
1291  			counter = max_reserved + 1;
1292  		res = counter++;
1293  	} while (!test_inode_iunique(sb, res));
1294  	spin_unlock(&iunique_lock);
1295  	rcu_read_unlock();
1296  
1297  	return res;
1298  }
1299  EXPORT_SYMBOL(iunique);
1300  
1301  struct inode *igrab(struct inode *inode)
1302  {
1303  	spin_lock(&inode->i_lock);
1304  	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1305  		__iget(inode);
1306  		spin_unlock(&inode->i_lock);
1307  	} else {
1308  		spin_unlock(&inode->i_lock);
1309  		/*
1310  		 * Handle the case where s_op->clear_inode is not been
1311  		 * called yet, and somebody is calling igrab
1312  		 * while the inode is getting freed.
1313  		 */
1314  		inode = NULL;
1315  	}
1316  	return inode;
1317  }
1318  EXPORT_SYMBOL(igrab);
1319  
1320  /**
1321   * ilookup5_nowait - search for an inode in the inode cache
1322   * @sb:		super block of file system to search
1323   * @hashval:	hash value (usually inode number) to search for
1324   * @test:	callback used for comparisons between inodes
1325   * @data:	opaque data pointer to pass to @test
1326   *
1327   * Search for the inode specified by @hashval and @data in the inode cache.
1328   * If the inode is in the cache, the inode is returned with an incremented
1329   * reference count.
1330   *
1331   * Note: I_NEW is not waited upon so you have to be very careful what you do
1332   * with the returned inode.  You probably should be using ilookup5() instead.
1333   *
1334   * Note2: @test is called with the inode_hash_lock held, so can't sleep.
1335   */
1336  struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1337  		int (*test)(struct inode *, void *), void *data)
1338  {
1339  	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1340  	struct inode *inode;
1341  
1342  	spin_lock(&inode_hash_lock);
1343  	inode = find_inode(sb, head, test, data);
1344  	spin_unlock(&inode_hash_lock);
1345  
1346  	return IS_ERR(inode) ? NULL : inode;
1347  }
1348  EXPORT_SYMBOL(ilookup5_nowait);
1349  
1350  /**
1351   * ilookup5 - search for an inode in the inode cache
1352   * @sb:		super block of file system to search
1353   * @hashval:	hash value (usually inode number) to search for
1354   * @test:	callback used for comparisons between inodes
1355   * @data:	opaque data pointer to pass to @test
1356   *
1357   * Search for the inode specified by @hashval and @data in the inode cache,
1358   * and if the inode is in the cache, return the inode with an incremented
1359   * reference count.  Waits on I_NEW before returning the inode.
1360   * returned with an incremented reference count.
1361   *
1362   * This is a generalized version of ilookup() for file systems where the
1363   * inode number is not sufficient for unique identification of an inode.
1364   *
1365   * Note: @test is called with the inode_hash_lock held, so can't sleep.
1366   */
1367  struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1368  		int (*test)(struct inode *, void *), void *data)
1369  {
1370  	struct inode *inode;
1371  again:
1372  	inode = ilookup5_nowait(sb, hashval, test, data);
1373  	if (inode) {
1374  		wait_on_inode(inode);
1375  		if (unlikely(inode_unhashed(inode))) {
1376  			iput(inode);
1377  			goto again;
1378  		}
1379  	}
1380  	return inode;
1381  }
1382  EXPORT_SYMBOL(ilookup5);
1383  
1384  /**
1385   * ilookup - search for an inode in the inode cache
1386   * @sb:		super block of file system to search
1387   * @ino:	inode number to search for
1388   *
1389   * Search for the inode @ino in the inode cache, and if the inode is in the
1390   * cache, the inode is returned with an incremented reference count.
1391   */
1392  struct inode *ilookup(struct super_block *sb, unsigned long ino)
1393  {
1394  	struct hlist_head *head = inode_hashtable + hash(sb, ino);
1395  	struct inode *inode;
1396  again:
1397  	spin_lock(&inode_hash_lock);
1398  	inode = find_inode_fast(sb, head, ino);
1399  	spin_unlock(&inode_hash_lock);
1400  
1401  	if (inode) {
1402  		if (IS_ERR(inode))
1403  			return NULL;
1404  		wait_on_inode(inode);
1405  		if (unlikely(inode_unhashed(inode))) {
1406  			iput(inode);
1407  			goto again;
1408  		}
1409  	}
1410  	return inode;
1411  }
1412  EXPORT_SYMBOL(ilookup);
1413  
1414  /**
1415   * find_inode_nowait - find an inode in the inode cache
1416   * @sb:		super block of file system to search
1417   * @hashval:	hash value (usually inode number) to search for
1418   * @match:	callback used for comparisons between inodes
1419   * @data:	opaque data pointer to pass to @match
1420   *
1421   * Search for the inode specified by @hashval and @data in the inode
1422   * cache, where the helper function @match will return 0 if the inode
1423   * does not match, 1 if the inode does match, and -1 if the search
1424   * should be stopped.  The @match function must be responsible for
1425   * taking the i_lock spin_lock and checking i_state for an inode being
1426   * freed or being initialized, and incrementing the reference count
1427   * before returning 1.  It also must not sleep, since it is called with
1428   * the inode_hash_lock spinlock held.
1429   *
1430   * This is a even more generalized version of ilookup5() when the
1431   * function must never block --- find_inode() can block in
1432   * __wait_on_freeing_inode() --- or when the caller can not increment
1433   * the reference count because the resulting iput() might cause an
1434   * inode eviction.  The tradeoff is that the @match funtion must be
1435   * very carefully implemented.
1436   */
1437  struct inode *find_inode_nowait(struct super_block *sb,
1438  				unsigned long hashval,
1439  				int (*match)(struct inode *, unsigned long,
1440  					     void *),
1441  				void *data)
1442  {
1443  	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1444  	struct inode *inode, *ret_inode = NULL;
1445  	int mval;
1446  
1447  	spin_lock(&inode_hash_lock);
1448  	hlist_for_each_entry(inode, head, i_hash) {
1449  		if (inode->i_sb != sb)
1450  			continue;
1451  		mval = match(inode, hashval, data);
1452  		if (mval == 0)
1453  			continue;
1454  		if (mval == 1)
1455  			ret_inode = inode;
1456  		goto out;
1457  	}
1458  out:
1459  	spin_unlock(&inode_hash_lock);
1460  	return ret_inode;
1461  }
1462  EXPORT_SYMBOL(find_inode_nowait);
1463  
1464  /**
1465   * find_inode_rcu - find an inode in the inode cache
1466   * @sb:		Super block of file system to search
1467   * @hashval:	Key to hash
1468   * @test:	Function to test match on an inode
1469   * @data:	Data for test function
1470   *
1471   * Search for the inode specified by @hashval and @data in the inode cache,
1472   * where the helper function @test will return 0 if the inode does not match
1473   * and 1 if it does.  The @test function must be responsible for taking the
1474   * i_lock spin_lock and checking i_state for an inode being freed or being
1475   * initialized.
1476   *
1477   * If successful, this will return the inode for which the @test function
1478   * returned 1 and NULL otherwise.
1479   *
1480   * The @test function is not permitted to take a ref on any inode presented.
1481   * It is also not permitted to sleep.
1482   *
1483   * The caller must hold the RCU read lock.
1484   */
1485  struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
1486  			     int (*test)(struct inode *, void *), void *data)
1487  {
1488  	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1489  	struct inode *inode;
1490  
1491  	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
1492  			 "suspicious find_inode_rcu() usage");
1493  
1494  	hlist_for_each_entry_rcu(inode, head, i_hash) {
1495  		if (inode->i_sb == sb &&
1496  		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
1497  		    test(inode, data))
1498  			return inode;
1499  	}
1500  	return NULL;
1501  }
1502  EXPORT_SYMBOL(find_inode_rcu);
1503  
1504  /**
1505   * find_inode_by_ino_rcu - Find an inode in the inode cache
1506   * @sb:		Super block of file system to search
1507   * @ino:	The inode number to match
1508   *
1509   * Search for the inode specified by @hashval and @data in the inode cache,
1510   * where the helper function @test will return 0 if the inode does not match
1511   * and 1 if it does.  The @test function must be responsible for taking the
1512   * i_lock spin_lock and checking i_state for an inode being freed or being
1513   * initialized.
1514   *
1515   * If successful, this will return the inode for which the @test function
1516   * returned 1 and NULL otherwise.
1517   *
1518   * The @test function is not permitted to take a ref on any inode presented.
1519   * It is also not permitted to sleep.
1520   *
1521   * The caller must hold the RCU read lock.
1522   */
1523  struct inode *find_inode_by_ino_rcu(struct super_block *sb,
1524  				    unsigned long ino)
1525  {
1526  	struct hlist_head *head = inode_hashtable + hash(sb, ino);
1527  	struct inode *inode;
1528  
1529  	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
1530  			 "suspicious find_inode_by_ino_rcu() usage");
1531  
1532  	hlist_for_each_entry_rcu(inode, head, i_hash) {
1533  		if (inode->i_ino == ino &&
1534  		    inode->i_sb == sb &&
1535  		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
1536  		    return inode;
1537  	}
1538  	return NULL;
1539  }
1540  EXPORT_SYMBOL(find_inode_by_ino_rcu);
1541  
1542  int insert_inode_locked(struct inode *inode)
1543  {
1544  	struct super_block *sb = inode->i_sb;
1545  	ino_t ino = inode->i_ino;
1546  	struct hlist_head *head = inode_hashtable + hash(sb, ino);
1547  
1548  	while (1) {
1549  		struct inode *old = NULL;
1550  		spin_lock(&inode_hash_lock);
1551  		hlist_for_each_entry(old, head, i_hash) {
1552  			if (old->i_ino != ino)
1553  				continue;
1554  			if (old->i_sb != sb)
1555  				continue;
1556  			spin_lock(&old->i_lock);
1557  			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1558  				spin_unlock(&old->i_lock);
1559  				continue;
1560  			}
1561  			break;
1562  		}
1563  		if (likely(!old)) {
1564  			spin_lock(&inode->i_lock);
1565  			inode->i_state |= I_NEW | I_CREATING;
1566  			hlist_add_head_rcu(&inode->i_hash, head);
1567  			spin_unlock(&inode->i_lock);
1568  			spin_unlock(&inode_hash_lock);
1569  			return 0;
1570  		}
1571  		if (unlikely(old->i_state & I_CREATING)) {
1572  			spin_unlock(&old->i_lock);
1573  			spin_unlock(&inode_hash_lock);
1574  			return -EBUSY;
1575  		}
1576  		__iget(old);
1577  		spin_unlock(&old->i_lock);
1578  		spin_unlock(&inode_hash_lock);
1579  		wait_on_inode(old);
1580  		if (unlikely(!inode_unhashed(old))) {
1581  			iput(old);
1582  			return -EBUSY;
1583  		}
1584  		iput(old);
1585  	}
1586  }
1587  EXPORT_SYMBOL(insert_inode_locked);
1588  
1589  int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1590  		int (*test)(struct inode *, void *), void *data)
1591  {
1592  	struct inode *old;
1593  
1594  	inode->i_state |= I_CREATING;
1595  	old = inode_insert5(inode, hashval, test, NULL, data);
1596  
1597  	if (old != inode) {
1598  		iput(old);
1599  		return -EBUSY;
1600  	}
1601  	return 0;
1602  }
1603  EXPORT_SYMBOL(insert_inode_locked4);
1604  
1605  
1606  int generic_delete_inode(struct inode *inode)
1607  {
1608  	return 1;
1609  }
1610  EXPORT_SYMBOL(generic_delete_inode);
1611  
1612  /*
1613   * Called when we're dropping the last reference
1614   * to an inode.
1615   *
1616   * Call the FS "drop_inode()" function, defaulting to
1617   * the legacy UNIX filesystem behaviour.  If it tells
1618   * us to evict inode, do so.  Otherwise, retain inode
1619   * in cache if fs is alive, sync and evict if fs is
1620   * shutting down.
1621   */
1622  static void iput_final(struct inode *inode)
1623  {
1624  	struct super_block *sb = inode->i_sb;
1625  	const struct super_operations *op = inode->i_sb->s_op;
1626  	unsigned long state;
1627  	int drop;
1628  
1629  	WARN_ON(inode->i_state & I_NEW);
1630  
1631  	if (op->drop_inode)
1632  		drop = op->drop_inode(inode);
1633  	else
1634  		drop = generic_drop_inode(inode);
1635  
1636  	if (!drop &&
1637  	    !(inode->i_state & I_DONTCACHE) &&
1638  	    (sb->s_flags & SB_ACTIVE)) {
1639  		inode_add_lru(inode);
1640  		spin_unlock(&inode->i_lock);
1641  		return;
1642  	}
1643  
1644  	state = inode->i_state;
1645  	if (!drop) {
1646  		WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
1647  		spin_unlock(&inode->i_lock);
1648  
1649  		write_inode_now(inode, 1);
1650  
1651  		spin_lock(&inode->i_lock);
1652  		state = inode->i_state;
1653  		WARN_ON(state & I_NEW);
1654  		state &= ~I_WILL_FREE;
1655  	}
1656  
1657  	WRITE_ONCE(inode->i_state, state | I_FREEING);
1658  	if (!list_empty(&inode->i_lru))
1659  		inode_lru_list_del(inode);
1660  	spin_unlock(&inode->i_lock);
1661  
1662  	evict(inode);
1663  }
1664  
1665  /**
1666   *	iput	- put an inode
1667   *	@inode: inode to put
1668   *
1669   *	Puts an inode, dropping its usage count. If the inode use count hits
1670   *	zero, the inode is then freed and may also be destroyed.
1671   *
1672   *	Consequently, iput() can sleep.
1673   */
1674  void iput(struct inode *inode)
1675  {
1676  	if (!inode)
1677  		return;
1678  	BUG_ON(inode->i_state & I_CLEAR);
1679  retry:
1680  	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
1681  		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
1682  			atomic_inc(&inode->i_count);
1683  			spin_unlock(&inode->i_lock);
1684  			trace_writeback_lazytime_iput(inode);
1685  			mark_inode_dirty_sync(inode);
1686  			goto retry;
1687  		}
1688  		iput_final(inode);
1689  	}
1690  }
1691  EXPORT_SYMBOL(iput);
1692  
1693  #ifdef CONFIG_BLOCK
1694  /**
1695   *	bmap	- find a block number in a file
1696   *	@inode:  inode owning the block number being requested
1697   *	@block: pointer containing the block to find
1698   *
1699   *	Replaces the value in ``*block`` with the block number on the device holding
1700   *	corresponding to the requested block number in the file.
1701   *	That is, asked for block 4 of inode 1 the function will replace the
1702   *	4 in ``*block``, with disk block relative to the disk start that holds that
1703   *	block of the file.
1704   *
1705   *	Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a
1706   *	hole, returns 0 and ``*block`` is also set to 0.
1707   */
1708  int bmap(struct inode *inode, sector_t *block)
1709  {
1710  	if (!inode->i_mapping->a_ops->bmap)
1711  		return -EINVAL;
1712  
1713  	*block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block);
1714  	return 0;
1715  }
1716  EXPORT_SYMBOL(bmap);
1717  #endif
1718  
1719  /*
1720   * With relative atime, only update atime if the previous atime is
1721   * earlier than either the ctime or mtime or if at least a day has
1722   * passed since the last atime update.
1723   */
1724  static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1725  			     struct timespec64 now)
1726  {
1727  
1728  	if (!(mnt->mnt_flags & MNT_RELATIME))
1729  		return 1;
1730  	/*
1731  	 * Is mtime younger than atime? If yes, update atime:
1732  	 */
1733  	if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
1734  		return 1;
1735  	/*
1736  	 * Is ctime younger than atime? If yes, update atime:
1737  	 */
1738  	if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
1739  		return 1;
1740  
1741  	/*
1742  	 * Is the previous atime value older than a day? If yes,
1743  	 * update atime:
1744  	 */
1745  	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
1746  		return 1;
1747  	/*
1748  	 * Good, we can skip the atime update:
1749  	 */
1750  	return 0;
1751  }
1752  
1753  int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
1754  {
1755  	int dirty_flags = 0;
1756  
1757  	if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
1758  		if (flags & S_ATIME)
1759  			inode->i_atime = *time;
1760  		if (flags & S_CTIME)
1761  			inode->i_ctime = *time;
1762  		if (flags & S_MTIME)
1763  			inode->i_mtime = *time;
1764  
1765  		if (inode->i_sb->s_flags & SB_LAZYTIME)
1766  			dirty_flags |= I_DIRTY_TIME;
1767  		else
1768  			dirty_flags |= I_DIRTY_SYNC;
1769  	}
1770  
1771  	if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
1772  		dirty_flags |= I_DIRTY_SYNC;
1773  
1774  	__mark_inode_dirty(inode, dirty_flags);
1775  	return 0;
1776  }
1777  EXPORT_SYMBOL(generic_update_time);
1778  
1779  /*
1780   * This does the actual work of updating an inodes time or version.  Must have
1781   * had called mnt_want_write() before calling this.
1782   */
1783  static int update_time(struct inode *inode, struct timespec64 *time, int flags)
1784  {
1785  	if (inode->i_op->update_time)
1786  		return inode->i_op->update_time(inode, time, flags);
1787  	return generic_update_time(inode, time, flags);
1788  }
1789  
1790  /**
1791   *	atime_needs_update	-	update the access time
1792   *	@path: the &struct path to update
1793   *	@inode: inode to update
1794   *
1795   *	Update the accessed time on an inode and mark it for writeback.
1796   *	This function automatically handles read only file systems and media,
1797   *	as well as the "noatime" flag and inode specific "noatime" markers.
1798   */
1799  bool atime_needs_update(const struct path *path, struct inode *inode)
1800  {
1801  	struct vfsmount *mnt = path->mnt;
1802  	struct timespec64 now;
1803  
1804  	if (inode->i_flags & S_NOATIME)
1805  		return false;
1806  
1807  	/* Atime updates will likely cause i_uid and i_gid to be written
1808  	 * back improprely if their true value is unknown to the vfs.
1809  	 */
1810  	if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode))
1811  		return false;
1812  
1813  	if (IS_NOATIME(inode))
1814  		return false;
1815  	if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
1816  		return false;
1817  
1818  	if (mnt->mnt_flags & MNT_NOATIME)
1819  		return false;
1820  	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1821  		return false;
1822  
1823  	now = current_time(inode);
1824  
1825  	if (!relatime_need_update(mnt, inode, now))
1826  		return false;
1827  
1828  	if (timespec64_equal(&inode->i_atime, &now))
1829  		return false;
1830  
1831  	return true;
1832  }
1833  
1834  void touch_atime(const struct path *path)
1835  {
1836  	struct vfsmount *mnt = path->mnt;
1837  	struct inode *inode = d_inode(path->dentry);
1838  	struct timespec64 now;
1839  
1840  	if (!atime_needs_update(path, inode))
1841  		return;
1842  
1843  	if (!sb_start_write_trylock(inode->i_sb))
1844  		return;
1845  
1846  	if (__mnt_want_write(mnt) != 0)
1847  		goto skip_update;
1848  	/*
1849  	 * File systems can error out when updating inodes if they need to
1850  	 * allocate new space to modify an inode (such is the case for
1851  	 * Btrfs), but since we touch atime while walking down the path we
1852  	 * really don't care if we failed to update the atime of the file,
1853  	 * so just ignore the return value.
1854  	 * We may also fail on filesystems that have the ability to make parts
1855  	 * of the fs read only, e.g. subvolumes in Btrfs.
1856  	 */
1857  	now = current_time(inode);
1858  	update_time(inode, &now, S_ATIME);
1859  	__mnt_drop_write(mnt);
1860  skip_update:
1861  	sb_end_write(inode->i_sb);
1862  }
1863  EXPORT_SYMBOL(touch_atime);
1864  
1865  /*
1866   * The logic we want is
1867   *
1868   *	if suid or (sgid and xgrp)
1869   *		remove privs
1870   */
1871  int should_remove_suid(struct dentry *dentry)
1872  {
1873  	umode_t mode = d_inode(dentry)->i_mode;
1874  	int kill = 0;
1875  
1876  	/* suid always must be killed */
1877  	if (unlikely(mode & S_ISUID))
1878  		kill = ATTR_KILL_SUID;
1879  
1880  	/*
1881  	 * sgid without any exec bits is just a mandatory locking mark; leave
1882  	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
1883  	 */
1884  	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1885  		kill |= ATTR_KILL_SGID;
1886  
1887  	if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1888  		return kill;
1889  
1890  	return 0;
1891  }
1892  EXPORT_SYMBOL(should_remove_suid);
1893  
1894  /*
1895   * Return mask of changes for notify_change() that need to be done as a
1896   * response to write or truncate. Return 0 if nothing has to be changed.
1897   * Negative value on error (change should be denied).
1898   */
1899  int dentry_needs_remove_privs(struct dentry *dentry)
1900  {
1901  	struct inode *inode = d_inode(dentry);
1902  	int mask = 0;
1903  	int ret;
1904  
1905  	if (IS_NOSEC(inode))
1906  		return 0;
1907  
1908  	mask = should_remove_suid(dentry);
1909  	ret = security_inode_need_killpriv(dentry);
1910  	if (ret < 0)
1911  		return ret;
1912  	if (ret)
1913  		mask |= ATTR_KILL_PRIV;
1914  	return mask;
1915  }
1916  
1917  static int __remove_privs(struct user_namespace *mnt_userns,
1918  			  struct dentry *dentry, int kill)
1919  {
1920  	struct iattr newattrs;
1921  
1922  	newattrs.ia_valid = ATTR_FORCE | kill;
1923  	/*
1924  	 * Note we call this on write, so notify_change will not
1925  	 * encounter any conflicting delegations:
1926  	 */
1927  	return notify_change(mnt_userns, dentry, &newattrs, NULL);
1928  }
1929  
1930  /*
1931   * Remove special file priviledges (suid, capabilities) when file is written
1932   * to or truncated.
1933   */
1934  int file_remove_privs(struct file *file)
1935  {
1936  	struct dentry *dentry = file_dentry(file);
1937  	struct inode *inode = file_inode(file);
1938  	int kill;
1939  	int error = 0;
1940  
1941  	/*
1942  	 * Fast path for nothing security related.
1943  	 * As well for non-regular files, e.g. blkdev inodes.
1944  	 * For example, blkdev_write_iter() might get here
1945  	 * trying to remove privs which it is not allowed to.
1946  	 */
1947  	if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
1948  		return 0;
1949  
1950  	kill = dentry_needs_remove_privs(dentry);
1951  	if (kill < 0)
1952  		return kill;
1953  	if (kill)
1954  		error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
1955  	if (!error)
1956  		inode_has_no_xattr(inode);
1957  
1958  	return error;
1959  }
1960  EXPORT_SYMBOL(file_remove_privs);
1961  
1962  /**
1963   *	file_update_time	-	update mtime and ctime time
1964   *	@file: file accessed
1965   *
1966   *	Update the mtime and ctime members of an inode and mark the inode
1967   *	for writeback.  Note that this function is meant exclusively for
1968   *	usage in the file write path of filesystems, and filesystems may
1969   *	choose to explicitly ignore update via this function with the
1970   *	S_NOCMTIME inode flag, e.g. for network filesystem where these
1971   *	timestamps are handled by the server.  This can return an error for
1972   *	file systems who need to allocate space in order to update an inode.
1973   */
1974  
1975  int file_update_time(struct file *file)
1976  {
1977  	struct inode *inode = file_inode(file);
1978  	struct timespec64 now;
1979  	int sync_it = 0;
1980  	int ret;
1981  
1982  	/* First try to exhaust all avenues to not sync */
1983  	if (IS_NOCMTIME(inode))
1984  		return 0;
1985  
1986  	now = current_time(inode);
1987  	if (!timespec64_equal(&inode->i_mtime, &now))
1988  		sync_it = S_MTIME;
1989  
1990  	if (!timespec64_equal(&inode->i_ctime, &now))
1991  		sync_it |= S_CTIME;
1992  
1993  	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
1994  		sync_it |= S_VERSION;
1995  
1996  	if (!sync_it)
1997  		return 0;
1998  
1999  	/* Finally allowed to write? Takes lock. */
2000  	if (__mnt_want_write_file(file))
2001  		return 0;
2002  
2003  	ret = update_time(inode, &now, sync_it);
2004  	__mnt_drop_write_file(file);
2005  
2006  	return ret;
2007  }
2008  EXPORT_SYMBOL(file_update_time);
2009  
2010  /* Caller must hold the file's inode lock */
2011  int file_modified(struct file *file)
2012  {
2013  	int err;
2014  
2015  	/*
2016  	 * Clear the security bits if the process is not being run by root.
2017  	 * This keeps people from modifying setuid and setgid binaries.
2018  	 */
2019  	err = file_remove_privs(file);
2020  	if (err)
2021  		return err;
2022  
2023  	if (unlikely(file->f_mode & FMODE_NOCMTIME))
2024  		return 0;
2025  
2026  	return file_update_time(file);
2027  }
2028  EXPORT_SYMBOL(file_modified);
2029  
2030  int inode_needs_sync(struct inode *inode)
2031  {
2032  	if (IS_SYNC(inode))
2033  		return 1;
2034  	if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
2035  		return 1;
2036  	return 0;
2037  }
2038  EXPORT_SYMBOL(inode_needs_sync);
2039  
2040  /*
2041   * If we try to find an inode in the inode hash while it is being
2042   * deleted, we have to wait until the filesystem completes its
2043   * deletion before reporting that it isn't found.  This function waits
2044   * until the deletion _might_ have completed.  Callers are responsible
2045   * to recheck inode state.
2046   *
2047   * It doesn't matter if I_NEW is not set initially, a call to
2048   * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
2049   * will DTRT.
2050   */
2051  static void __wait_on_freeing_inode(struct inode *inode)
2052  {
2053  	wait_queue_head_t *wq;
2054  	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
2055  	wq = bit_waitqueue(&inode->i_state, __I_NEW);
2056  	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2057  	spin_unlock(&inode->i_lock);
2058  	spin_unlock(&inode_hash_lock);
2059  	schedule();
2060  	finish_wait(wq, &wait.wq_entry);
2061  	spin_lock(&inode_hash_lock);
2062  }
2063  
2064  static __initdata unsigned long ihash_entries;
2065  static int __init set_ihash_entries(char *str)
2066  {
2067  	if (!str)
2068  		return 0;
2069  	ihash_entries = simple_strtoul(str, &str, 0);
2070  	return 1;
2071  }
2072  __setup("ihash_entries=", set_ihash_entries);
2073  
2074  /*
2075   * Initialize the waitqueues and inode hash table.
2076   */
2077  void __init inode_init_early(void)
2078  {
2079  	/* If hashes are distributed across NUMA nodes, defer
2080  	 * hash allocation until vmalloc space is available.
2081  	 */
2082  	if (hashdist)
2083  		return;
2084  
2085  	inode_hashtable =
2086  		alloc_large_system_hash("Inode-cache",
2087  					sizeof(struct hlist_head),
2088  					ihash_entries,
2089  					14,
2090  					HASH_EARLY | HASH_ZERO,
2091  					&i_hash_shift,
2092  					&i_hash_mask,
2093  					0,
2094  					0);
2095  }
2096  
2097  void __init inode_init(void)
2098  {
2099  	/* inode slab cache */
2100  	inode_cachep = kmem_cache_create("inode_cache",
2101  					 sizeof(struct inode),
2102  					 0,
2103  					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
2104  					 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
2105  					 init_once);
2106  
2107  	/* Hash may have been set up in inode_init_early */
2108  	if (!hashdist)
2109  		return;
2110  
2111  	inode_hashtable =
2112  		alloc_large_system_hash("Inode-cache",
2113  					sizeof(struct hlist_head),
2114  					ihash_entries,
2115  					14,
2116  					HASH_ZERO,
2117  					&i_hash_shift,
2118  					&i_hash_mask,
2119  					0,
2120  					0);
2121  }
2122  
2123  void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
2124  {
2125  	inode->i_mode = mode;
2126  	if (S_ISCHR(mode)) {
2127  		inode->i_fop = &def_chr_fops;
2128  		inode->i_rdev = rdev;
2129  	} else if (S_ISBLK(mode)) {
2130  		inode->i_fop = &def_blk_fops;
2131  		inode->i_rdev = rdev;
2132  	} else if (S_ISFIFO(mode))
2133  		inode->i_fop = &pipefifo_fops;
2134  	else if (S_ISSOCK(mode))
2135  		;	/* leave it no_open_fops */
2136  	else
2137  		printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
2138  				  " inode %s:%lu\n", mode, inode->i_sb->s_id,
2139  				  inode->i_ino);
2140  }
2141  EXPORT_SYMBOL(init_special_inode);
2142  
2143  /**
2144   * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
2145   * @mnt_userns:	User namespace of the mount the inode was created from
2146   * @inode: New inode
2147   * @dir: Directory inode
2148   * @mode: mode of the new inode
2149   *
2150   * If the inode has been created through an idmapped mount the user namespace of
2151   * the vfsmount must be passed through @mnt_userns. This function will then take
2152   * care to map the inode according to @mnt_userns before checking permissions
2153   * and initializing i_uid and i_gid. On non-idmapped mounts or if permission
2154   * checking is to be performed on the raw inode simply passs init_user_ns.
2155   */
2156  void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
2157  		      const struct inode *dir, umode_t mode)
2158  {
2159  	inode_fsuid_set(inode, mnt_userns);
2160  	if (dir && dir->i_mode & S_ISGID) {
2161  		inode->i_gid = dir->i_gid;
2162  
2163  		/* Directories are special, and always inherit S_ISGID */
2164  		if (S_ISDIR(mode))
2165  			mode |= S_ISGID;
2166  		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
2167  			 !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
2168  			 !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
2169  			mode &= ~S_ISGID;
2170  	} else
2171  		inode_fsgid_set(inode, mnt_userns);
2172  	inode->i_mode = mode;
2173  }
2174  EXPORT_SYMBOL(inode_init_owner);
2175  
2176  /**
2177   * inode_owner_or_capable - check current task permissions to inode
2178   * @mnt_userns:	user namespace of the mount the inode was found from
2179   * @inode: inode being checked
2180   *
2181   * Return true if current either has CAP_FOWNER in a namespace with the
2182   * inode owner uid mapped, or owns the file.
2183   *
2184   * If the inode has been found through an idmapped mount the user namespace of
2185   * the vfsmount must be passed through @mnt_userns. This function will then take
2186   * care to map the inode according to @mnt_userns before checking permissions.
2187   * On non-idmapped mounts or if permission checking is to be performed on the
2188   * raw inode simply passs init_user_ns.
2189   */
2190  bool inode_owner_or_capable(struct user_namespace *mnt_userns,
2191  			    const struct inode *inode)
2192  {
2193  	kuid_t i_uid;
2194  	struct user_namespace *ns;
2195  
2196  	i_uid = i_uid_into_mnt(mnt_userns, inode);
2197  	if (uid_eq(current_fsuid(), i_uid))
2198  		return true;
2199  
2200  	ns = current_user_ns();
2201  	if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
2202  		return true;
2203  	return false;
2204  }
2205  EXPORT_SYMBOL(inode_owner_or_capable);
2206  
2207  /*
2208   * Direct i/o helper functions
2209   */
2210  static void __inode_dio_wait(struct inode *inode)
2211  {
2212  	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
2213  	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
2214  
2215  	do {
2216  		prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
2217  		if (atomic_read(&inode->i_dio_count))
2218  			schedule();
2219  	} while (atomic_read(&inode->i_dio_count));
2220  	finish_wait(wq, &q.wq_entry);
2221  }
2222  
2223  /**
2224   * inode_dio_wait - wait for outstanding DIO requests to finish
2225   * @inode: inode to wait for
2226   *
2227   * Waits for all pending direct I/O requests to finish so that we can
2228   * proceed with a truncate or equivalent operation.
2229   *
2230   * Must be called under a lock that serializes taking new references
2231   * to i_dio_count, usually by inode->i_mutex.
2232   */
2233  void inode_dio_wait(struct inode *inode)
2234  {
2235  	if (atomic_read(&inode->i_dio_count))
2236  		__inode_dio_wait(inode);
2237  }
2238  EXPORT_SYMBOL(inode_dio_wait);
2239  
2240  /*
2241   * inode_set_flags - atomically set some inode flags
2242   *
2243   * Note: the caller should be holding i_mutex, or else be sure that
2244   * they have exclusive access to the inode structure (i.e., while the
2245   * inode is being instantiated).  The reason for the cmpxchg() loop
2246   * --- which wouldn't be necessary if all code paths which modify
2247   * i_flags actually followed this rule, is that there is at least one
2248   * code path which doesn't today so we use cmpxchg() out of an abundance
2249   * of caution.
2250   *
2251   * In the long run, i_mutex is overkill, and we should probably look
2252   * at using the i_lock spinlock to protect i_flags, and then make sure
2253   * it is so documented in include/linux/fs.h and that all code follows
2254   * the locking convention!!
2255   */
2256  void inode_set_flags(struct inode *inode, unsigned int flags,
2257  		     unsigned int mask)
2258  {
2259  	WARN_ON_ONCE(flags & ~mask);
2260  	set_mask_bits(&inode->i_flags, mask, flags);
2261  }
2262  EXPORT_SYMBOL(inode_set_flags);
2263  
2264  void inode_nohighmem(struct inode *inode)
2265  {
2266  	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
2267  }
2268  EXPORT_SYMBOL(inode_nohighmem);
2269  
2270  /**
2271   * timestamp_truncate - Truncate timespec to a granularity
2272   * @t: Timespec
2273   * @inode: inode being updated
2274   *
2275   * Truncate a timespec to the granularity supported by the fs
2276   * containing the inode. Always rounds down. gran must
2277   * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
2278   */
2279  struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
2280  {
2281  	struct super_block *sb = inode->i_sb;
2282  	unsigned int gran = sb->s_time_gran;
2283  
2284  	t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max);
2285  	if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
2286  		t.tv_nsec = 0;
2287  
2288  	/* Avoid division in the common cases 1 ns and 1 s. */
2289  	if (gran == 1)
2290  		; /* nothing */
2291  	else if (gran == NSEC_PER_SEC)
2292  		t.tv_nsec = 0;
2293  	else if (gran > 1 && gran < NSEC_PER_SEC)
2294  		t.tv_nsec -= t.tv_nsec % gran;
2295  	else
2296  		WARN(1, "invalid file time granularity: %u", gran);
2297  	return t;
2298  }
2299  EXPORT_SYMBOL(timestamp_truncate);
2300  
2301  /**
2302   * current_time - Return FS time
2303   * @inode: inode.
2304   *
2305   * Return the current time truncated to the time granularity supported by
2306   * the fs.
2307   *
2308   * Note that inode and inode->sb cannot be NULL.
2309   * Otherwise, the function warns and returns time without truncation.
2310   */
2311  struct timespec64 current_time(struct inode *inode)
2312  {
2313  	struct timespec64 now;
2314  
2315  	ktime_get_coarse_real_ts64(&now);
2316  
2317  	if (unlikely(!inode->i_sb)) {
2318  		WARN(1, "current_time() called with uninitialized super_block in the inode");
2319  		return now;
2320  	}
2321  
2322  	return timestamp_truncate(now, inode);
2323  }
2324  EXPORT_SYMBOL(current_time);
2325