xref: /linux/mm/shmem.c (revision 60f253ea7f1ba5122dff18812610b59477690dab)
1  /*
2   * Resizable virtual memory filesystem for Linux.
3   *
4   * Copyright (C) 2000 Linus Torvalds.
5   *		 2000 Transmeta Corp.
6   *		 2000-2001 Christoph Rohland
7   *		 2000-2001 SAP AG
8   *		 2002 Red Hat Inc.
9   * Copyright (C) 2002-2011 Hugh Dickins.
10   * Copyright (C) 2011 Google Inc.
11   * Copyright (C) 2002-2005 VERITAS Software Corporation.
12   * Copyright (C) 2004 Andi Kleen, SuSE Labs
13   *
14   * Extended attribute support for tmpfs:
15   * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16   * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17   *
18   * tiny-shmem:
19   * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20   *
21   * This file is released under the GPL.
22   */
23  
24  #include <linux/fs.h>
25  #include <linux/init.h>
26  #include <linux/vfs.h>
27  #include <linux/mount.h>
28  #include <linux/ramfs.h>
29  #include <linux/pagemap.h>
30  #include <linux/file.h>
31  #include <linux/fileattr.h>
32  #include <linux/mm.h>
33  #include <linux/random.h>
34  #include <linux/sched/signal.h>
35  #include <linux/export.h>
36  #include <linux/shmem_fs.h>
37  #include <linux/swap.h>
38  #include <linux/uio.h>
39  #include <linux/hugetlb.h>
40  #include <linux/fs_parser.h>
41  #include <linux/swapfile.h>
42  #include <linux/iversion.h>
43  #include "swap.h"
44  
45  static struct vfsmount *shm_mnt;
46  
47  #ifdef CONFIG_SHMEM
48  /*
49   * This virtual memory filesystem is heavily based on ramfs. It extends
50   * ramfs with the ability to use swap and to honor resource limits,
51   * which makes it a completely usable filesystem.
52   */
53  
54  #include <linux/xattr.h>
55  #include <linux/exportfs.h>
56  #include <linux/posix_acl.h>
57  #include <linux/posix_acl_xattr.h>
58  #include <linux/mman.h>
59  #include <linux/string.h>
60  #include <linux/slab.h>
61  #include <linux/backing-dev.h>
62  #include <linux/writeback.h>
63  #include <linux/pagevec.h>
64  #include <linux/percpu_counter.h>
65  #include <linux/falloc.h>
66  #include <linux/splice.h>
67  #include <linux/security.h>
68  #include <linux/swapops.h>
69  #include <linux/mempolicy.h>
70  #include <linux/namei.h>
71  #include <linux/ctype.h>
72  #include <linux/migrate.h>
73  #include <linux/highmem.h>
74  #include <linux/seq_file.h>
75  #include <linux/magic.h>
76  #include <linux/syscalls.h>
77  #include <linux/fcntl.h>
78  #include <uapi/linux/memfd.h>
79  #include <linux/rmap.h>
80  #include <linux/uuid.h>
81  
82  #include <linux/uaccess.h>
83  
84  #include "internal.h"
85  
86  #define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
87  #define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
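/*
 * Illustration (editorial note): with 4 KiB pages, BLOCKS_PER_PAGE is 8, i.e.
 * each page contributes eight 512-byte units to inode->i_blocks, and
 * VM_ACCT(size) is the number of whole pages needed to cover size, e.g.
 * VM_ACCT(5000) == 2 on such a configuration.
 */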
88  
89  /* Pretend that each entry is of this size in directory's i_size */
90  #define BOGO_DIRENT_SIZE 20
91  
92  /* Symlinks up to this size are kmalloc'ed instead of using a swappable page */
93  #define SHORT_SYMLINK_LEN 128
94  
95  /*
96   * shmem_fallocate communicates with shmem_fault or shmem_writepage via
97   * inode->i_private (with i_rwsem making sure that it has only one user at
98   * a time): we would prefer not to enlarge the shmem inode just for that.
99   */
100  struct shmem_falloc {
101  	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
102  	pgoff_t start;		/* start of range currently being fallocated */
103  	pgoff_t next;		/* the next page offset to be fallocated */
104  	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
105  	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
106  };
107  
108  struct shmem_options {
109  	unsigned long long blocks;
110  	unsigned long long inodes;
111  	struct mempolicy *mpol;
112  	kuid_t uid;
113  	kgid_t gid;
114  	umode_t mode;
115  	bool full_inums;
116  	int huge;
117  	int seen;
118  	bool noswap;
119  #define SHMEM_SEEN_BLOCKS 1
120  #define SHMEM_SEEN_INODES 2
121  #define SHMEM_SEEN_HUGE 4
122  #define SHMEM_SEEN_INUMS 8
123  #define SHMEM_SEEN_NOSWAP 16
124  };
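/*
 * Note: the SHMEM_SEEN_* bits are OR'ed into ->seen as each mount option is
 * parsed (e.g. seen |= SHMEM_SEEN_BLOCKS), so that remount handling can tell
 * which options were explicitly supplied and which should keep their current
 * values.
 */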
125  
126  #ifdef CONFIG_TMPFS
127  static unsigned long shmem_default_max_blocks(void)
128  {
129  	return totalram_pages() / 2;
130  }
131  
132  static unsigned long shmem_default_max_inodes(void)
133  {
134  	unsigned long nr_pages = totalram_pages();
135  
136  	return min(nr_pages - totalhigh_pages(), nr_pages / 2);
137  }
138  #endif
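/*
 * In other words, an unsized tmpfs mount defaults to half of RAM.  For
 * example, with 8 GiB of RAM and 4 KiB pages that is roughly 1,048,576
 * blocks (4 GiB), with a similar ceiling on the number of inodes.
 */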
139  
140  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
141  			     struct folio **foliop, enum sgp_type sgp,
142  			     gfp_t gfp, struct vm_area_struct *vma,
143  			     vm_fault_t *fault_type);
144  
145  static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
146  {
147  	return sb->s_fs_info;
148  }
149  
150  /*
151   * shmem_file_setup pre-accounts the whole fixed size of a VM object,
152   * for shared memory and for shared anonymous (/dev/zero) mappings
153   * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
154   * consistent with the pre-accounting of private mappings ...
155   */
156  static inline int shmem_acct_size(unsigned long flags, loff_t size)
157  {
158  	return (flags & VM_NORESERVE) ?
159  		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
160  }
161  
162  static inline void shmem_unacct_size(unsigned long flags, loff_t size)
163  {
164  	if (!(flags & VM_NORESERVE))
165  		vm_unacct_memory(VM_ACCT(size));
166  }
167  
168  static inline int shmem_reacct_size(unsigned long flags,
169  		loff_t oldsize, loff_t newsize)
170  {
171  	if (!(flags & VM_NORESERVE)) {
172  		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
173  			return security_vm_enough_memory_mm(current->mm,
174  					VM_ACCT(newsize) - VM_ACCT(oldsize));
175  		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
176  			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
177  	}
178  	return 0;
179  }
180  
181  /*
182   * ... whereas tmpfs objects are accounted incrementally as
183   * pages are allocated, in order to allow large sparse files.
184   * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
185   * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
186   */
187  static inline int shmem_acct_block(unsigned long flags, long pages)
188  {
189  	if (!(flags & VM_NORESERVE))
190  		return 0;
191  
192  	return security_vm_enough_memory_mm(current->mm,
193  			pages * VM_ACCT(PAGE_SIZE));
194  }
195  
196  static inline void shmem_unacct_blocks(unsigned long flags, long pages)
197  {
198  	if (flags & VM_NORESERVE)
199  		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
200  }
201  
202  static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
203  {
204  	struct shmem_inode_info *info = SHMEM_I(inode);
205  	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
206  
207  	if (shmem_acct_block(info->flags, pages))
208  		return false;
209  
210  	if (sbinfo->max_blocks) {
211  		if (percpu_counter_compare(&sbinfo->used_blocks,
212  					   sbinfo->max_blocks - pages) > 0)
213  			goto unacct;
214  		percpu_counter_add(&sbinfo->used_blocks, pages);
215  	}
216  
217  	return true;
218  
219  unacct:
220  	shmem_unacct_blocks(info->flags, pages);
221  	return false;
222  }
223  
224  static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
225  {
226  	struct shmem_inode_info *info = SHMEM_I(inode);
227  	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
228  
229  	if (sbinfo->max_blocks)
230  		percpu_counter_sub(&sbinfo->used_blocks, pages);
231  	shmem_unacct_blocks(info->flags, pages);
232  }
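/*
 * Summary: block accounting is two-level.  shmem_acct_block() charges the
 * system-wide overcommit accounting, but only for VM_NORESERVE objects
 * (those not pre-accounted up front by shmem_acct_size()); the percpu
 * used_blocks counter then enforces the per-superblock size limit whenever
 * max_blocks is non-zero.
 */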
233  
234  static const struct super_operations shmem_ops;
235  const struct address_space_operations shmem_aops;
236  static const struct file_operations shmem_file_operations;
237  static const struct inode_operations shmem_inode_operations;
238  static const struct inode_operations shmem_dir_inode_operations;
239  static const struct inode_operations shmem_special_inode_operations;
240  static const struct vm_operations_struct shmem_vm_ops;
241  static const struct vm_operations_struct shmem_anon_vm_ops;
242  static struct file_system_type shmem_fs_type;
243  
244  bool vma_is_anon_shmem(struct vm_area_struct *vma)
245  {
246  	return vma->vm_ops == &shmem_anon_vm_ops;
247  }
248  
249  bool vma_is_shmem(struct vm_area_struct *vma)
250  {
251  	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
252  }
253  
254  static LIST_HEAD(shmem_swaplist);
255  static DEFINE_MUTEX(shmem_swaplist_mutex);
256  
257  /*
258   * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
259   * produces a novel ino for the newly allocated inode.
260   *
261   * It may also be called when making a hard link, to account for the space needed by
262   * each dentry. However, in that case, no new inode number is needed since that
263   * internally draws from another pool of inode numbers (currently global
264   * get_next_ino()). This case is indicated by passing NULL as inop.
265   */
266  #define SHMEM_INO_BATCH 1024
267  static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
268  {
269  	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
270  	ino_t ino;
271  
272  	if (!(sb->s_flags & SB_KERNMOUNT)) {
273  		raw_spin_lock(&sbinfo->stat_lock);
274  		if (sbinfo->max_inodes) {
275  			if (!sbinfo->free_inodes) {
276  				raw_spin_unlock(&sbinfo->stat_lock);
277  				return -ENOSPC;
278  			}
279  			sbinfo->free_inodes--;
280  		}
281  		if (inop) {
282  			ino = sbinfo->next_ino++;
283  			if (unlikely(is_zero_ino(ino)))
284  				ino = sbinfo->next_ino++;
285  			if (unlikely(!sbinfo->full_inums &&
286  				     ino > UINT_MAX)) {
287  				/*
288  				 * Emulate get_next_ino uint wraparound for
289  				 * compatibility
290  				 */
291  				if (IS_ENABLED(CONFIG_64BIT))
292  					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
293  						__func__, MINOR(sb->s_dev));
294  				sbinfo->next_ino = 1;
295  				ino = sbinfo->next_ino++;
296  			}
297  			*inop = ino;
298  		}
299  		raw_spin_unlock(&sbinfo->stat_lock);
300  	} else if (inop) {
301  		/*
302  		 * __shmem_file_setup, one of our callers, is lock-free: it
303  		 * doesn't hold stat_lock in shmem_reserve_inode since
304  		 * max_inodes is always 0, and is called from potentially
305  		 * unknown contexts. As such, use a per-cpu batched allocator
306  		 * which doesn't require the per-sb stat_lock unless we are at
307  		 * the batch boundary.
308  		 *
309  		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
310  		 * shmem mounts are not exposed to userspace, so we don't need
311  		 * to worry about things like glibc compatibility.
312  		 */
313  		ino_t *next_ino;
314  
315  		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
316  		ino = *next_ino;
317  		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
318  			raw_spin_lock(&sbinfo->stat_lock);
319  			ino = sbinfo->next_ino;
320  			sbinfo->next_ino += SHMEM_INO_BATCH;
321  			raw_spin_unlock(&sbinfo->stat_lock);
322  			if (unlikely(is_zero_ino(ino)))
323  				ino++;
324  		}
325  		*inop = ino;
326  		*next_ino = ++ino;
327  		put_cpu();
328  	}
329  
330  	return 0;
331  }
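/*
 * Note: for SB_KERNMOUNT mounts the branch above hands out inode numbers
 * from a per-cpu batch of SHMEM_INO_BATCH (1024), so stat_lock is only
 * taken once a CPU has exhausted its batch rather than on every allocation.
 */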
332  
333  static void shmem_free_inode(struct super_block *sb)
334  {
335  	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
336  	if (sbinfo->max_inodes) {
337  		raw_spin_lock(&sbinfo->stat_lock);
338  		sbinfo->free_inodes++;
339  		raw_spin_unlock(&sbinfo->stat_lock);
340  	}
341  }
342  
343  /**
344   * shmem_recalc_inode - recalculate the block usage of an inode
345   * @inode: inode to recalc
346   *
347   * We have to calculate the free blocks since the mm can drop
348   * undirtied hole pages behind our back.
349   *
350   * But normally, info->alloced == inode->i_mapping->nrpages + info->swapped,
351   * so the mm has freed info->alloced - (inode->i_mapping->nrpages + info->swapped).
352   *
353   * It has to be called with the spinlock held.
354   */
355  static void shmem_recalc_inode(struct inode *inode)
356  {
357  	struct shmem_inode_info *info = SHMEM_I(inode);
358  	long freed;
359  
360  	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
361  	if (freed > 0) {
362  		info->alloced -= freed;
363  		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
364  		shmem_inode_unacct_blocks(inode, freed);
365  	}
366  }
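/*
 * Worked example: if info->alloced == 10, info->swapped == 2 and nrpages == 6,
 * then freed == 2, so alloced drops to 8, i_blocks drops by 2 * BLOCKS_PER_PAGE
 * and two blocks are returned via shmem_inode_unacct_blocks().
 */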
367  
368  bool shmem_charge(struct inode *inode, long pages)
369  {
370  	struct shmem_inode_info *info = SHMEM_I(inode);
371  	unsigned long flags;
372  
373  	if (!shmem_inode_acct_block(inode, pages))
374  		return false;
375  
376  	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
377  	inode->i_mapping->nrpages += pages;
378  
379  	spin_lock_irqsave(&info->lock, flags);
380  	info->alloced += pages;
381  	inode->i_blocks += pages * BLOCKS_PER_PAGE;
382  	shmem_recalc_inode(inode);
383  	spin_unlock_irqrestore(&info->lock, flags);
384  
385  	return true;
386  }
387  
388  void shmem_uncharge(struct inode *inode, long pages)
389  {
390  	struct shmem_inode_info *info = SHMEM_I(inode);
391  	unsigned long flags;
392  
393  	/* nrpages adjustment done by __filemap_remove_folio() or caller */
394  
395  	spin_lock_irqsave(&info->lock, flags);
396  	info->alloced -= pages;
397  	inode->i_blocks -= pages * BLOCKS_PER_PAGE;
398  	shmem_recalc_inode(inode);
399  	spin_unlock_irqrestore(&info->lock, flags);
400  
401  	shmem_inode_unacct_blocks(inode, pages);
402  }
403  
404  /*
405   * Replace item expected in xarray by a new item, while holding xa_lock.
406   */
407  static int shmem_replace_entry(struct address_space *mapping,
408  			pgoff_t index, void *expected, void *replacement)
409  {
410  	XA_STATE(xas, &mapping->i_pages, index);
411  	void *item;
412  
413  	VM_BUG_ON(!expected);
414  	VM_BUG_ON(!replacement);
415  	item = xas_load(&xas);
416  	if (item != expected)
417  		return -ENOENT;
418  	xas_store(&xas, replacement);
419  	return 0;
420  }
421  
422  /*
423   * Sometimes, before we decide whether to proceed or to fail, we must check
424   * that an entry was not already brought back from swap by a racing thread.
425   *
426   * Checking page is not enough: by the time a SwapCache page is locked, it
427   * might be reused, and again be SwapCache, using the same swap as before.
428   */
429  static bool shmem_confirm_swap(struct address_space *mapping,
430  			       pgoff_t index, swp_entry_t swap)
431  {
432  	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
433  }
434  
435  /*
436   * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
437   *
438   * SHMEM_HUGE_NEVER:
439   *	disables huge pages for the mount;
440   * SHMEM_HUGE_ALWAYS:
441   *	enables huge pages for the mount;
442   * SHMEM_HUGE_WITHIN_SIZE:
443   *	only allocate huge pages if the page will be fully within i_size,
444   *	also respect fadvise()/madvise() hints;
445   * SHMEM_HUGE_ADVISE:
446   *	only allocate huge pages if requested with fadvise()/madvise();
447   */
448  
449  #define SHMEM_HUGE_NEVER	0
450  #define SHMEM_HUGE_ALWAYS	1
451  #define SHMEM_HUGE_WITHIN_SIZE	2
452  #define SHMEM_HUGE_ADVISE	3
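/*
 * For example, a mount like
 *	mount -t tmpfs -o size=2G,huge=within_size tmpfs /mnt
 * selects SHMEM_HUGE_WITHIN_SIZE for that filesystem.
 */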
453  
454  /*
455   * Special values.
456   * These can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
457   *
458   * SHMEM_HUGE_DENY:
459   *	disables huge on shm_mnt and all mounts, for emergency use;
460   * SHMEM_HUGE_FORCE:
461   *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
462   *
463   */
464  #define SHMEM_HUGE_DENY		(-1)
465  #define SHMEM_HUGE_FORCE	(-2)
466  
467  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
468  /* ifdef here to avoid bloating shmem.o when not necessary */
469  
470  static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
471  
472  bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
473  		   struct mm_struct *mm, unsigned long vm_flags)
474  {
475  	loff_t i_size;
476  
477  	if (!S_ISREG(inode->i_mode))
478  		return false;
479  	if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags)))
480  		return false;
481  	if (shmem_huge == SHMEM_HUGE_DENY)
482  		return false;
483  	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
484  		return true;
485  
486  	switch (SHMEM_SB(inode->i_sb)->huge) {
487  	case SHMEM_HUGE_ALWAYS:
488  		return true;
489  	case SHMEM_HUGE_WITHIN_SIZE:
490  		index = round_up(index + 1, HPAGE_PMD_NR);
491  		i_size = round_up(i_size_read(inode), PAGE_SIZE);
492  		if (i_size >> PAGE_SHIFT >= index)
493  			return true;
494  		fallthrough;
495  	case SHMEM_HUGE_ADVISE:
496  		if (mm && (vm_flags & VM_HUGEPAGE))
497  			return true;
498  		fallthrough;
499  	default:
500  		return false;
501  	}
502  }
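/*
 * Rough example for SHMEM_HUGE_WITHIN_SIZE above: on x86_64 with 4 KiB pages,
 * a fault at index 0 is only served with a PMD-sized folio if i_size (rounded
 * up to a page) reaches at least 2 MiB, i.e. the huge folio would lie entirely
 * within the file size.
 */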
503  
504  #if defined(CONFIG_SYSFS)
505  static int shmem_parse_huge(const char *str)
506  {
507  	if (!strcmp(str, "never"))
508  		return SHMEM_HUGE_NEVER;
509  	if (!strcmp(str, "always"))
510  		return SHMEM_HUGE_ALWAYS;
511  	if (!strcmp(str, "within_size"))
512  		return SHMEM_HUGE_WITHIN_SIZE;
513  	if (!strcmp(str, "advise"))
514  		return SHMEM_HUGE_ADVISE;
515  	if (!strcmp(str, "deny"))
516  		return SHMEM_HUGE_DENY;
517  	if (!strcmp(str, "force"))
518  		return SHMEM_HUGE_FORCE;
519  	return -EINVAL;
520  }
521  #endif
522  
523  #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
524  static const char *shmem_format_huge(int huge)
525  {
526  	switch (huge) {
527  	case SHMEM_HUGE_NEVER:
528  		return "never";
529  	case SHMEM_HUGE_ALWAYS:
530  		return "always";
531  	case SHMEM_HUGE_WITHIN_SIZE:
532  		return "within_size";
533  	case SHMEM_HUGE_ADVISE:
534  		return "advise";
535  	case SHMEM_HUGE_DENY:
536  		return "deny";
537  	case SHMEM_HUGE_FORCE:
538  		return "force";
539  	default:
540  		VM_BUG_ON(1);
541  		return "bad_val";
542  	}
543  }
544  #endif
545  
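/*
 * Walk sbinfo->shrinklist and split the huge folio straddling EOF on each
 * inode, so that the tail pages beyond i_size can be reclaimed.  Inodes whose
 * page-rounded size already ends on a PMD boundary have nothing to gain and
 * are simply dropped from the list.  Returns the number of folios split, or
 * SHRINK_STOP when the list is empty; nr_to_split == 0 means no limit.
 */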
546  static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
547  		struct shrink_control *sc, unsigned long nr_to_split)
548  {
549  	LIST_HEAD(list), *pos, *next;
550  	LIST_HEAD(to_remove);
551  	struct inode *inode;
552  	struct shmem_inode_info *info;
553  	struct folio *folio;
554  	unsigned long batch = sc ? sc->nr_to_scan : 128;
555  	int split = 0;
556  
557  	if (list_empty(&sbinfo->shrinklist))
558  		return SHRINK_STOP;
559  
560  	spin_lock(&sbinfo->shrinklist_lock);
561  	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
562  		info = list_entry(pos, struct shmem_inode_info, shrinklist);
563  
564  		/* pin the inode */
565  		inode = igrab(&info->vfs_inode);
566  
567  		/* inode is about to be evicted */
568  		if (!inode) {
569  			list_del_init(&info->shrinklist);
570  			goto next;
571  		}
572  
573  		/* Check if there's anything to gain */
574  		if (round_up(inode->i_size, PAGE_SIZE) ==
575  				round_up(inode->i_size, HPAGE_PMD_SIZE)) {
576  			list_move(&info->shrinklist, &to_remove);
577  			goto next;
578  		}
579  
580  		list_move(&info->shrinklist, &list);
581  next:
582  		sbinfo->shrinklist_len--;
583  		if (!--batch)
584  			break;
585  	}
586  	spin_unlock(&sbinfo->shrinklist_lock);
587  
588  	list_for_each_safe(pos, next, &to_remove) {
589  		info = list_entry(pos, struct shmem_inode_info, shrinklist);
590  		inode = &info->vfs_inode;
591  		list_del_init(&info->shrinklist);
592  		iput(inode);
593  	}
594  
595  	list_for_each_safe(pos, next, &list) {
596  		int ret;
597  		pgoff_t index;
598  
599  		info = list_entry(pos, struct shmem_inode_info, shrinklist);
600  		inode = &info->vfs_inode;
601  
602  		if (nr_to_split && split >= nr_to_split)
603  			goto move_back;
604  
605  		index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
606  		folio = filemap_get_folio(inode->i_mapping, index);
607  		if (IS_ERR(folio))
608  			goto drop;
609  
610  		/* No huge page at the end of the file: nothing to split */
611  		if (!folio_test_large(folio)) {
612  			folio_put(folio);
613  			goto drop;
614  		}
615  
616  		/*
617  		 * Move the inode back to the shrinklist if we failed to lock
618  		 * the folio at this time.
619  		 *
620  		 * Waiting for the lock may lead to deadlock in the
621  		 * reclaim path.
622  		 */
623  		if (!folio_trylock(folio)) {
624  			folio_put(folio);
625  			goto move_back;
626  		}
627  
628  		ret = split_folio(folio);
629  		folio_unlock(folio);
630  		folio_put(folio);
631  
632  		/* If the split failed, move the inode back to the shrinklist */
633  		if (ret)
634  			goto move_back;
635  
636  		split++;
637  drop:
638  		list_del_init(&info->shrinklist);
639  		goto put;
640  move_back:
641  		/*
642  		 * Make sure the inode is either on the global list or deleted
643  		 * from any local list before iput() since it could be deleted
644  		 * in another thread once we put the inode (then the local list
645  		 * is corrupted).
646  		 */
647  		spin_lock(&sbinfo->shrinklist_lock);
648  		list_move(&info->shrinklist, &sbinfo->shrinklist);
649  		sbinfo->shrinklist_len++;
650  		spin_unlock(&sbinfo->shrinklist_lock);
651  put:
652  		iput(inode);
653  	}
654  
655  	return split;
656  }
657  
658  static long shmem_unused_huge_scan(struct super_block *sb,
659  		struct shrink_control *sc)
660  {
661  	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
662  
663  	if (!READ_ONCE(sbinfo->shrinklist_len))
664  		return SHRINK_STOP;
665  
666  	return shmem_unused_huge_shrink(sbinfo, sc, 0);
667  }
668  
669  static long shmem_unused_huge_count(struct super_block *sb,
670  		struct shrink_control *sc)
671  {
672  	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
673  	return READ_ONCE(sbinfo->shrinklist_len);
674  }
675  #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
676  
677  #define shmem_huge SHMEM_HUGE_DENY
678  
679  bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
680  		   struct mm_struct *mm, unsigned long vm_flags)
681  {
682  	return false;
683  }
684  
685  static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
686  		struct shrink_control *sc, unsigned long nr_to_split)
687  {
688  	return 0;
689  }
690  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
691  
692  /*
693   * Like filemap_add_folio, but error if expected item has gone.
694   */
695  static int shmem_add_to_page_cache(struct folio *folio,
696  				   struct address_space *mapping,
697  				   pgoff_t index, void *expected, gfp_t gfp,
698  				   struct mm_struct *charge_mm)
699  {
700  	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
701  	long nr = folio_nr_pages(folio);
702  	int error;
703  
704  	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
705  	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
706  	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
707  	VM_BUG_ON(expected && folio_test_large(folio));
708  
709  	folio_ref_add(folio, nr);
710  	folio->mapping = mapping;
711  	folio->index = index;
712  
713  	if (!folio_test_swapcache(folio)) {
714  		error = mem_cgroup_charge(folio, charge_mm, gfp);
715  		if (error) {
716  			if (folio_test_pmd_mappable(folio)) {
717  				count_vm_event(THP_FILE_FALLBACK);
718  				count_vm_event(THP_FILE_FALLBACK_CHARGE);
719  			}
720  			goto error;
721  		}
722  	}
723  	folio_throttle_swaprate(folio, gfp);
724  
725  	do {
726  		xas_lock_irq(&xas);
727  		if (expected != xas_find_conflict(&xas)) {
728  			xas_set_err(&xas, -EEXIST);
729  			goto unlock;
730  		}
731  		if (expected && xas_find_conflict(&xas)) {
732  			xas_set_err(&xas, -EEXIST);
733  			goto unlock;
734  		}
735  		xas_store(&xas, folio);
736  		if (xas_error(&xas))
737  			goto unlock;
738  		if (folio_test_pmd_mappable(folio)) {
739  			count_vm_event(THP_FILE_ALLOC);
740  			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
741  		}
742  		mapping->nrpages += nr;
743  		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
744  		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
745  unlock:
746  		xas_unlock_irq(&xas);
747  	} while (xas_nomem(&xas, gfp));
748  
749  	if (xas_error(&xas)) {
750  		error = xas_error(&xas);
751  		goto error;
752  	}
753  
754  	return 0;
755  error:
756  	folio->mapping = NULL;
757  	folio_ref_sub(folio, nr);
758  	return error;
759  }
760  
761  /*
762   * Like delete_from_page_cache, but substitutes swap for @folio.
763   */
764  static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
765  {
766  	struct address_space *mapping = folio->mapping;
767  	long nr = folio_nr_pages(folio);
768  	int error;
769  
770  	xa_lock_irq(&mapping->i_pages);
771  	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
772  	folio->mapping = NULL;
773  	mapping->nrpages -= nr;
774  	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
775  	__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
776  	xa_unlock_irq(&mapping->i_pages);
777  	folio_put(folio);
778  	BUG_ON(error);
779  }
780  
781  /*
782   * Remove swap entry from page cache, free the swap and its page cache.
783   */
784  static int shmem_free_swap(struct address_space *mapping,
785  			   pgoff_t index, void *radswap)
786  {
787  	void *old;
788  
789  	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
790  	if (old != radswap)
791  		return -ENOENT;
792  	free_swap_and_cache(radix_to_swp_entry(radswap));
793  	return 0;
794  }
795  
796  /*
797   * Determine (in bytes) how many of the shmem object's pages mapped by the
798   * given offsets are swapped out.
799   *
800   * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
801   * as long as the inode doesn't go away and racy results are not a problem.
802   */
803  unsigned long shmem_partial_swap_usage(struct address_space *mapping,
804  						pgoff_t start, pgoff_t end)
805  {
806  	XA_STATE(xas, &mapping->i_pages, start);
807  	struct page *page;
808  	unsigned long swapped = 0;
809  
810  	rcu_read_lock();
811  	xas_for_each(&xas, page, end - 1) {
812  		if (xas_retry(&xas, page))
813  			continue;
814  		if (xa_is_value(page))
815  			swapped++;
816  
817  		if (need_resched()) {
818  			xas_pause(&xas);
819  			cond_resched_rcu();
820  		}
821  	}
822  
823  	rcu_read_unlock();
824  
825  	return swapped << PAGE_SHIFT;
826  }
827  
828  /*
829   * Determine (in bytes) how many of the shmem object's pages mapped by the
830   * given vma are swapped out.
831   *
832   * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
833   * as long as the inode doesn't go away and racy results are not a problem.
834   */
835  unsigned long shmem_swap_usage(struct vm_area_struct *vma)
836  {
837  	struct inode *inode = file_inode(vma->vm_file);
838  	struct shmem_inode_info *info = SHMEM_I(inode);
839  	struct address_space *mapping = inode->i_mapping;
840  	unsigned long swapped;
841  
842  	/* Be careful as we don't hold info->lock */
843  	swapped = READ_ONCE(info->swapped);
844  
845  	/*
846  	 * The easier cases are when the shmem object has nothing in swap, or
847  	 * the vma maps it whole. Then we can simply use the stats that we
848  	 * already track.
849  	 */
850  	if (!swapped)
851  		return 0;
852  
853  	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
854  		return swapped << PAGE_SHIFT;
855  
856  	/* Here comes the more involved part */
857  	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
858  					vma->vm_pgoff + vma_pages(vma));
859  }
860  
861  /*
862   * SysV IPC SHM_UNLOCK restores Unevictable pages to their evictable lists.
863   */
864  void shmem_unlock_mapping(struct address_space *mapping)
865  {
866  	struct folio_batch fbatch;
867  	pgoff_t index = 0;
868  
869  	folio_batch_init(&fbatch);
870  	/*
871  	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
872  	 */
873  	while (!mapping_unevictable(mapping) &&
874  	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
875  		check_move_unevictable_folios(&fbatch);
876  		folio_batch_release(&fbatch);
877  		cond_resched();
878  	}
879  }
880  
881  static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
882  {
883  	struct folio *folio;
884  
885  	/*
886  	 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
887  	 * beyond i_size, and reports fallocated folios as holes.
888  	 */
889  	folio = filemap_get_entry(inode->i_mapping, index);
890  	if (!folio)
891  		return folio;
892  	if (!xa_is_value(folio)) {
893  		folio_lock(folio);
894  		if (folio->mapping == inode->i_mapping)
895  			return folio;
896  		/* The folio has been swapped out */
897  		folio_unlock(folio);
898  		folio_put(folio);
899  	}
900  	/*
901  	 * But read a folio back from swap if any of it is within i_size
902  	 * (although in some cases this is just a waste of time).
903  	 */
904  	folio = NULL;
905  	shmem_get_folio(inode, index, &folio, SGP_READ);
906  	return folio;
907  }
908  
909  /*
910   * Remove range of pages and swap entries from page cache, and free them.
911   * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
912   */
913  static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
914  								 bool unfalloc)
915  {
916  	struct address_space *mapping = inode->i_mapping;
917  	struct shmem_inode_info *info = SHMEM_I(inode);
918  	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
919  	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
920  	struct folio_batch fbatch;
921  	pgoff_t indices[PAGEVEC_SIZE];
922  	struct folio *folio;
923  	bool same_folio;
924  	long nr_swaps_freed = 0;
925  	pgoff_t index;
926  	int i;
927  
928  	if (lend == -1)
929  		end = -1;	/* unsigned, so actually very big */
930  
931  	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
932  		info->fallocend = start;
933  
934  	folio_batch_init(&fbatch);
935  	index = start;
936  	while (index < end && find_lock_entries(mapping, &index, end - 1,
937  			&fbatch, indices)) {
938  		for (i = 0; i < folio_batch_count(&fbatch); i++) {
939  			folio = fbatch.folios[i];
940  
941  			if (xa_is_value(folio)) {
942  				if (unfalloc)
943  					continue;
944  				nr_swaps_freed += !shmem_free_swap(mapping,
945  							indices[i], folio);
946  				continue;
947  			}
948  
949  			if (!unfalloc || !folio_test_uptodate(folio))
950  				truncate_inode_folio(mapping, folio);
951  			folio_unlock(folio);
952  		}
953  		folio_batch_remove_exceptionals(&fbatch);
954  		folio_batch_release(&fbatch);
955  		cond_resched();
956  	}
957  
958  	/*
959  	 * When undoing a failed fallocate, we want none of the partial folio
960  	 * zeroing and splitting below, but we do want to truncate the whole
961  	 * folio when !uptodate indicates that it was added by this fallocate,
962  	 * even when [lstart, lend] covers only a part of the folio.
963  	 */
964  	if (unfalloc)
965  		goto whole_folios;
966  
967  	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
968  	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
969  	if (folio) {
970  		same_folio = lend < folio_pos(folio) + folio_size(folio);
971  		folio_mark_dirty(folio);
972  		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
973  			start = folio->index + folio_nr_pages(folio);
974  			if (same_folio)
975  				end = folio->index;
976  		}
977  		folio_unlock(folio);
978  		folio_put(folio);
979  		folio = NULL;
980  	}
981  
982  	if (!same_folio)
983  		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
984  	if (folio) {
985  		folio_mark_dirty(folio);
986  		if (!truncate_inode_partial_folio(folio, lstart, lend))
987  			end = folio->index;
988  		folio_unlock(folio);
989  		folio_put(folio);
990  	}
991  
992  whole_folios:
993  
994  	index = start;
995  	while (index < end) {
996  		cond_resched();
997  
998  		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
999  				indices)) {
1000  			/* If all gone or hole-punch or unfalloc, we're done */
1001  			if (index == start || end != -1)
1002  				break;
1003  			/* But if truncating, restart to make sure all gone */
1004  			index = start;
1005  			continue;
1006  		}
1007  		for (i = 0; i < folio_batch_count(&fbatch); i++) {
1008  			folio = fbatch.folios[i];
1009  
1010  			if (xa_is_value(folio)) {
1011  				if (unfalloc)
1012  					continue;
1013  				if (shmem_free_swap(mapping, indices[i], folio)) {
1014  					/* Swap was replaced by page: retry */
1015  					index = indices[i];
1016  					break;
1017  				}
1018  				nr_swaps_freed++;
1019  				continue;
1020  			}
1021  
1022  			folio_lock(folio);
1023  
1024  			if (!unfalloc || !folio_test_uptodate(folio)) {
1025  				if (folio_mapping(folio) != mapping) {
1026  					/* Page was replaced by swap: retry */
1027  					folio_unlock(folio);
1028  					index = indices[i];
1029  					break;
1030  				}
1031  				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
1032  						folio);
1033  				truncate_inode_folio(mapping, folio);
1034  			}
1035  			folio_unlock(folio);
1036  		}
1037  		folio_batch_remove_exceptionals(&fbatch);
1038  		folio_batch_release(&fbatch);
1039  	}
1040  
1041  	spin_lock_irq(&info->lock);
1042  	info->swapped -= nr_swaps_freed;
1043  	shmem_recalc_inode(inode);
1044  	spin_unlock_irq(&info->lock);
1045  }
1046  
1047  void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
1048  {
1049  	shmem_undo_range(inode, lstart, lend, false);
1050  	inode->i_ctime = inode->i_mtime = current_time(inode);
1051  	inode_inc_iversion(inode);
1052  }
1053  EXPORT_SYMBOL_GPL(shmem_truncate_range);
1054  
1055  static int shmem_getattr(struct mnt_idmap *idmap,
1056  			 const struct path *path, struct kstat *stat,
1057  			 u32 request_mask, unsigned int query_flags)
1058  {
1059  	struct inode *inode = path->dentry->d_inode;
1060  	struct shmem_inode_info *info = SHMEM_I(inode);
1061  
1062  	if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
1063  		spin_lock_irq(&info->lock);
1064  		shmem_recalc_inode(inode);
1065  		spin_unlock_irq(&info->lock);
1066  	}
1067  	if (info->fsflags & FS_APPEND_FL)
1068  		stat->attributes |= STATX_ATTR_APPEND;
1069  	if (info->fsflags & FS_IMMUTABLE_FL)
1070  		stat->attributes |= STATX_ATTR_IMMUTABLE;
1071  	if (info->fsflags & FS_NODUMP_FL)
1072  		stat->attributes |= STATX_ATTR_NODUMP;
1073  	stat->attributes_mask |= (STATX_ATTR_APPEND |
1074  			STATX_ATTR_IMMUTABLE |
1075  			STATX_ATTR_NODUMP);
1076  	generic_fillattr(idmap, inode, stat);
1077  
1078  	if (shmem_is_huge(inode, 0, false, NULL, 0))
1079  		stat->blksize = HPAGE_PMD_SIZE;
1080  
1081  	if (request_mask & STATX_BTIME) {
1082  		stat->result_mask |= STATX_BTIME;
1083  		stat->btime.tv_sec = info->i_crtime.tv_sec;
1084  		stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1085  	}
1086  
1087  	return 0;
1088  }
1089  
1090  static int shmem_setattr(struct mnt_idmap *idmap,
1091  			 struct dentry *dentry, struct iattr *attr)
1092  {
1093  	struct inode *inode = d_inode(dentry);
1094  	struct shmem_inode_info *info = SHMEM_I(inode);
1095  	int error;
1096  	bool update_mtime = false;
1097  	bool update_ctime = true;
1098  
1099  	error = setattr_prepare(idmap, dentry, attr);
1100  	if (error)
1101  		return error;
1102  
1103  	if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1104  		if ((inode->i_mode ^ attr->ia_mode) & 0111) {
1105  			return -EPERM;
1106  		}
1107  	}
1108  
1109  	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1110  		loff_t oldsize = inode->i_size;
1111  		loff_t newsize = attr->ia_size;
1112  
1113  		/* protected by i_rwsem */
1114  		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1115  		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1116  			return -EPERM;
1117  
1118  		if (newsize != oldsize) {
1119  			error = shmem_reacct_size(SHMEM_I(inode)->flags,
1120  					oldsize, newsize);
1121  			if (error)
1122  				return error;
1123  			i_size_write(inode, newsize);
1124  			update_mtime = true;
1125  		} else {
1126  			update_ctime = false;
1127  		}
1128  		if (newsize <= oldsize) {
1129  			loff_t holebegin = round_up(newsize, PAGE_SIZE);
1130  			if (oldsize > holebegin)
1131  				unmap_mapping_range(inode->i_mapping,
1132  							holebegin, 0, 1);
1133  			if (info->alloced)
1134  				shmem_truncate_range(inode,
1135  							newsize, (loff_t)-1);
1136  			/* unmap again to remove racily COWed private pages */
1137  			if (oldsize > holebegin)
1138  				unmap_mapping_range(inode->i_mapping,
1139  							holebegin, 0, 1);
1140  		}
1141  	}
1142  
1143  	setattr_copy(idmap, inode, attr);
1144  	if (attr->ia_valid & ATTR_MODE)
1145  		error = posix_acl_chmod(idmap, dentry, inode->i_mode);
1146  	if (!error && update_ctime) {
1147  		inode->i_ctime = current_time(inode);
1148  		if (update_mtime)
1149  			inode->i_mtime = inode->i_ctime;
1150  		inode_inc_iversion(inode);
1151  	}
1152  	return error;
1153  }
1154  
1155  static void shmem_evict_inode(struct inode *inode)
1156  {
1157  	struct shmem_inode_info *info = SHMEM_I(inode);
1158  	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1159  
1160  	if (shmem_mapping(inode->i_mapping)) {
1161  		shmem_unacct_size(info->flags, inode->i_size);
1162  		inode->i_size = 0;
1163  		mapping_set_exiting(inode->i_mapping);
1164  		shmem_truncate_range(inode, 0, (loff_t)-1);
1165  		if (!list_empty(&info->shrinklist)) {
1166  			spin_lock(&sbinfo->shrinklist_lock);
1167  			if (!list_empty(&info->shrinklist)) {
1168  				list_del_init(&info->shrinklist);
1169  				sbinfo->shrinklist_len--;
1170  			}
1171  			spin_unlock(&sbinfo->shrinklist_lock);
1172  		}
1173  		while (!list_empty(&info->swaplist)) {
1174  			/* Wait while shmem_unuse() is scanning this inode... */
1175  			wait_var_event(&info->stop_eviction,
1176  				       !atomic_read(&info->stop_eviction));
1177  			mutex_lock(&shmem_swaplist_mutex);
1178  			/* ...but beware of the race if we peeked too early */
1179  			if (!atomic_read(&info->stop_eviction))
1180  				list_del_init(&info->swaplist);
1181  			mutex_unlock(&shmem_swaplist_mutex);
1182  		}
1183  	}
1184  
1185  	simple_xattrs_free(&info->xattrs);
1186  	WARN_ON(inode->i_blocks);
1187  	shmem_free_inode(inode->i_sb);
1188  	clear_inode(inode);
1189  }
1190  
1191  static int shmem_find_swap_entries(struct address_space *mapping,
1192  				   pgoff_t start, struct folio_batch *fbatch,
1193  				   pgoff_t *indices, unsigned int type)
1194  {
1195  	XA_STATE(xas, &mapping->i_pages, start);
1196  	struct folio *folio;
1197  	swp_entry_t entry;
1198  
1199  	rcu_read_lock();
1200  	xas_for_each(&xas, folio, ULONG_MAX) {
1201  		if (xas_retry(&xas, folio))
1202  			continue;
1203  
1204  		if (!xa_is_value(folio))
1205  			continue;
1206  
1207  		entry = radix_to_swp_entry(folio);
1208  		/*
1209  		 * swapin error entries can be found in the mapping. But they're
1210  		 * deliberately ignored here as we've done everything we can do.
1211  		 */
1212  		if (swp_type(entry) != type)
1213  			continue;
1214  
1215  		indices[folio_batch_count(fbatch)] = xas.xa_index;
1216  		if (!folio_batch_add(fbatch, folio))
1217  			break;
1218  
1219  		if (need_resched()) {
1220  			xas_pause(&xas);
1221  			cond_resched_rcu();
1222  		}
1223  	}
1224  	rcu_read_unlock();
1225  
1226  	return xas.xa_index;
1227  }
1228  
1229  /*
1230   * Move the swapped pages for an inode to page cache. Returns the count
1231   * of pages swapped in, or the error in case of failure.
1232   */
1233  static int shmem_unuse_swap_entries(struct inode *inode,
1234  		struct folio_batch *fbatch, pgoff_t *indices)
1235  {
1236  	int i = 0;
1237  	int ret = 0;
1238  	int error = 0;
1239  	struct address_space *mapping = inode->i_mapping;
1240  
1241  	for (i = 0; i < folio_batch_count(fbatch); i++) {
1242  		struct folio *folio = fbatch->folios[i];
1243  
1244  		if (!xa_is_value(folio))
1245  			continue;
1246  		error = shmem_swapin_folio(inode, indices[i],
1247  					  &folio, SGP_CACHE,
1248  					  mapping_gfp_mask(mapping),
1249  					  NULL, NULL);
1250  		if (error == 0) {
1251  			folio_unlock(folio);
1252  			folio_put(folio);
1253  			ret++;
1254  		}
1255  		if (error == -ENOMEM)
1256  			break;
1257  		error = 0;
1258  	}
1259  	return error ? error : ret;
1260  }
1261  
1262  /*
1263   * If swap found in inode, free it and move page from swapcache to filecache.
1264   */
1265  static int shmem_unuse_inode(struct inode *inode, unsigned int type)
1266  {
1267  	struct address_space *mapping = inode->i_mapping;
1268  	pgoff_t start = 0;
1269  	struct folio_batch fbatch;
1270  	pgoff_t indices[PAGEVEC_SIZE];
1271  	int ret = 0;
1272  
1273  	do {
1274  		folio_batch_init(&fbatch);
1275  		shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
1276  		if (folio_batch_count(&fbatch) == 0) {
1277  			ret = 0;
1278  			break;
1279  		}
1280  
1281  		ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1282  		if (ret < 0)
1283  			break;
1284  
1285  		start = indices[folio_batch_count(&fbatch) - 1];
1286  	} while (true);
1287  
1288  	return ret;
1289  }
1290  
1291  /*
1292   * Read all the shared memory data that resides in the swap
1293   * device 'type' back into memory, so that the swap device can be
1294   * taken out of use (swapoff).
1295   */
1296  int shmem_unuse(unsigned int type)
1297  {
1298  	struct shmem_inode_info *info, *next;
1299  	int error = 0;
1300  
1301  	if (list_empty(&shmem_swaplist))
1302  		return 0;
1303  
1304  	mutex_lock(&shmem_swaplist_mutex);
1305  	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1306  		if (!info->swapped) {
1307  			list_del_init(&info->swaplist);
1308  			continue;
1309  		}
1310  		/*
1311  		 * Drop the swaplist mutex while searching the inode for swap;
1312  		 * but before doing so, make sure shmem_evict_inode() will not
1313  		 * remove placeholder inode from swaplist, nor let it be freed
1314  		 * (igrab() would protect from unlink, but not from unmount).
1315  		 */
1316  		atomic_inc(&info->stop_eviction);
1317  		mutex_unlock(&shmem_swaplist_mutex);
1318  
1319  		error = shmem_unuse_inode(&info->vfs_inode, type);
1320  		cond_resched();
1321  
1322  		mutex_lock(&shmem_swaplist_mutex);
1323  		next = list_next_entry(info, swaplist);
1324  		if (!info->swapped)
1325  			list_del_init(&info->swaplist);
1326  		if (atomic_dec_and_test(&info->stop_eviction))
1327  			wake_up_var(&info->stop_eviction);
1328  		if (error)
1329  			break;
1330  	}
1331  	mutex_unlock(&shmem_swaplist_mutex);
1332  
1333  	return error;
1334  }
1335  
1336  /*
1337   * Move the page from the page cache to the swap cache.
1338   */
1339  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1340  {
1341  	struct folio *folio = page_folio(page);
1342  	struct address_space *mapping = folio->mapping;
1343  	struct inode *inode = mapping->host;
1344  	struct shmem_inode_info *info = SHMEM_I(inode);
1345  	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1346  	swp_entry_t swap;
1347  	pgoff_t index;
1348  
1349  	/*
1350  	 * Our capabilities prevent regular writeback or sync from ever calling
1351  	 * shmem_writepage; but a stacking filesystem might use ->writepage of
1352  	 * its underlying filesystem, in which case tmpfs should write out to
1353  	 * swap only in response to memory pressure, and not for the writeback
1354  	 * threads or sync.
1355  	 */
1356  	if (WARN_ON_ONCE(!wbc->for_reclaim))
1357  		goto redirty;
1358  
1359  	if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
1360  		goto redirty;
1361  
1362  	if (!total_swap_pages)
1363  		goto redirty;
1364  
1365  	/*
1366  	 * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
1367  	 * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
1368  	 * and its shmem_writeback() needs them to be split when swapping.
1369  	 */
1370  	if (folio_test_large(folio)) {
1371  		/* Ensure the subpages are still dirty */
1372  		folio_test_set_dirty(folio);
1373  		if (split_huge_page(page) < 0)
1374  			goto redirty;
1375  		folio = page_folio(page);
1376  		folio_clear_dirty(folio);
1377  	}
1378  
1379  	index = folio->index;
1380  
1381  	/*
1382  	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1383  	 * value into swapfile.c, the only way we can correctly account for a
1384  	 * fallocated folio arriving here is now to initialize it and write it.
1385  	 *
1386  	 * That's okay for a folio already fallocated earlier, but if we have
1387  	 * not yet completed the fallocation, then (a) we want to keep track
1388  	 * of this folio in case we have to undo it, and (b) it may not be a
1389  	 * good idea to continue anyway, once we're pushing into swap.  So
1390  	 * reactivate the folio, and let shmem_fallocate() quit when too many.
1391  	 */
1392  	if (!folio_test_uptodate(folio)) {
1393  		if (inode->i_private) {
1394  			struct shmem_falloc *shmem_falloc;
1395  			spin_lock(&inode->i_lock);
1396  			shmem_falloc = inode->i_private;
1397  			if (shmem_falloc &&
1398  			    !shmem_falloc->waitq &&
1399  			    index >= shmem_falloc->start &&
1400  			    index < shmem_falloc->next)
1401  				shmem_falloc->nr_unswapped++;
1402  			else
1403  				shmem_falloc = NULL;
1404  			spin_unlock(&inode->i_lock);
1405  			if (shmem_falloc)
1406  				goto redirty;
1407  		}
1408  		folio_zero_range(folio, 0, folio_size(folio));
1409  		flush_dcache_folio(folio);
1410  		folio_mark_uptodate(folio);
1411  	}
1412  
1413  	swap = folio_alloc_swap(folio);
1414  	if (!swap.val)
1415  		goto redirty;
1416  
1417  	/*
1418  	 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1419  	 * if it's not already there.  Do it now before the folio is
1420  	 * moved to swap cache, when its pagelock no longer protects
1421  	 * the inode from eviction.  But don't unlock the mutex until
1422  	 * we've incremented swapped, because shmem_unuse_inode() will
1423  	 * prune a !swapped inode from the swaplist under this mutex.
1424  	 */
1425  	mutex_lock(&shmem_swaplist_mutex);
1426  	if (list_empty(&info->swaplist))
1427  		list_add(&info->swaplist, &shmem_swaplist);
1428  
1429  	if (add_to_swap_cache(folio, swap,
1430  			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
1431  			NULL) == 0) {
1432  		spin_lock_irq(&info->lock);
1433  		shmem_recalc_inode(inode);
1434  		info->swapped++;
1435  		spin_unlock_irq(&info->lock);
1436  
1437  		swap_shmem_alloc(swap);
1438  		shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
1439  
1440  		mutex_unlock(&shmem_swaplist_mutex);
1441  		BUG_ON(folio_mapped(folio));
1442  		swap_writepage(&folio->page, wbc);
1443  		return 0;
1444  	}
1445  
1446  	mutex_unlock(&shmem_swaplist_mutex);
1447  	put_swap_folio(folio, swap);
1448  redirty:
1449  	folio_mark_dirty(folio);
1450  	if (wbc->for_reclaim)
1451  		return AOP_WRITEPAGE_ACTIVATE;	/* Return with folio locked */
1452  	folio_unlock(folio);
1453  	return 0;
1454  }
1455  
1456  #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1457  static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1458  {
1459  	char buffer[64];
1460  
1461  	if (!mpol || mpol->mode == MPOL_DEFAULT)
1462  		return;		/* show nothing */
1463  
1464  	mpol_to_str(buffer, sizeof(buffer), mpol);
1465  
1466  	seq_printf(seq, ",mpol=%s", buffer);
1467  }
1468  
1469  static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1470  {
1471  	struct mempolicy *mpol = NULL;
1472  	if (sbinfo->mpol) {
1473  		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
1474  		mpol = sbinfo->mpol;
1475  		mpol_get(mpol);
1476  		raw_spin_unlock(&sbinfo->stat_lock);
1477  	}
1478  	return mpol;
1479  }
1480  #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
1481  static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1482  {
1483  }
1484  static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1485  {
1486  	return NULL;
1487  }
1488  #endif /* CONFIG_NUMA && CONFIG_TMPFS */
1489  #ifndef CONFIG_NUMA
1490  #define vm_policy vm_private_data
1491  #endif
1492  
1493  static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1494  		struct shmem_inode_info *info, pgoff_t index)
1495  {
1496  	/* Create a pseudo vma that just contains the policy */
1497  	vma_init(vma, NULL);
1498  	/* Bias interleave by inode number to distribute better across nodes */
1499  	vma->vm_pgoff = index + info->vfs_inode.i_ino;
1500  	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1501  }
1502  
1503  static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
1504  {
1505  	/* Drop reference taken by mpol_shared_policy_lookup() */
1506  	mpol_cond_put(vma->vm_policy);
1507  }
1508  
1509  static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1510  			struct shmem_inode_info *info, pgoff_t index)
1511  {
1512  	struct vm_area_struct pvma;
1513  	struct page *page;
1514  	struct vm_fault vmf = {
1515  		.vma = &pvma,
1516  	};
1517  
1518  	shmem_pseudo_vma_init(&pvma, info, index);
1519  	page = swap_cluster_readahead(swap, gfp, &vmf);
1520  	shmem_pseudo_vma_destroy(&pvma);
1521  
1522  	if (!page)
1523  		return NULL;
1524  	return page_folio(page);
1525  }
1526  
1527  /*
1528   * Make sure huge_gfp is always more limited than limit_gfp.
1529   * Some of the flags set permissions, while others set limitations.
1530   */
1531  static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1532  {
1533  	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1534  	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1535  	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1536  	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1537  
1538  	/* Allow allocations only from the originally specified zones. */
1539  	result |= zoneflags;
1540  
1541  	/*
1542  	 * Minimize the result gfp by taking the union with the deny flags,
1543  	 * and the intersection of the allow flags.
1544  	 */
1545  	result |= (limit_gfp & denyflags);
1546  	result |= (huge_gfp & limit_gfp) & allowflags;
1547  
1548  	return result;
1549  }
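/*
 * Example: if the faulting context only supplied a GFP_NOFS-style limit_gfp,
 * the combined mask will not include __GFP_FS even when huge_gfp allows it;
 * zone modifiers always come from limit_gfp, and __GFP_NORETRY/__GFP_NOWARN
 * from limit_gfp are preserved.
 */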
1550  
1551  static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
1552  		struct shmem_inode_info *info, pgoff_t index)
1553  {
1554  	struct vm_area_struct pvma;
1555  	struct address_space *mapping = info->vfs_inode.i_mapping;
1556  	pgoff_t hindex;
1557  	struct folio *folio;
1558  
1559  	hindex = round_down(index, HPAGE_PMD_NR);
1560  	if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
1561  								XA_PRESENT))
1562  		return NULL;
1563  
1564  	shmem_pseudo_vma_init(&pvma, info, hindex);
1565  	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
1566  	shmem_pseudo_vma_destroy(&pvma);
1567  	if (!folio)
1568  		count_vm_event(THP_FILE_FALLBACK);
1569  	return folio;
1570  }
1571  
1572  static struct folio *shmem_alloc_folio(gfp_t gfp,
1573  			struct shmem_inode_info *info, pgoff_t index)
1574  {
1575  	struct vm_area_struct pvma;
1576  	struct folio *folio;
1577  
1578  	shmem_pseudo_vma_init(&pvma, info, index);
1579  	folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
1580  	shmem_pseudo_vma_destroy(&pvma);
1581  
1582  	return folio;
1583  }
1584  
1585  static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
1586  		pgoff_t index, bool huge)
1587  {
1588  	struct shmem_inode_info *info = SHMEM_I(inode);
1589  	struct folio *folio;
1590  	int nr;
1591  	int err = -ENOSPC;
1592  
1593  	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1594  		huge = false;
1595  	nr = huge ? HPAGE_PMD_NR : 1;
1596  
1597  	if (!shmem_inode_acct_block(inode, nr))
1598  		goto failed;
1599  
1600  	if (huge)
1601  		folio = shmem_alloc_hugefolio(gfp, info, index);
1602  	else
1603  		folio = shmem_alloc_folio(gfp, info, index);
1604  	if (folio) {
1605  		__folio_set_locked(folio);
1606  		__folio_set_swapbacked(folio);
1607  		return folio;
1608  	}
1609  
1610  	err = -ENOMEM;
1611  	shmem_inode_unacct_blocks(inode, nr);
1612  failed:
1613  	return ERR_PTR(err);
1614  }
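/*
 * Note the error convention above: -ENOSPC means the block accounting (mount
 * size limit or overcommit) refused the allocation, while -ENOMEM means the
 * page allocator itself failed; this is what lets a full tmpfs mapping give
 * SIGBUS rather than invoke the OOM killer (see the comment above
 * shmem_acct_block()).
 */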
1615  
1616  /*
1617   * When a page is moved from swapcache to shmem filecache (either by the
1618   * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
1619   * shmem_unuse_inode()), it may have been read in earlier from swap, in
1620   * ignorance of the mapping it belongs to.  If that mapping has special
1621   * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
1622   * we may need to copy to a suitable page before moving to filecache.
1623   *
1624   * In a future release, this may well be extended to respect cpuset and
1625   * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
1626   * but for now it is a simple matter of zone.
1627   */
1628  static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
1629  {
1630  	return folio_zonenum(folio) > gfp_zone(gfp);
1631  }
1632  
1633  static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
1634  				struct shmem_inode_info *info, pgoff_t index)
1635  {
1636  	struct folio *old, *new;
1637  	struct address_space *swap_mapping;
1638  	swp_entry_t entry;
1639  	pgoff_t swap_index;
1640  	int error;
1641  
1642  	old = *foliop;
1643  	entry = folio_swap_entry(old);
1644  	swap_index = swp_offset(entry);
1645  	swap_mapping = swap_address_space(entry);
1646  
1647  	/*
1648  	 * We have arrived here because our zones are constrained, so don't
1649  	 * limit chance of success by further cpuset and node constraints.
1650  	 */
1651  	gfp &= ~GFP_CONSTRAINT_MASK;
1652  	VM_BUG_ON_FOLIO(folio_test_large(old), old);
1653  	new = shmem_alloc_folio(gfp, info, index);
1654  	if (!new)
1655  		return -ENOMEM;
1656  
1657  	folio_get(new);
1658  	folio_copy(new, old);
1659  	flush_dcache_folio(new);
1660  
1661  	__folio_set_locked(new);
1662  	__folio_set_swapbacked(new);
1663  	folio_mark_uptodate(new);
1664  	folio_set_swap_entry(new, entry);
1665  	folio_set_swapcache(new);
1666  
1667  	/*
1668  	 * Our caller will very soon move the new folio out of swapcache, but it's
1669  	 * a nice clean interface for us to replace the old folio by the new one there.
1670  	 */
1671  	xa_lock_irq(&swap_mapping->i_pages);
1672  	error = shmem_replace_entry(swap_mapping, swap_index, old, new);
1673  	if (!error) {
1674  		mem_cgroup_migrate(old, new);
1675  		__lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
1676  		__lruvec_stat_mod_folio(new, NR_SHMEM, 1);
1677  		__lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
1678  		__lruvec_stat_mod_folio(old, NR_SHMEM, -1);
1679  	}
1680  	xa_unlock_irq(&swap_mapping->i_pages);
1681  
1682  	if (unlikely(error)) {
1683  		/*
1684  		 * Is this possible?  I think not, now that our callers check
1685  		 * both PageSwapCache and page_private after getting page lock;
1686  		 * but be defensive.  Reverse old to newpage for clear and free.
1687  		 */
1688  		old = new;
1689  	} else {
1690  		folio_add_lru(new);
1691  		*foliop = new;
1692  	}
1693  
1694  	folio_clear_swapcache(old);
1695  	old->private = NULL;
1696  
1697  	folio_unlock(old);
1698  	folio_put_refs(old, 2);
1699  	return error;
1700  }
1701  
1702  static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
1703  					 struct folio *folio, swp_entry_t swap)
1704  {
1705  	struct address_space *mapping = inode->i_mapping;
1706  	struct shmem_inode_info *info = SHMEM_I(inode);
1707  	swp_entry_t swapin_error;
1708  	void *old;
1709  
1710  	swapin_error = make_swapin_error_entry();
1711  	old = xa_cmpxchg_irq(&mapping->i_pages, index,
1712  			     swp_to_radix_entry(swap),
1713  			     swp_to_radix_entry(swapin_error), 0);
1714  	if (old != swp_to_radix_entry(swap))
1715  		return;
1716  
1717  	folio_wait_writeback(folio);
1718  	delete_from_swap_cache(folio);
1719  	spin_lock_irq(&info->lock);
1720  	/*
1721  	 * Don't treat a swapin error folio as alloced. Otherwise inode->i_blocks won't
1722  	 * be 0 when the inode is released, which would trigger WARN_ON(inode->i_blocks) in
1723  	 * shmem_evict_inode.
1724  	 */
1725  	info->alloced--;
1726  	info->swapped--;
1727  	shmem_recalc_inode(inode);
1728  	spin_unlock_irq(&info->lock);
1729  	swap_free(swap);
1730  }
1731  
1732  /*
1733   * Swap in the folio pointed to by *foliop.
1734   * Caller has to make sure that *foliop contains a valid swapped folio.
1735   * Returns 0 and the folio in *foliop on success. On failure, returns the
1736   * error code and NULL in *foliop.
1737   */
1738  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
1739  			     struct folio **foliop, enum sgp_type sgp,
1740  			     gfp_t gfp, struct vm_area_struct *vma,
1741  			     vm_fault_t *fault_type)
1742  {
1743  	struct address_space *mapping = inode->i_mapping;
1744  	struct shmem_inode_info *info = SHMEM_I(inode);
1745  	struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
1746  	struct swap_info_struct *si;
1747  	struct folio *folio = NULL;
1748  	swp_entry_t swap;
1749  	int error;
1750  
1751  	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
1752  	swap = radix_to_swp_entry(*foliop);
1753  	*foliop = NULL;
1754  
1755  	if (is_swapin_error_entry(swap))
1756  		return -EIO;
1757  
1758  	si = get_swap_device(swap);
1759  	if (!si) {
1760  		if (!shmem_confirm_swap(mapping, index, swap))
1761  			return -EEXIST;
1762  		else
1763  			return -EINVAL;
1764  	}
1765  
1766  	/* Look it up and read it in... */
1767  	folio = swap_cache_get_folio(swap, NULL, 0);
1768  	if (!folio) {
1769  		/* Or update major stats only when swapin succeeds?? */
1770  		if (fault_type) {
1771  			*fault_type |= VM_FAULT_MAJOR;
1772  			count_vm_event(PGMAJFAULT);
1773  			count_memcg_event_mm(charge_mm, PGMAJFAULT);
1774  		}
1775  		/* Here we actually start the I/O */
1776  		folio = shmem_swapin(swap, gfp, info, index);
1777  		if (!folio) {
1778  			error = -ENOMEM;
1779  			goto failed;
1780  		}
1781  	}
1782  
1783  	/* We have to do this with folio locked to prevent races */
1784  	folio_lock(folio);
1785  	if (!folio_test_swapcache(folio) ||
1786  	    folio_swap_entry(folio).val != swap.val ||
1787  	    !shmem_confirm_swap(mapping, index, swap)) {
1788  		error = -EEXIST;
1789  		goto unlock;
1790  	}
1791  	if (!folio_test_uptodate(folio)) {
1792  		error = -EIO;
1793  		goto failed;
1794  	}
1795  	folio_wait_writeback(folio);
1796  
1797  	/*
1798  	 * Some architectures may have to restore extra metadata to the
1799  	 * folio after reading from swap.
1800  	 */
1801  	arch_swap_restore(swap, folio);
1802  
1803  	if (shmem_should_replace_folio(folio, gfp)) {
1804  		error = shmem_replace_folio(&folio, gfp, info, index);
1805  		if (error)
1806  			goto failed;
1807  	}
1808  
1809  	error = shmem_add_to_page_cache(folio, mapping, index,
1810  					swp_to_radix_entry(swap), gfp,
1811  					charge_mm);
1812  	if (error)
1813  		goto failed;
1814  
1815  	spin_lock_irq(&info->lock);
1816  	info->swapped--;
1817  	shmem_recalc_inode(inode);
1818  	spin_unlock_irq(&info->lock);
1819  
1820  	if (sgp == SGP_WRITE)
1821  		folio_mark_accessed(folio);
1822  
1823  	delete_from_swap_cache(folio);
1824  	folio_mark_dirty(folio);
1825  	swap_free(swap);
1826  	put_swap_device(si);
1827  
1828  	*foliop = folio;
1829  	return 0;
1830  failed:
1831  	if (!shmem_confirm_swap(mapping, index, swap))
1832  		error = -EEXIST;
1833  	if (error == -EIO)
1834  		shmem_set_folio_swapin_error(inode, index, folio, swap);
1835  unlock:
1836  	if (folio) {
1837  		folio_unlock(folio);
1838  		folio_put(folio);
1839  	}
1840  	put_swap_device(si);
1841  
1842  	return error;
1843  }
1844  
1845  /*
1846   * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
1847   *
1848   * If we allocate a new one we do not mark it dirty. That's up to the
1849   * vm. If we swap it in we mark it dirty, since we also free the swap
1850   * entry: a page cannot live in both the swap cache and the page cache.
1851   *
1852   * vma, vmf, and fault_type are only supplied by shmem_fault:
1853   * otherwise they are NULL.
1854   */
1855  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
1856  		struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
1857  		struct vm_area_struct *vma, struct vm_fault *vmf,
1858  		vm_fault_t *fault_type)
1859  {
1860  	struct address_space *mapping = inode->i_mapping;
1861  	struct shmem_inode_info *info = SHMEM_I(inode);
1862  	struct shmem_sb_info *sbinfo;
1863  	struct mm_struct *charge_mm;
1864  	struct folio *folio;
1865  	pgoff_t hindex;
1866  	gfp_t huge_gfp;
1867  	int error;
1868  	int once = 0;
1869  	int alloced = 0;
1870  
1871  	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
1872  		return -EFBIG;
1873  repeat:
1874  	if (sgp <= SGP_CACHE &&
1875  	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
1876  		return -EINVAL;
1877  	}
1878  
1879  	sbinfo = SHMEM_SB(inode->i_sb);
1880  	charge_mm = vma ? vma->vm_mm : NULL;
1881  
1882  	folio = filemap_get_entry(mapping, index);
1883  	if (folio && vma && userfaultfd_minor(vma)) {
1884  		if (!xa_is_value(folio))
1885  			folio_put(folio);
1886  		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
1887  		return 0;
1888  	}
1889  
1890  	if (xa_is_value(folio)) {
1891  		error = shmem_swapin_folio(inode, index, &folio,
1892  					  sgp, gfp, vma, fault_type);
1893  		if (error == -EEXIST)
1894  			goto repeat;
1895  
1896  		*foliop = folio;
1897  		return error;
1898  	}
1899  
1900  	if (folio) {
1901  		folio_lock(folio);
1902  
1903  		/* Has the folio been truncated or swapped out? */
1904  		if (unlikely(folio->mapping != mapping)) {
1905  			folio_unlock(folio);
1906  			folio_put(folio);
1907  			goto repeat;
1908  		}
1909  		if (sgp == SGP_WRITE)
1910  			folio_mark_accessed(folio);
1911  		if (folio_test_uptodate(folio))
1912  			goto out;
1913  		/* fallocated folio */
1914  		if (sgp != SGP_READ)
1915  			goto clear;
1916  		folio_unlock(folio);
1917  		folio_put(folio);
1918  	}
1919  
1920  	/*
1921  	 * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
1922  	 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
1923  	 */
1924  	*foliop = NULL;
1925  	if (sgp == SGP_READ)
1926  		return 0;
1927  	if (sgp == SGP_NOALLOC)
1928  		return -ENOENT;
1929  
1930  	/*
1931  	 * Fast cache lookup and swap lookup did not find it: allocate.
1932  	 */
1933  
1934  	if (vma && userfaultfd_missing(vma)) {
1935  		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
1936  		return 0;
1937  	}
1938  
1939  	if (!shmem_is_huge(inode, index, false,
1940  			   vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
1941  		goto alloc_nohuge;
1942  
1943  	huge_gfp = vma_thp_gfp_mask(vma);
1944  	huge_gfp = limit_gfp_mask(huge_gfp, gfp);
1945  	folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
1946  	if (IS_ERR(folio)) {
1947  alloc_nohuge:
1948  		folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
1949  	}
1950  	if (IS_ERR(folio)) {
1951  		int retry = 5;
1952  
1953  		error = PTR_ERR(folio);
1954  		folio = NULL;
1955  		if (error != -ENOSPC)
1956  			goto unlock;
1957  		/*
1958  		 * Try to reclaim some space by splitting a large folio
1959  		 * beyond i_size on the filesystem.
1960  		 */
1961  		while (retry--) {
1962  			int ret;
1963  
1964  			ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1965  			if (ret == SHRINK_STOP)
1966  				break;
1967  			if (ret)
1968  				goto alloc_nohuge;
1969  		}
1970  		goto unlock;
1971  	}
1972  
1973  	hindex = round_down(index, folio_nr_pages(folio));
1974  
1975  	if (sgp == SGP_WRITE)
1976  		__folio_set_referenced(folio);
1977  
1978  	error = shmem_add_to_page_cache(folio, mapping, hindex,
1979  					NULL, gfp & GFP_RECLAIM_MASK,
1980  					charge_mm);
1981  	if (error)
1982  		goto unacct;
1983  	folio_add_lru(folio);
1984  
1985  	spin_lock_irq(&info->lock);
1986  	info->alloced += folio_nr_pages(folio);
1987  	inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
1988  	shmem_recalc_inode(inode);
1989  	spin_unlock_irq(&info->lock);
1990  	alloced = true;
1991  
1992  	if (folio_test_pmd_mappable(folio) &&
1993  	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1994  					folio_next_index(folio) - 1) {
1995  		/*
1996  		 * Part of the large folio is beyond i_size: subject
1997  		 * to shrink under memory pressure.
1998  		 */
1999  		spin_lock(&sbinfo->shrinklist_lock);
2000  		/*
2001  		 * list_empty_careful() defends against unlocked access to
2002  		 * ->shrink_list in shmem_unused_huge_shrink().
2003  		 */
2004  		if (list_empty_careful(&info->shrinklist)) {
2005  			list_add_tail(&info->shrinklist,
2006  				      &sbinfo->shrinklist);
2007  			sbinfo->shrinklist_len++;
2008  		}
2009  		spin_unlock(&sbinfo->shrinklist_lock);
2010  	}
2011  
2012  	/*
2013  	 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
2014  	 */
2015  	if (sgp == SGP_FALLOC)
2016  		sgp = SGP_WRITE;
2017  clear:
2018  	/*
2019  	 * Let SGP_WRITE caller clear ends if write does not fill folio;
2020  	 * but SGP_FALLOC on a folio fallocated earlier must initialize
2021  	 * it now, lest undo on failure cancel our earlier guarantee.
2022  	 */
2023  	if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
2024  		long i, n = folio_nr_pages(folio);
2025  
2026  		for (i = 0; i < n; i++)
2027  			clear_highpage(folio_page(folio, i));
2028  		flush_dcache_folio(folio);
2029  		folio_mark_uptodate(folio);
2030  	}
2031  
2032  	/* Perhaps the file has been truncated since we checked */
2033  	if (sgp <= SGP_CACHE &&
2034  	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
2035  		if (alloced) {
2036  			folio_clear_dirty(folio);
2037  			filemap_remove_folio(folio);
2038  			spin_lock_irq(&info->lock);
2039  			shmem_recalc_inode(inode);
2040  			spin_unlock_irq(&info->lock);
2041  		}
2042  		error = -EINVAL;
2043  		goto unlock;
2044  	}
2045  out:
2046  	*foliop = folio;
2047  	return 0;
2048  
2049  	/*
2050  	 * Error recovery.
2051  	 */
2052  unacct:
2053  	shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
2054  
2055  	if (folio_test_large(folio)) {
2056  		folio_unlock(folio);
2057  		folio_put(folio);
2058  		goto alloc_nohuge;
2059  	}
2060  unlock:
2061  	if (folio) {
2062  		folio_unlock(folio);
2063  		folio_put(folio);
2064  	}
2065  	if (error == -ENOSPC && !once++) {
2066  		spin_lock_irq(&info->lock);
2067  		shmem_recalc_inode(inode);
2068  		spin_unlock_irq(&info->lock);
2069  		goto repeat;
2070  	}
2071  	if (error == -EEXIST)
2072  		goto repeat;
2073  	return error;
2074  }
2075  
2076  int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
2077  		enum sgp_type sgp)
2078  {
2079  	return shmem_get_folio_gfp(inode, index, foliop, sgp,
2080  			mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
2081  }
2082  
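/*
 * Editor's illustration (not part of shmem.c): a minimal sketch of the
 * shmem_get_folio() calling convention described above.  With SGP_READ,
 * success over a hole returns *foliop == NULL and the caller supplies
 * zeroes; otherwise the folio comes back locked with a reference held.
 * example_peek() is a hypothetical helper; it assumes the folio is not
 * in highmem so that page_address() is usable.
 *
 *	static int example_peek(struct inode *inode, pgoff_t index, void *buf)
 *	{
 *		struct folio *folio;
 *		struct page *page;
 *		int err;
 *
 *		err = shmem_get_folio(inode, index, &folio, SGP_READ);
 *		if (err)
 *			return err;		// -EINVAL past EOF, -EIO on swapin error
 *		if (!folio) {
 *			memset(buf, 0, PAGE_SIZE);	// hole: read as zeroes
 *			return 0;
 *		}
 *		folio_unlock(folio);		// returned locked; we only read
 *		page = folio_file_page(folio, index);
 *		memcpy(buf, page_address(page), PAGE_SIZE);
 *		folio_put(folio);		// drop the lookup reference
 *		return 0;
 *	}
 */
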
2083  /*
2084   * This is like autoremove_wake_function, but it removes the wait queue
2085   * entry unconditionally - even if something else had already woken the
2086   * target.
2087   */
2088  static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
2089  {
2090  	int ret = default_wake_function(wait, mode, sync, key);
2091  	list_del_init(&wait->entry);
2092  	return ret;
2093  }
2094  
2095  static vm_fault_t shmem_fault(struct vm_fault *vmf)
2096  {
2097  	struct vm_area_struct *vma = vmf->vma;
2098  	struct inode *inode = file_inode(vma->vm_file);
2099  	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
2100  	struct folio *folio = NULL;
2101  	int err;
2102  	vm_fault_t ret = VM_FAULT_LOCKED;
2103  
2104  	/*
2105  	 * Trinity finds that probing a hole which tmpfs is punching can
2106  	 * prevent the hole-punch from ever completing: which in turn
2107  	 * locks writers out with its hold on i_rwsem.  So refrain from
2108  	 * faulting pages into the hole while it's being punched.  Although
2109  	 * shmem_undo_range() does remove the additions, it may be unable to
2110  	 * keep up, as each new page needs its own unmap_mapping_range() call,
2111  	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
2112  	 *
2113  	 * It does not matter if we sometimes reach this check just before the
2114  	 * hole-punch begins, so that one fault then races with the punch:
2115  	 * we just need to make racing faults a rare case.
2116  	 *
2117  	 * The implementation below would be much simpler if we just used a
2118  	 * standard mutex or completion: but we cannot take i_rwsem in fault,
2119  	 * and bloating every shmem inode for this unlikely case would be sad.
2120  	 */
2121  	if (unlikely(inode->i_private)) {
2122  		struct shmem_falloc *shmem_falloc;
2123  
2124  		spin_lock(&inode->i_lock);
2125  		shmem_falloc = inode->i_private;
2126  		if (shmem_falloc &&
2127  		    shmem_falloc->waitq &&
2128  		    vmf->pgoff >= shmem_falloc->start &&
2129  		    vmf->pgoff < shmem_falloc->next) {
2130  			struct file *fpin;
2131  			wait_queue_head_t *shmem_falloc_waitq;
2132  			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
2133  
2134  			ret = VM_FAULT_NOPAGE;
2135  			fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2136  			if (fpin)
2137  				ret = VM_FAULT_RETRY;
2138  
2139  			shmem_falloc_waitq = shmem_falloc->waitq;
2140  			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
2141  					TASK_UNINTERRUPTIBLE);
2142  			spin_unlock(&inode->i_lock);
2143  			schedule();
2144  
2145  			/*
2146  			 * shmem_falloc_waitq points into the shmem_fallocate()
2147  			 * stack of the hole-punching task: shmem_falloc_waitq
2148  			 * is usually invalid by the time we reach here, but
2149  			 * finish_wait() does not dereference it in that case;
2150  			 * but i_lock is still needed to avoid racing with wake_up_all().
2151  			 */
2152  			spin_lock(&inode->i_lock);
2153  			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
2154  			spin_unlock(&inode->i_lock);
2155  
2156  			if (fpin)
2157  				fput(fpin);
2158  			return ret;
2159  		}
2160  		spin_unlock(&inode->i_lock);
2161  	}
2162  
2163  	err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
2164  				  gfp, vma, vmf, &ret);
2165  	if (err)
2166  		return vmf_error(err);
2167  	if (folio)
2168  		vmf->page = folio_file_page(folio, vmf->pgoff);
2169  	return ret;
2170  }
2171  
2172  unsigned long shmem_get_unmapped_area(struct file *file,
2173  				      unsigned long uaddr, unsigned long len,
2174  				      unsigned long pgoff, unsigned long flags)
2175  {
2176  	unsigned long (*get_area)(struct file *,
2177  		unsigned long, unsigned long, unsigned long, unsigned long);
2178  	unsigned long addr;
2179  	unsigned long offset;
2180  	unsigned long inflated_len;
2181  	unsigned long inflated_addr;
2182  	unsigned long inflated_offset;
2183  
2184  	if (len > TASK_SIZE)
2185  		return -ENOMEM;
2186  
2187  	get_area = current->mm->get_unmapped_area;
2188  	addr = get_area(file, uaddr, len, pgoff, flags);
2189  
2190  	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2191  		return addr;
2192  	if (IS_ERR_VALUE(addr))
2193  		return addr;
2194  	if (addr & ~PAGE_MASK)
2195  		return addr;
2196  	if (addr > TASK_SIZE - len)
2197  		return addr;
2198  
2199  	if (shmem_huge == SHMEM_HUGE_DENY)
2200  		return addr;
2201  	if (len < HPAGE_PMD_SIZE)
2202  		return addr;
2203  	if (flags & MAP_FIXED)
2204  		return addr;
2205  	/*
2206  	 * Our priority is to support MAP_SHARED mapped hugely;
2207  	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2208  	 * But if the caller specified an address hint, and we allocated the
2209  	 * area there successfully, respect that as before.
2210  	 */
2211  	if (uaddr == addr)
2212  		return addr;
2213  
2214  	if (shmem_huge != SHMEM_HUGE_FORCE) {
2215  		struct super_block *sb;
2216  
2217  		if (file) {
2218  			VM_BUG_ON(file->f_op != &shmem_file_operations);
2219  			sb = file_inode(file)->i_sb;
2220  		} else {
2221  			/*
2222  			 * Called directly from mm/mmap.c, or drivers/char/mem.c
2223  			 * for "/dev/zero", to create a shared anonymous object.
2224  			 */
2225  			if (IS_ERR(shm_mnt))
2226  				return addr;
2227  			sb = shm_mnt->mnt_sb;
2228  		}
2229  		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
2230  			return addr;
2231  	}
2232  
2233  	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
2234  	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
2235  		return addr;
2236  	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
2237  		return addr;
2238  
2239  	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
2240  	if (inflated_len > TASK_SIZE)
2241  		return addr;
2242  	if (inflated_len < len)
2243  		return addr;
2244  
2245  	inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
2246  	if (IS_ERR_VALUE(inflated_addr))
2247  		return addr;
2248  	if (inflated_addr & ~PAGE_MASK)
2249  		return addr;
2250  
2251  	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
2252  	inflated_addr += offset - inflated_offset;
2253  	if (inflated_offset > offset)
2254  		inflated_addr += HPAGE_PMD_SIZE;
2255  
2256  	if (inflated_addr > TASK_SIZE - len)
2257  		return addr;
2258  	return inflated_addr;
2259  }
2260  
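/*
 * Editor's illustration (not part of shmem.c): a worked example of the
 * realignment above, assuming HPAGE_PMD_SIZE = 2 MiB and PAGE_SHIFT = 12.
 * The addresses are made up for the sake of the arithmetic.
 *
 *	pgoff = 0x211, len = 4 MiB (0x400000)
 *	offset          = (0x211 << 12) & 0x1fffff       = 0x011000
 *	get_area()     -> addr = 0x7f1234400000          (offset 0, not our offset)
 *	inflated_len    = 0x400000 + 0x200000 - 0x1000   = 0x5ff000
 *	get_area()     -> inflated_addr = 0x7f1234600000
 *	inflated_offset = inflated_addr & 0x1fffff       = 0
 *	inflated_addr  += offset - inflated_offset       = 0x7f1234611000
 *	(inflated_offset <= offset, so no extra HPAGE_PMD_SIZE bump is needed)
 *
 * Now (inflated_addr & (HPAGE_PMD_SIZE - 1)) == offset, so the huge-page-sized
 * extents of the file can be mapped by PMDs within the inflated area.
 */
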
2261  #ifdef CONFIG_NUMA
2262  static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2263  {
2264  	struct inode *inode = file_inode(vma->vm_file);
2265  	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2266  }
2267  
2268  static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2269  					  unsigned long addr)
2270  {
2271  	struct inode *inode = file_inode(vma->vm_file);
2272  	pgoff_t index;
2273  
2274  	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2275  	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2276  }
2277  #endif
2278  
2279  int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
2280  {
2281  	struct inode *inode = file_inode(file);
2282  	struct shmem_inode_info *info = SHMEM_I(inode);
2283  	int retval = -ENOMEM;
2284  
2285  	/*
2286  	 * What serializes the accesses to info->flags?
2287  	 * ipc_lock_object() when called from shmctl_do_lock(),
2288  	 * no serialization needed when called from shm_destroy().
2289  	 */
2290  	if (lock && !(info->flags & VM_LOCKED)) {
2291  		if (!user_shm_lock(inode->i_size, ucounts))
2292  			goto out_nomem;
2293  		info->flags |= VM_LOCKED;
2294  		mapping_set_unevictable(file->f_mapping);
2295  	}
2296  	if (!lock && (info->flags & VM_LOCKED) && ucounts) {
2297  		user_shm_unlock(inode->i_size, ucounts);
2298  		info->flags &= ~VM_LOCKED;
2299  		mapping_clear_unevictable(file->f_mapping);
2300  	}
2301  	retval = 0;
2302  
2303  out_nomem:
2304  	return retval;
2305  }
2306  
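/*
 * Editor's illustration (not part of shmem.c): shmem_lock() above is reached
 * from the SysV IPC path named in its comment.  A minimal userspace sketch,
 * assuming the caller has CAP_IPC_LOCK or enough RLIMIT_MEMLOCK for
 * user_shm_lock() to succeed:
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *
 *	int locked_segment(size_t size)
 *	{
 *		int id = shmget(IPC_PRIVATE, size, IPC_CREAT | 0600);
 *
 *		if (id < 0)
 *			return -1;
 *		// shmctl_do_lock() ends up calling shmem_lock(file, 1, ucounts)
 *		return shmctl(id, SHM_LOCK, NULL) < 0 ? -1 : id;
 *	}
 */
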
2307  static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2308  {
2309  	struct inode *inode = file_inode(file);
2310  	struct shmem_inode_info *info = SHMEM_I(inode);
2311  	int ret;
2312  
2313  	ret = seal_check_future_write(info->seals, vma);
2314  	if (ret)
2315  		return ret;
2316  
2317  	/* arm64 - allow memory tagging on RAM-based files */
2318  	vm_flags_set(vma, VM_MTE_ALLOWED);
2319  
2320  	file_accessed(file);
2321  	/* This is anonymous shared memory if it is unlinked at the time of mmap */
2322  	if (inode->i_nlink)
2323  		vma->vm_ops = &shmem_vm_ops;
2324  	else
2325  		vma->vm_ops = &shmem_anon_vm_ops;
2326  	return 0;
2327  }
2328  
2329  #ifdef CONFIG_TMPFS_XATTR
2330  static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2331  
2332  /*
2333   * chattr's fsflags are unrelated to extended attributes,
2334   * but tmpfs has chosen to enable them under the same config option.
2335   */
2336  static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2337  {
2338  	unsigned int i_flags = 0;
2339  
2340  	if (fsflags & FS_NOATIME_FL)
2341  		i_flags |= S_NOATIME;
2342  	if (fsflags & FS_APPEND_FL)
2343  		i_flags |= S_APPEND;
2344  	if (fsflags & FS_IMMUTABLE_FL)
2345  		i_flags |= S_IMMUTABLE;
2346  	/*
2347  	 * But FS_NODUMP_FL does not require any action in i_flags.
2348  	 */
2349  	inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
2350  }
2351  #else
2352  static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
2353  {
2354  }
2355  #define shmem_initxattrs NULL
2356  #endif
2357  
2358  static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb,
2359  				     struct inode *dir, umode_t mode, dev_t dev,
2360  				     unsigned long flags)
2361  {
2362  	struct inode *inode;
2363  	struct shmem_inode_info *info;
2364  	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2365  	ino_t ino;
2366  
2367  	if (shmem_reserve_inode(sb, &ino))
2368  		return NULL;
2369  
2370  	inode = new_inode(sb);
2371  	if (inode) {
2372  		inode->i_ino = ino;
2373  		inode_init_owner(idmap, inode, dir, mode);
2374  		inode->i_blocks = 0;
2375  		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
2376  		inode->i_generation = get_random_u32();
2377  		info = SHMEM_I(inode);
2378  		memset(info, 0, (char *)inode - (char *)info);
2379  		spin_lock_init(&info->lock);
2380  		atomic_set(&info->stop_eviction, 0);
2381  		info->seals = F_SEAL_SEAL;
2382  		info->flags = flags & VM_NORESERVE;
2383  		info->i_crtime = inode->i_mtime;
2384  		info->fsflags = (dir == NULL) ? 0 :
2385  			SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
2386  		if (info->fsflags)
2387  			shmem_set_inode_flags(inode, info->fsflags);
2388  		INIT_LIST_HEAD(&info->shrinklist);
2389  		INIT_LIST_HEAD(&info->swaplist);
2390  		if (sbinfo->noswap)
2391  			mapping_set_unevictable(inode->i_mapping);
2392  		simple_xattrs_init(&info->xattrs);
2393  		cache_no_acl(inode);
2394  		mapping_set_large_folios(inode->i_mapping);
2395  
2396  		switch (mode & S_IFMT) {
2397  		default:
2398  			inode->i_op = &shmem_special_inode_operations;
2399  			init_special_inode(inode, mode, dev);
2400  			break;
2401  		case S_IFREG:
2402  			inode->i_mapping->a_ops = &shmem_aops;
2403  			inode->i_op = &shmem_inode_operations;
2404  			inode->i_fop = &shmem_file_operations;
2405  			mpol_shared_policy_init(&info->policy,
2406  						 shmem_get_sbmpol(sbinfo));
2407  			break;
2408  		case S_IFDIR:
2409  			inc_nlink(inode);
2410  			/* Some things misbehave if size == 0 on a directory */
2411  			inode->i_size = 2 * BOGO_DIRENT_SIZE;
2412  			inode->i_op = &shmem_dir_inode_operations;
2413  			inode->i_fop = &simple_dir_operations;
2414  			break;
2415  		case S_IFLNK:
2416  			/*
2417  			 * Must not load anything into the rbtree:
2418  			 * mpol_free_shared_policy() will not be called.
2419  			 */
2420  			mpol_shared_policy_init(&info->policy, NULL);
2421  			break;
2422  		}
2423  
2424  		lockdep_annotate_inode_mutex_key(inode);
2425  	} else
2426  		shmem_free_inode(sb);
2427  	return inode;
2428  }
2429  
2430  #ifdef CONFIG_USERFAULTFD
2431  int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
2432  			   struct vm_area_struct *dst_vma,
2433  			   unsigned long dst_addr,
2434  			   unsigned long src_addr,
2435  			   uffd_flags_t flags,
2436  			   struct folio **foliop)
2437  {
2438  	struct inode *inode = file_inode(dst_vma->vm_file);
2439  	struct shmem_inode_info *info = SHMEM_I(inode);
2440  	struct address_space *mapping = inode->i_mapping;
2441  	gfp_t gfp = mapping_gfp_mask(mapping);
2442  	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
2443  	void *page_kaddr;
2444  	struct folio *folio;
2445  	int ret;
2446  	pgoff_t max_off;
2447  
2448  	if (!shmem_inode_acct_block(inode, 1)) {
2449  		/*
2450  		 * We may have got a folio, returned -ENOENT triggering a retry,
2451  		 * and now we find ourselves with -ENOMEM. Release the folio, to
2452  		 * avoid a BUG_ON in our caller.
2453  		 */
2454  		if (unlikely(*foliop)) {
2455  			folio_put(*foliop);
2456  			*foliop = NULL;
2457  		}
2458  		return -ENOMEM;
2459  	}
2460  
2461  	if (!*foliop) {
2462  		ret = -ENOMEM;
2463  		folio = shmem_alloc_folio(gfp, info, pgoff);
2464  		if (!folio)
2465  			goto out_unacct_blocks;
2466  
2467  		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
2468  			page_kaddr = kmap_local_folio(folio, 0);
2469  			/*
2470  			 * The read mmap_lock is held here.  Despite the
2471  			 * mmap_lock being read recursive a deadlock is still
2472  			 * possible if a writer has taken a lock.  For example:
2473  			 *
2474  			 * process A thread 1 takes read lock on own mmap_lock
2475  			 * process A thread 2 calls mmap, blocks taking write lock
2476  			 * process B thread 1 takes page fault, read lock on own mmap lock
2477  			 * process B thread 2 calls mmap, blocks taking write lock
2478  			 * process A thread 1 blocks taking read lock on process B
2479  			 * process B thread 1 blocks taking read lock on process A
2480  			 *
2481  			 * Disable page faults to prevent potential deadlock
2482  			 * and retry the copy outside the mmap_lock.
2483  			 */
2484  			pagefault_disable();
2485  			ret = copy_from_user(page_kaddr,
2486  					     (const void __user *)src_addr,
2487  					     PAGE_SIZE);
2488  			pagefault_enable();
2489  			kunmap_local(page_kaddr);
2490  
2491  			/* fallback to copy_from_user outside mmap_lock */
2492  			if (unlikely(ret)) {
2493  				*foliop = folio;
2494  				ret = -ENOENT;
2495  				/* don't free the page */
2496  				goto out_unacct_blocks;
2497  			}
2498  
2499  			flush_dcache_folio(folio);
2500  		} else {		/* ZEROPAGE */
2501  			clear_user_highpage(&folio->page, dst_addr);
2502  		}
2503  	} else {
2504  		folio = *foliop;
2505  		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2506  		*foliop = NULL;
2507  	}
2508  
2509  	VM_BUG_ON(folio_test_locked(folio));
2510  	VM_BUG_ON(folio_test_swapbacked(folio));
2511  	__folio_set_locked(folio);
2512  	__folio_set_swapbacked(folio);
2513  	__folio_mark_uptodate(folio);
2514  
2515  	ret = -EFAULT;
2516  	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
2517  	if (unlikely(pgoff >= max_off))
2518  		goto out_release;
2519  
2520  	ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
2521  				      gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
2522  	if (ret)
2523  		goto out_release;
2524  
2525  	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
2526  				       &folio->page, true, flags);
2527  	if (ret)
2528  		goto out_delete_from_cache;
2529  
2530  	spin_lock_irq(&info->lock);
2531  	info->alloced++;
2532  	inode->i_blocks += BLOCKS_PER_PAGE;
2533  	shmem_recalc_inode(inode);
2534  	spin_unlock_irq(&info->lock);
2535  
2536  	folio_unlock(folio);
2537  	return 0;
2538  out_delete_from_cache:
2539  	filemap_remove_folio(folio);
2540  out_release:
2541  	folio_unlock(folio);
2542  	folio_put(folio);
2543  out_unacct_blocks:
2544  	shmem_inode_unacct_blocks(inode, 1);
2545  	return ret;
2546  }
2547  #endif /* CONFIG_USERFAULTFD */
2548  
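/*
 * Editor's illustration (not part of shmem.c): a minimal, error-handling-free
 * sketch of the userfaultfd sequence that drives shmem_mfill_atomic_pte()
 * above via UFFDIO_COPY.  It assumes 'dst' is a page-aligned address inside a
 * MAP_SHARED mapping of a tmpfs/memfd file and 'src' points at a page of
 * source data.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int uffd_fill_page(void *dst, const void *src, size_t page_size)
 *	{
 *		struct uffdio_api api = { .api = UFFD_API };
 *		struct uffdio_register reg = {
 *			.range = { .start = (unsigned long)dst, .len = page_size },
 *			.mode  = UFFDIO_REGISTER_MODE_MISSING,
 *		};
 *		struct uffdio_copy copy = {
 *			.dst = (unsigned long)dst,
 *			.src = (unsigned long)src,
 *			.len = page_size,
 *		};
 *		int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *
 *		if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
 *		    ioctl(uffd, UFFDIO_REGISTER, &reg))
 *			return -1;
 *		// Resolves (or pre-populates) the missing page in the shmem mapping.
 *		return ioctl(uffd, UFFDIO_COPY, &copy) ? -1 : 0;
 *	}
 */
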
2549  #ifdef CONFIG_TMPFS
2550  static const struct inode_operations shmem_symlink_inode_operations;
2551  static const struct inode_operations shmem_short_symlink_operations;
2552  
2553  static int
2554  shmem_write_begin(struct file *file, struct address_space *mapping,
2555  			loff_t pos, unsigned len,
2556  			struct page **pagep, void **fsdata)
2557  {
2558  	struct inode *inode = mapping->host;
2559  	struct shmem_inode_info *info = SHMEM_I(inode);
2560  	pgoff_t index = pos >> PAGE_SHIFT;
2561  	struct folio *folio;
2562  	int ret = 0;
2563  
2564  	/* i_rwsem is held by caller */
2565  	if (unlikely(info->seals & (F_SEAL_GROW |
2566  				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
2567  		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
2568  			return -EPERM;
2569  		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
2570  			return -EPERM;
2571  	}
2572  
2573  	ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
2574  
2575  	if (ret)
2576  		return ret;
2577  
2578  	*pagep = folio_file_page(folio, index);
2579  	if (PageHWPoison(*pagep)) {
2580  		folio_unlock(folio);
2581  		folio_put(folio);
2582  		*pagep = NULL;
2583  		return -EIO;
2584  	}
2585  
2586  	return 0;
2587  }
2588  
2589  static int
2590  shmem_write_end(struct file *file, struct address_space *mapping,
2591  			loff_t pos, unsigned len, unsigned copied,
2592  			struct page *page, void *fsdata)
2593  {
2594  	struct folio *folio = page_folio(page);
2595  	struct inode *inode = mapping->host;
2596  
2597  	if (pos + copied > inode->i_size)
2598  		i_size_write(inode, pos + copied);
2599  
2600  	if (!folio_test_uptodate(folio)) {
2601  		if (copied < folio_size(folio)) {
2602  			size_t from = offset_in_folio(folio, pos);
2603  			folio_zero_segments(folio, 0, from,
2604  					from + copied, folio_size(folio));
2605  		}
2606  		folio_mark_uptodate(folio);
2607  	}
2608  	folio_mark_dirty(folio);
2609  	folio_unlock(folio);
2610  	folio_put(folio);
2611  
2612  	return copied;
2613  }
2614  
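/*
 * Editor's illustration (not part of shmem.c): the F_SEAL_* checks at the top
 * of shmem_write_begin() are what a sealed memfd trips over.  A minimal
 * userspace sketch:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	int sealed_memfd(const void *data, size_t len)
 *	{
 *		int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);
 *
 *		if (fd < 0 || write(fd, data, len) != (ssize_t)len)
 *			return -1;
 *		// From here on, write() fails with EPERM (F_SEAL_WRITE is checked
 *		// in shmem_write_begin) and growing the file fails (F_SEAL_GROW).
 *		if (fcntl(fd, F_ADD_SEALS,
 *			  F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_WRITE | F_SEAL_SEAL) < 0)
 *			return -1;
 *		return fd;
 *	}
 */
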
2615  static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
2616  {
2617  	struct file *file = iocb->ki_filp;
2618  	struct inode *inode = file_inode(file);
2619  	struct address_space *mapping = inode->i_mapping;
2620  	pgoff_t index;
2621  	unsigned long offset;
2622  	int error = 0;
2623  	ssize_t retval = 0;
2624  	loff_t *ppos = &iocb->ki_pos;
2625  
2626  	index = *ppos >> PAGE_SHIFT;
2627  	offset = *ppos & ~PAGE_MASK;
2628  
2629  	for (;;) {
2630  		struct folio *folio = NULL;
2631  		struct page *page = NULL;
2632  		pgoff_t end_index;
2633  		unsigned long nr, ret;
2634  		loff_t i_size = i_size_read(inode);
2635  
2636  		end_index = i_size >> PAGE_SHIFT;
2637  		if (index > end_index)
2638  			break;
2639  		if (index == end_index) {
2640  			nr = i_size & ~PAGE_MASK;
2641  			if (nr <= offset)
2642  				break;
2643  		}
2644  
2645  		error = shmem_get_folio(inode, index, &folio, SGP_READ);
2646  		if (error) {
2647  			if (error == -EINVAL)
2648  				error = 0;
2649  			break;
2650  		}
2651  		if (folio) {
2652  			folio_unlock(folio);
2653  
2654  			page = folio_file_page(folio, index);
2655  			if (PageHWPoison(page)) {
2656  				folio_put(folio);
2657  				error = -EIO;
2658  				break;
2659  			}
2660  		}
2661  
2662  		/*
2663  		 * We must re-check i_size here, since reads (unlike writes)
2664  		 * are called without i_rwsem protection against truncate.
2665  		 */
2666  		nr = PAGE_SIZE;
2667  		i_size = i_size_read(inode);
2668  		end_index = i_size >> PAGE_SHIFT;
2669  		if (index == end_index) {
2670  			nr = i_size & ~PAGE_MASK;
2671  			if (nr <= offset) {
2672  				if (folio)
2673  					folio_put(folio);
2674  				break;
2675  			}
2676  		}
2677  		nr -= offset;
2678  
2679  		if (folio) {
2680  			/*
2681  			 * If users can be writing to this page using arbitrary
2682  			 * virtual addresses, take care about potential aliasing
2683  			 * before reading the page on the kernel side.
2684  			 */
2685  			if (mapping_writably_mapped(mapping))
2686  				flush_dcache_page(page);
2687  			/*
2688  			 * Mark the page accessed if we read the beginning.
2689  			 */
2690  			if (!offset)
2691  				folio_mark_accessed(folio);
2692  			/*
2693  			 * Ok, we have the page, and it's up-to-date, so
2694  			 * now we can copy it to user space...
2695  			 */
2696  			ret = copy_page_to_iter(page, offset, nr, to);
2697  			folio_put(folio);
2698  
2699  		} else if (user_backed_iter(to)) {
2700  			/*
2701  			 * Copying to user space tends to be well optimized,
2702  			 * and clear_user() not so much, so it is noticeably
2703  			 * faster to copy the zero page instead of clearing.
2704  			 */
2705  			ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
2706  		} else {
2707  			/*
2708  			 * But submitting the same page twice in a row to
2709  			 * splice() - or others? - can result in confusion:
2710  			 * so don't attempt that optimization on pipes etc.
2711  			 */
2712  			ret = iov_iter_zero(nr, to);
2713  		}
2714  
2715  		retval += ret;
2716  		offset += ret;
2717  		index += offset >> PAGE_SHIFT;
2718  		offset &= ~PAGE_MASK;
2719  
2720  		if (!iov_iter_count(to))
2721  			break;
2722  		if (ret < nr) {
2723  			error = -EFAULT;
2724  			break;
2725  		}
2726  		cond_resched();
2727  	}
2728  
2729  	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
2730  	file_accessed(file);
2731  	return retval ? retval : error;
2732  }
2733  
2734  static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
2735  			      struct pipe_buffer *buf)
2736  {
2737  	return true;
2738  }
2739  
2740  static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
2741  				  struct pipe_buffer *buf)
2742  {
2743  }
2744  
2745  static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
2746  				    struct pipe_buffer *buf)
2747  {
2748  	return false;
2749  }
2750  
2751  static const struct pipe_buf_operations zero_pipe_buf_ops = {
2752  	.release	= zero_pipe_buf_release,
2753  	.try_steal	= zero_pipe_buf_try_steal,
2754  	.get		= zero_pipe_buf_get,
2755  };
2756  
2757  static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
2758  					loff_t fpos, size_t size)
2759  {
2760  	size_t offset = fpos & ~PAGE_MASK;
2761  
2762  	size = min_t(size_t, size, PAGE_SIZE - offset);
2763  
2764  	if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
2765  		struct pipe_buffer *buf = pipe_head_buf(pipe);
2766  
2767  		*buf = (struct pipe_buffer) {
2768  			.ops	= &zero_pipe_buf_ops,
2769  			.page	= ZERO_PAGE(0),
2770  			.offset	= offset,
2771  			.len	= size,
2772  		};
2773  		pipe->head++;
2774  	}
2775  
2776  	return size;
2777  }
2778  
2779  static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
2780  				      struct pipe_inode_info *pipe,
2781  				      size_t len, unsigned int flags)
2782  {
2783  	struct inode *inode = file_inode(in);
2784  	struct address_space *mapping = inode->i_mapping;
2785  	struct folio *folio = NULL;
2786  	size_t total_spliced = 0, used, npages, n, part;
2787  	loff_t isize;
2788  	int error = 0;
2789  
2790  	/* Work out how much data we can actually add into the pipe */
2791  	used = pipe_occupancy(pipe->head, pipe->tail);
2792  	npages = max_t(ssize_t, pipe->max_usage - used, 0);
2793  	len = min_t(size_t, len, npages * PAGE_SIZE);
2794  
2795  	do {
2796  		if (*ppos >= i_size_read(inode))
2797  			break;
2798  
2799  		error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
2800  		if (error) {
2801  			if (error == -EINVAL)
2802  				error = 0;
2803  			break;
2804  		}
2805  		if (folio) {
2806  			folio_unlock(folio);
2807  
2808  			if (folio_test_hwpoison(folio)) {
2809  				error = -EIO;
2810  				break;
2811  			}
2812  		}
2813  
2814  		/*
2815  		 * i_size must be checked after we know the pages are Uptodate.
2816  		 *
2817  		 * Checking i_size only after that check allows us to calculate
2818  		 * the correct value for "part", which means the zero-filled
2819  		 * part of the page is not copied back to userspace (unless
2820  		 * another truncate extends the file - this is desired though).
2821  		 */
2822  		isize = i_size_read(inode);
2823  		if (unlikely(*ppos >= isize))
2824  			break;
2825  		part = min_t(loff_t, isize - *ppos, len);
2826  
2827  		if (folio) {
2828  			/*
2829  			 * If users can be writing to this page using arbitrary
2830  			 * virtual addresses, take care about potential aliasing
2831  			 * before reading the page on the kernel side.
2832  			 */
2833  			if (mapping_writably_mapped(mapping))
2834  				flush_dcache_folio(folio);
2835  			folio_mark_accessed(folio);
2836  			/*
2837  			 * Ok, we have the page, and it's up-to-date, so we can
2838  			 * now splice it into the pipe.
2839  			 */
2840  			n = splice_folio_into_pipe(pipe, folio, *ppos, part);
2841  			folio_put(folio);
2842  			folio = NULL;
2843  		} else {
2844  			n = splice_zeropage_into_pipe(pipe, *ppos, len);
2845  		}
2846  
2847  		if (!n)
2848  			break;
2849  		len -= n;
2850  		total_spliced += n;
2851  		*ppos += n;
2852  		in->f_ra.prev_pos = *ppos;
2853  		if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
2854  			break;
2855  
2856  		cond_resched();
2857  	} while (len);
2858  
2859  	if (folio)
2860  		folio_put(folio);
2861  
2862  	file_accessed(in);
2863  	return total_spliced ? total_spliced : error;
2864  }
2865  
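/*
 * Editor's illustration (not part of shmem.c): the splice-read path above is
 * what a plain userspace splice() from a tmpfs file into a pipe exercises;
 * holes are fed to the pipe as zero pages via splice_zeropage_into_pipe().
 * A minimal sketch:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *
 *	ssize_t tmpfs_to_pipe(int tmpfs_fd, int pipe_wr_fd, loff_t *off)
 *	{
 *		// Moves up to 64 KiB without bouncing through a user buffer.
 *		return splice(tmpfs_fd, off, pipe_wr_fd, NULL, 64 * 1024, 0);
 *	}
 */
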
2866  static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
2867  {
2868  	struct address_space *mapping = file->f_mapping;
2869  	struct inode *inode = mapping->host;
2870  
2871  	if (whence != SEEK_DATA && whence != SEEK_HOLE)
2872  		return generic_file_llseek_size(file, offset, whence,
2873  					MAX_LFS_FILESIZE, i_size_read(inode));
2874  	if (offset < 0)
2875  		return -ENXIO;
2876  
2877  	inode_lock(inode);
2878  	/* We're holding i_rwsem so we can access i_size directly */
2879  	offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
2880  	if (offset >= 0)
2881  		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
2882  	inode_unlock(inode);
2883  	return offset;
2884  }
2885  
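/*
 * Editor's illustration (not part of shmem.c): SEEK_DATA/SEEK_HOLE on tmpfs,
 * served by shmem_file_llseek() -> mapping_seek_hole_data() above.  A minimal
 * sketch that finds the next data extent starting at 'pos':
 *
 *	#define _GNU_SOURCE
 *	#include <unistd.h>
 *
 *	int next_extent(int fd, off_t pos, off_t *data, off_t *hole)
 *	{
 *		*data = lseek(fd, pos, SEEK_DATA);   // ENXIO if only hole/EOF remains
 *		if (*data < 0)
 *			return -1;
 *		*hole = lseek(fd, *data, SEEK_HOLE); // every file ends in a virtual hole
 *		return *hole < 0 ? -1 : 0;
 *	}
 */
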
2886  static long shmem_fallocate(struct file *file, int mode, loff_t offset,
2887  							 loff_t len)
2888  {
2889  	struct inode *inode = file_inode(file);
2890  	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2891  	struct shmem_inode_info *info = SHMEM_I(inode);
2892  	struct shmem_falloc shmem_falloc;
2893  	pgoff_t start, index, end, undo_fallocend;
2894  	int error;
2895  
2896  	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2897  		return -EOPNOTSUPP;
2898  
2899  	inode_lock(inode);
2900  
2901  	if (mode & FALLOC_FL_PUNCH_HOLE) {
2902  		struct address_space *mapping = file->f_mapping;
2903  		loff_t unmap_start = round_up(offset, PAGE_SIZE);
2904  		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
2905  		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
2906  
2907  		/* protected by i_rwsem */
2908  		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
2909  			error = -EPERM;
2910  			goto out;
2911  		}
2912  
2913  		shmem_falloc.waitq = &shmem_falloc_waitq;
2914  		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
2915  		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
2916  		spin_lock(&inode->i_lock);
2917  		inode->i_private = &shmem_falloc;
2918  		spin_unlock(&inode->i_lock);
2919  
2920  		if ((u64)unmap_end > (u64)unmap_start)
2921  			unmap_mapping_range(mapping, unmap_start,
2922  					    1 + unmap_end - unmap_start, 0);
2923  		shmem_truncate_range(inode, offset, offset + len - 1);
2924  		/* No need to unmap again: hole-punching leaves COWed pages */
2925  
2926  		spin_lock(&inode->i_lock);
2927  		inode->i_private = NULL;
2928  		wake_up_all(&shmem_falloc_waitq);
2929  		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
2930  		spin_unlock(&inode->i_lock);
2931  		error = 0;
2932  		goto out;
2933  	}
2934  
2935  	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
2936  	error = inode_newsize_ok(inode, offset + len);
2937  	if (error)
2938  		goto out;
2939  
2940  	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
2941  		error = -EPERM;
2942  		goto out;
2943  	}
2944  
2945  	start = offset >> PAGE_SHIFT;
2946  	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2947  	/* Try to avoid a swapstorm if len is impossible to satisfy */
2948  	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
2949  		error = -ENOSPC;
2950  		goto out;
2951  	}
2952  
2953  	shmem_falloc.waitq = NULL;
2954  	shmem_falloc.start = start;
2955  	shmem_falloc.next  = start;
2956  	shmem_falloc.nr_falloced = 0;
2957  	shmem_falloc.nr_unswapped = 0;
2958  	spin_lock(&inode->i_lock);
2959  	inode->i_private = &shmem_falloc;
2960  	spin_unlock(&inode->i_lock);
2961  
2962  	/*
2963  	 * info->fallocend is only relevant when huge pages might be
2964  	 * involved: to prevent split_huge_page() freeing fallocated pages
2965  	 * that an FALLOC_FL_KEEP_SIZE fallocate has committed beyond i_size.
2966  	 */
2967  	undo_fallocend = info->fallocend;
2968  	if (info->fallocend < end)
2969  		info->fallocend = end;
2970  
2971  	for (index = start; index < end; ) {
2972  		struct folio *folio;
2973  
2974  		/*
2975  		 * Good, the fallocate(2) manpage permits EINTR: we may have
2976  		 * been interrupted because we are using up too much memory.
2977  		 */
2978  		if (signal_pending(current))
2979  			error = -EINTR;
2980  		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
2981  			error = -ENOMEM;
2982  		else
2983  			error = shmem_get_folio(inode, index, &folio,
2984  						SGP_FALLOC);
2985  		if (error) {
2986  			info->fallocend = undo_fallocend;
2987  			/* Remove the !uptodate folios we added */
2988  			if (index > start) {
2989  				shmem_undo_range(inode,
2990  				    (loff_t)start << PAGE_SHIFT,
2991  				    ((loff_t)index << PAGE_SHIFT) - 1, true);
2992  			}
2993  			goto undone;
2994  		}
2995  
2996  		/*
2997  		 * Here is a more important optimization than it appears:
2998  		 * a second SGP_FALLOC on the same large folio will clear it,
2999  		 * making it uptodate and un-undoable if we fail later.
3000  		 */
3001  		index = folio_next_index(folio);
3002  		/* Beware 32-bit wraparound */
3003  		if (!index)
3004  			index--;
3005  
3006  		/*
3007  		 * Inform shmem_writepage() how far we have reached.
3008  		 * No need for lock or barrier: we have the page lock.
3009  		 */
3010  		if (!folio_test_uptodate(folio))
3011  			shmem_falloc.nr_falloced += index - shmem_falloc.next;
3012  		shmem_falloc.next = index;
3013  
3014  		/*
3015  		 * If !uptodate, leave it that way so that freeable folios
3016  		 * can be recognized if we need to rollback on error later.
3017  		 * But mark it dirty so that memory pressure will swap rather
3018  		 * than free the folios we are allocating (and SGP_CACHE folios
3019  		 * might still be clean: we now need to mark those dirty too).
3020  		 */
3021  		folio_mark_dirty(folio);
3022  		folio_unlock(folio);
3023  		folio_put(folio);
3024  		cond_resched();
3025  	}
3026  
3027  	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
3028  		i_size_write(inode, offset + len);
3029  undone:
3030  	spin_lock(&inode->i_lock);
3031  	inode->i_private = NULL;
3032  	spin_unlock(&inode->i_lock);
3033  out:
3034  	if (!error)
3035  		file_modified(file);
3036  	inode_unlock(inode);
3037  	return error;
3038  }
3039  
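/*
 * Editor's illustration (not part of shmem.c): the two fallocate(2) modes
 * handled above, seen from userspace.  Hole-punching frees the pages backing
 * the range (reads then return zeroes) and leaves i_size alone; plain
 * preallocation instantiates pages up front and may extend i_size.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	int punch_hole(int fd, off_t offset, off_t len)
 *	{
 *		return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *				 offset, len);
 *	}
 *
 *	int preallocate(int fd, off_t offset, off_t len)
 *	{
 *		return fallocate(fd, 0, offset, len);	// may fail with ENOSPC/EINTR
 *	}
 */
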
3040  static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
3041  {
3042  	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
3043  
3044  	buf->f_type = TMPFS_MAGIC;
3045  	buf->f_bsize = PAGE_SIZE;
3046  	buf->f_namelen = NAME_MAX;
3047  	if (sbinfo->max_blocks) {
3048  		buf->f_blocks = sbinfo->max_blocks;
3049  		buf->f_bavail =
3050  		buf->f_bfree  = sbinfo->max_blocks -
3051  				percpu_counter_sum(&sbinfo->used_blocks);
3052  	}
3053  	if (sbinfo->max_inodes) {
3054  		buf->f_files = sbinfo->max_inodes;
3055  		buf->f_ffree = sbinfo->free_inodes;
3056  	}
3057  	/* else leave those fields 0 like simple_statfs */
3058  
3059  	buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
3060  
3061  	return 0;
3062  }
3063  
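/*
 * Editor's illustration (not part of shmem.c): what the statfs numbers above
 * look like from userspace.  f_blocks/f_bfree follow the size= / used_blocks
 * accounting, f_files/f_ffree follow nr_inodes, and both stay 0 when the
 * corresponding limit is unset.
 *
 *	#include <sys/statvfs.h>
 *
 *	unsigned long long tmpfs_free_bytes(const char *mountpoint)
 *	{
 *		struct statvfs sv;
 *
 *		if (statvfs(mountpoint, &sv) < 0)
 *			return 0;
 *		return (unsigned long long)sv.f_bavail * sv.f_frsize;
 *	}
 */
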
3064  /*
3065   * File creation. Allocate an inode, and we're done.
3066   */
3067  static int
3068  shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
3069  	    struct dentry *dentry, umode_t mode, dev_t dev)
3070  {
3071  	struct inode *inode;
3072  	int error = -ENOSPC;
3073  
3074  	inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
3075  	if (inode) {
3076  		error = simple_acl_create(dir, inode);
3077  		if (error)
3078  			goto out_iput;
3079  		error = security_inode_init_security(inode, dir,
3080  						     &dentry->d_name,
3081  						     shmem_initxattrs, NULL);
3082  		if (error && error != -EOPNOTSUPP)
3083  			goto out_iput;
3084  
3085  		error = 0;
3086  		dir->i_size += BOGO_DIRENT_SIZE;
3087  		dir->i_ctime = dir->i_mtime = current_time(dir);
3088  		inode_inc_iversion(dir);
3089  		d_instantiate(dentry, inode);
3090  		dget(dentry); /* Extra count - pin the dentry in core */
3091  	}
3092  	return error;
3093  out_iput:
3094  	iput(inode);
3095  	return error;
3096  }
3097  
3098  static int
3099  shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
3100  	      struct file *file, umode_t mode)
3101  {
3102  	struct inode *inode;
3103  	int error = -ENOSPC;
3104  
3105  	inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
3106  	if (inode) {
3107  		error = security_inode_init_security(inode, dir,
3108  						     NULL,
3109  						     shmem_initxattrs, NULL);
3110  		if (error && error != -EOPNOTSUPP)
3111  			goto out_iput;
3112  		error = simple_acl_create(dir, inode);
3113  		if (error)
3114  			goto out_iput;
3115  		d_tmpfile(file, inode);
3116  	}
3117  	return finish_open_simple(file, error);
3118  out_iput:
3119  	iput(inode);
3120  	return error;
3121  }
3122  
3123  static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
3124  		       struct dentry *dentry, umode_t mode)
3125  {
3126  	int error;
3127  
3128  	error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
3129  	if (error)
3130  		return error;
3131  	inc_nlink(dir);
3132  	return 0;
3133  }
3134  
3135  static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
3136  			struct dentry *dentry, umode_t mode, bool excl)
3137  {
3138  	return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
3139  }
3140  
3141  /*
3142   * Link a file.
3143   */
3144  static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
3145  {
3146  	struct inode *inode = d_inode(old_dentry);
3147  	int ret = 0;
3148  
3149  	/*
3150  	 * No ordinary (disk based) filesystem counts links as inodes;
3151  	 * but each new link needs a new dentry, pinning lowmem, and
3152  	 * tmpfs dentries cannot be pruned until they are unlinked.
3153  	 * But if an O_TMPFILE file is linked into the tmpfs, the
3154  	 * first link must skip that, to get the accounting right.
3155  	 */
3156  	if (inode->i_nlink) {
3157  		ret = shmem_reserve_inode(inode->i_sb, NULL);
3158  		if (ret)
3159  			goto out;
3160  	}
3161  
3162  	dir->i_size += BOGO_DIRENT_SIZE;
3163  	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
3164  	inode_inc_iversion(dir);
3165  	inc_nlink(inode);
3166  	ihold(inode);	/* New dentry reference */
3167  	dget(dentry);		/* Extra pinning count for the created dentry */
3168  	d_instantiate(dentry, inode);
3169  out:
3170  	return ret;
3171  }
3172  
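/*
 * Editor's illustration (not part of shmem.c): the O_TMPFILE case called out
 * in the comment above.  The unnamed file starts with i_nlink == 0, so its
 * first linkat() must not reserve another inode.  A minimal sketch using the
 * usual /proc/self/fd trick:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int publish_tmpfile(const char *tmpfs_dir, const char *name)
 *	{
 *		char path[64];
 *		int fd = open(tmpfs_dir, O_TMPFILE | O_RDWR, 0600);
 *
 *		if (fd < 0)
 *			return -1;
 *		snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
 *		if (linkat(AT_FDCWD, path, AT_FDCWD, name, AT_SYMLINK_FOLLOW) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		return fd;
 *	}
 */
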
3173  static int shmem_unlink(struct inode *dir, struct dentry *dentry)
3174  {
3175  	struct inode *inode = d_inode(dentry);
3176  
3177  	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
3178  		shmem_free_inode(inode->i_sb);
3179  
3180  	dir->i_size -= BOGO_DIRENT_SIZE;
3181  	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
3182  	inode_inc_iversion(dir);
3183  	drop_nlink(inode);
3184  	dput(dentry);	/* Undo the count from "create" - this does all the work */
3185  	return 0;
3186  }
3187  
3188  static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
3189  {
3190  	if (!simple_empty(dentry))
3191  		return -ENOTEMPTY;
3192  
3193  	drop_nlink(d_inode(dentry));
3194  	drop_nlink(dir);
3195  	return shmem_unlink(dir, dentry);
3196  }
3197  
3198  static int shmem_whiteout(struct mnt_idmap *idmap,
3199  			  struct inode *old_dir, struct dentry *old_dentry)
3200  {
3201  	struct dentry *whiteout;
3202  	int error;
3203  
3204  	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
3205  	if (!whiteout)
3206  		return -ENOMEM;
3207  
3208  	error = shmem_mknod(idmap, old_dir, whiteout,
3209  			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
3210  	dput(whiteout);
3211  	if (error)
3212  		return error;
3213  
3214  	/*
3215  	 * Cheat and hash the whiteout while the old dentry is still in
3216  	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
3217  	 *
3218  	 * d_lookup() will consistently find one of them at this point,
3219  	 * not sure which one, but that isn't even important.
3220  	 */
3221  	d_rehash(whiteout);
3222  	return 0;
3223  }
3224  
3225  /*
3226   * The VFS layer already does all the dentry stuff for rename;
3227   * we just have to decrement the usage count for the target if
3228   * it exists, so that the VFS layer correctly frees it when it
3229   * gets overwritten.
3230   */
3231  static int shmem_rename2(struct mnt_idmap *idmap,
3232  			 struct inode *old_dir, struct dentry *old_dentry,
3233  			 struct inode *new_dir, struct dentry *new_dentry,
3234  			 unsigned int flags)
3235  {
3236  	struct inode *inode = d_inode(old_dentry);
3237  	int they_are_dirs = S_ISDIR(inode->i_mode);
3238  
3239  	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3240  		return -EINVAL;
3241  
3242  	if (flags & RENAME_EXCHANGE)
3243  		return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
3244  
3245  	if (!simple_empty(new_dentry))
3246  		return -ENOTEMPTY;
3247  
3248  	if (flags & RENAME_WHITEOUT) {
3249  		int error;
3250  
3251  		error = shmem_whiteout(idmap, old_dir, old_dentry);
3252  		if (error)
3253  			return error;
3254  	}
3255  
3256  	if (d_really_is_positive(new_dentry)) {
3257  		(void) shmem_unlink(new_dir, new_dentry);
3258  		if (they_are_dirs) {
3259  			drop_nlink(d_inode(new_dentry));
3260  			drop_nlink(old_dir);
3261  		}
3262  	} else if (they_are_dirs) {
3263  		drop_nlink(old_dir);
3264  		inc_nlink(new_dir);
3265  	}
3266  
3267  	old_dir->i_size -= BOGO_DIRENT_SIZE;
3268  	new_dir->i_size += BOGO_DIRENT_SIZE;
3269  	old_dir->i_ctime = old_dir->i_mtime =
3270  	new_dir->i_ctime = new_dir->i_mtime =
3271  	inode->i_ctime = current_time(old_dir);
3272  	inode_inc_iversion(old_dir);
3273  	inode_inc_iversion(new_dir);
3274  	return 0;
3275  }
3276  
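/*
 * Editor's illustration (not part of shmem.c): the rename flags handled above,
 * driven from userspace via renameat2(2).  Assumes a glibc recent enough to
 * provide renameat2() and the RENAME_* flags (otherwise use
 * syscall(SYS_renameat2, ...) together with <linux/fs.h>).
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	// Atomically swap two names: shmem_rename2() hands RENAME_EXCHANGE
 *	// straight to simple_rename_exchange().
 *	int swap_names(const char *a, const char *b)
 *	{
 *		return renameat2(AT_FDCWD, a, AT_FDCWD, b, RENAME_EXCHANGE);
 *	}
 */
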
3277  static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
3278  			 struct dentry *dentry, const char *symname)
3279  {
3280  	int error;
3281  	int len;
3282  	struct inode *inode;
3283  	struct folio *folio;
3284  
3285  	len = strlen(symname) + 1;
3286  	if (len > PAGE_SIZE)
3287  		return -ENAMETOOLONG;
3288  
3289  	inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
3290  				VM_NORESERVE);
3291  	if (!inode)
3292  		return -ENOSPC;
3293  
3294  	error = security_inode_init_security(inode, dir, &dentry->d_name,
3295  					     shmem_initxattrs, NULL);
3296  	if (error && error != -EOPNOTSUPP) {
3297  		iput(inode);
3298  		return error;
3299  	}
3300  
3301  	inode->i_size = len-1;
3302  	if (len <= SHORT_SYMLINK_LEN) {
3303  		inode->i_link = kmemdup(symname, len, GFP_KERNEL);
3304  		if (!inode->i_link) {
3305  			iput(inode);
3306  			return -ENOMEM;
3307  		}
3308  		inode->i_op = &shmem_short_symlink_operations;
3309  	} else {
3310  		inode_nohighmem(inode);
3311  		error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
3312  		if (error) {
3313  			iput(inode);
3314  			return error;
3315  		}
3316  		inode->i_mapping->a_ops = &shmem_aops;
3317  		inode->i_op = &shmem_symlink_inode_operations;
3318  		memcpy(folio_address(folio), symname, len);
3319  		folio_mark_uptodate(folio);
3320  		folio_mark_dirty(folio);
3321  		folio_unlock(folio);
3322  		folio_put(folio);
3323  	}
3324  	dir->i_size += BOGO_DIRENT_SIZE;
3325  	dir->i_ctime = dir->i_mtime = current_time(dir);
3326  	inode_inc_iversion(dir);
3327  	d_instantiate(dentry, inode);
3328  	dget(dentry);
3329  	return 0;
3330  }
3331  
3332  static void shmem_put_link(void *arg)
3333  {
3334  	folio_mark_accessed(arg);
3335  	folio_put(arg);
3336  }
3337  
3338  static const char *shmem_get_link(struct dentry *dentry,
3339  				  struct inode *inode,
3340  				  struct delayed_call *done)
3341  {
3342  	struct folio *folio = NULL;
3343  	int error;
3344  
3345  	if (!dentry) {
3346  		folio = filemap_get_folio(inode->i_mapping, 0);
3347  		if (IS_ERR(folio))
3348  			return ERR_PTR(-ECHILD);
3349  		if (PageHWPoison(folio_page(folio, 0)) ||
3350  		    !folio_test_uptodate(folio)) {
3351  			folio_put(folio);
3352  			return ERR_PTR(-ECHILD);
3353  		}
3354  	} else {
3355  		error = shmem_get_folio(inode, 0, &folio, SGP_READ);
3356  		if (error)
3357  			return ERR_PTR(error);
3358  		if (!folio)
3359  			return ERR_PTR(-ECHILD);
3360  		if (PageHWPoison(folio_page(folio, 0))) {
3361  			folio_unlock(folio);
3362  			folio_put(folio);
3363  			return ERR_PTR(-ECHILD);
3364  		}
3365  		folio_unlock(folio);
3366  	}
3367  	set_delayed_call(done, shmem_put_link, folio);
3368  	return folio_address(folio);
3369  }
3370  
3371  #ifdef CONFIG_TMPFS_XATTR
3372  
3373  static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
3374  {
3375  	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3376  
3377  	fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
3378  
3379  	return 0;
3380  }
3381  
3382  static int shmem_fileattr_set(struct mnt_idmap *idmap,
3383  			      struct dentry *dentry, struct fileattr *fa)
3384  {
3385  	struct inode *inode = d_inode(dentry);
3386  	struct shmem_inode_info *info = SHMEM_I(inode);
3387  
3388  	if (fileattr_has_fsx(fa))
3389  		return -EOPNOTSUPP;
3390  	if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
3391  		return -EOPNOTSUPP;
3392  
3393  	info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
3394  		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
3395  
3396  	shmem_set_inode_flags(inode, info->fsflags);
3397  	inode->i_ctime = current_time(inode);
3398  	inode_inc_iversion(inode);
3399  	return 0;
3400  }
3401  
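/*
 * Editor's illustration (not part of shmem.c): the chattr-style flags above
 * are reached from userspace through the FS_IOC_GETFLAGS/FS_IOC_SETFLAGS
 * ioctls, which the VFS routes to shmem_fileattr_get()/shmem_fileattr_set().
 * Setting FS_IMMUTABLE_FL needs CAP_LINUX_IMMUTABLE.
 *
 *	#include <linux/fs.h>
 *	#include <sys/ioctl.h>
 *
 *	int make_immutable(int fd)
 *	{
 *		int attr;
 *
 *		if (ioctl(fd, FS_IOC_GETFLAGS, &attr) < 0)
 *			return -1;
 *		attr |= FS_IMMUTABLE_FL;
 *		return ioctl(fd, FS_IOC_SETFLAGS, &attr);
 *	}
 */
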
3402  /*
3403   * Superblocks without xattr inode operations may get some security.* xattr
3404   * support from the LSM "for free". As soon as we have any other xattrs
3405   * like ACLs, we also need to implement the security.* handlers at
3406   * filesystem level, though.
3407   */
3408  
3409  /*
3410   * Callback for security_inode_init_security() for acquiring xattrs.
3411   */
3412  static int shmem_initxattrs(struct inode *inode,
3413  			    const struct xattr *xattr_array,
3414  			    void *fs_info)
3415  {
3416  	struct shmem_inode_info *info = SHMEM_I(inode);
3417  	const struct xattr *xattr;
3418  	struct simple_xattr *new_xattr;
3419  	size_t len;
3420  
3421  	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
3422  		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
3423  		if (!new_xattr)
3424  			return -ENOMEM;
3425  
3426  		len = strlen(xattr->name) + 1;
3427  		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
3428  					  GFP_KERNEL);
3429  		if (!new_xattr->name) {
3430  			kvfree(new_xattr);
3431  			return -ENOMEM;
3432  		}
3433  
3434  		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
3435  		       XATTR_SECURITY_PREFIX_LEN);
3436  		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
3437  		       xattr->name, len);
3438  
3439  		simple_xattr_add(&info->xattrs, new_xattr);
3440  	}
3441  
3442  	return 0;
3443  }
3444  
3445  static int shmem_xattr_handler_get(const struct xattr_handler *handler,
3446  				   struct dentry *unused, struct inode *inode,
3447  				   const char *name, void *buffer, size_t size)
3448  {
3449  	struct shmem_inode_info *info = SHMEM_I(inode);
3450  
3451  	name = xattr_full_name(handler, name);
3452  	return simple_xattr_get(&info->xattrs, name, buffer, size);
3453  }
3454  
3455  static int shmem_xattr_handler_set(const struct xattr_handler *handler,
3456  				   struct mnt_idmap *idmap,
3457  				   struct dentry *unused, struct inode *inode,
3458  				   const char *name, const void *value,
3459  				   size_t size, int flags)
3460  {
3461  	struct shmem_inode_info *info = SHMEM_I(inode);
3462  	int err;
3463  
3464  	name = xattr_full_name(handler, name);
3465  	err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
3466  	if (!err) {
3467  		inode->i_ctime = current_time(inode);
3468  		inode_inc_iversion(inode);
3469  	}
3470  	return err;
3471  }
3472  
3473  static const struct xattr_handler shmem_security_xattr_handler = {
3474  	.prefix = XATTR_SECURITY_PREFIX,
3475  	.get = shmem_xattr_handler_get,
3476  	.set = shmem_xattr_handler_set,
3477  };
3478  
3479  static const struct xattr_handler shmem_trusted_xattr_handler = {
3480  	.prefix = XATTR_TRUSTED_PREFIX,
3481  	.get = shmem_xattr_handler_get,
3482  	.set = shmem_xattr_handler_set,
3483  };
3484  
3485  static const struct xattr_handler *shmem_xattr_handlers[] = {
3486  	&shmem_security_xattr_handler,
3487  	&shmem_trusted_xattr_handler,
3488  	NULL
3489  };
3490  
3491  static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
3492  {
3493  	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
3494  	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
3495  }
3496  #endif /* CONFIG_TMPFS_XATTR */
3497  
3498  static const struct inode_operations shmem_short_symlink_operations = {
3499  	.getattr	= shmem_getattr,
3500  	.get_link	= simple_get_link,
3501  #ifdef CONFIG_TMPFS_XATTR
3502  	.listxattr	= shmem_listxattr,
3503  #endif
3504  };
3505  
3506  static const struct inode_operations shmem_symlink_inode_operations = {
3507  	.getattr	= shmem_getattr,
3508  	.get_link	= shmem_get_link,
3509  #ifdef CONFIG_TMPFS_XATTR
3510  	.listxattr	= shmem_listxattr,
3511  #endif
3512  };
3513  
3514  static struct dentry *shmem_get_parent(struct dentry *child)
3515  {
3516  	return ERR_PTR(-ESTALE);
3517  }
3518  
3519  static int shmem_match(struct inode *ino, void *vfh)
3520  {
3521  	__u32 *fh = vfh;
3522  	__u64 inum = fh[2];
3523  	inum = (inum << 32) | fh[1];
3524  	return ino->i_ino == inum && fh[0] == ino->i_generation;
3525  }
3526  
3527  /* Find any alias of inode, but prefer a hashed alias */
3528  static struct dentry *shmem_find_alias(struct inode *inode)
3529  {
3530  	struct dentry *alias = d_find_alias(inode);
3531  
3532  	return alias ?: d_find_any_alias(inode);
3533  }
3534  
3536  static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
3537  		struct fid *fid, int fh_len, int fh_type)
3538  {
3539  	struct inode *inode;
3540  	struct dentry *dentry = NULL;
3541  	u64 inum;
3542  
3543  	if (fh_len < 3)
3544  		return NULL;
3545  
3546  	inum = fid->raw[2];
3547  	inum = (inum << 32) | fid->raw[1];
3548  
3549  	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
3550  			shmem_match, fid->raw);
3551  	if (inode) {
3552  		dentry = shmem_find_alias(inode);
3553  		iput(inode);
3554  	}
3555  
3556  	return dentry;
3557  }
3558  
3559  static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
3560  				struct inode *parent)
3561  {
3562  	if (*len < 3) {
3563  		*len = 3;
3564  		return FILEID_INVALID;
3565  	}
3566  
3567  	if (inode_unhashed(inode)) {
3568  		/* Unfortunately insert_inode_hash is not idempotent,
3569  		 * so as we hash inodes here rather than at creation
3570  		 * time, we need a lock to ensure we only try
3571  		 * to do it once
3572  		 */
3573  		static DEFINE_SPINLOCK(lock);
3574  		spin_lock(&lock);
3575  		if (inode_unhashed(inode))
3576  			__insert_inode_hash(inode,
3577  					    inode->i_ino + inode->i_generation);
3578  		spin_unlock(&lock);
3579  	}
3580  
3581  	fh[0] = inode->i_generation;
3582  	fh[1] = inode->i_ino;
3583  	fh[2] = ((__u64)inode->i_ino) >> 32;
3584  
3585  	*len = 3;
3586  	return 1;
3587  }
3588  
3589  static const struct export_operations shmem_export_ops = {
3590  	.get_parent     = shmem_get_parent,
3591  	.encode_fh      = shmem_encode_fh,
3592  	.fh_to_dentry	= shmem_fh_to_dentry,
3593  };
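
/*
 * For illustration, the 3-word file handle built by shmem_encode_fh() is
 *
 *	fh[0] = i_generation
 *	fh[1] = low 32 bits of i_ino
 *	fh[2] = high 32 bits of i_ino
 *
 * and shmem_match() reassembles the inode number as
 * ((__u64)fh[2] << 32) | fh[1] when shmem_fh_to_dentry() looks it back up.
 */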
3594  
3595  enum shmem_param {
3596  	Opt_gid,
3597  	Opt_huge,
3598  	Opt_mode,
3599  	Opt_mpol,
3600  	Opt_nr_blocks,
3601  	Opt_nr_inodes,
3602  	Opt_size,
3603  	Opt_uid,
3604  	Opt_inode32,
3605  	Opt_inode64,
3606  	Opt_noswap,
3607  };
3608  
3609  static const struct constant_table shmem_param_enums_huge[] = {
3610  	{"never",	SHMEM_HUGE_NEVER },
3611  	{"always",	SHMEM_HUGE_ALWAYS },
3612  	{"within_size",	SHMEM_HUGE_WITHIN_SIZE },
3613  	{"advise",	SHMEM_HUGE_ADVISE },
3614  	{}
3615  };
3616  
3617  const struct fs_parameter_spec shmem_fs_parameters[] = {
3618  	fsparam_u32   ("gid",		Opt_gid),
3619  	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
3620  	fsparam_u32oct("mode",		Opt_mode),
3621  	fsparam_string("mpol",		Opt_mpol),
3622  	fsparam_string("nr_blocks",	Opt_nr_blocks),
3623  	fsparam_string("nr_inodes",	Opt_nr_inodes),
3624  	fsparam_string("size",		Opt_size),
3625  	fsparam_u32   ("uid",		Opt_uid),
3626  	fsparam_flag  ("inode32",	Opt_inode32),
3627  	fsparam_flag  ("inode64",	Opt_inode64),
3628  	fsparam_flag  ("noswap",	Opt_noswap),
3629  	{}
3630  };
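
/*
 * For illustration, a mount exercising most of these parameters (the mount
 * point and values are arbitrary examples) could be
 *
 *	mount -t tmpfs -o size=50%,nr_inodes=1m,mode=1777,huge=within_size,inode64 tmpfs /mnt
 *
 * "size", "nr_blocks" and "nr_inodes" accept memparse() suffixes, "size"
 * also accepts a trailing '%' of total RAM, "huge" takes one of the strings
 * from shmem_param_enums_huge[], and "inode32"/"inode64"/"noswap" are flags.
 */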
3631  
3632  static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
3633  {
3634  	struct shmem_options *ctx = fc->fs_private;
3635  	struct fs_parse_result result;
3636  	unsigned long long size;
3637  	char *rest;
3638  	int opt;
3639  
3640  	opt = fs_parse(fc, shmem_fs_parameters, param, &result);
3641  	if (opt < 0)
3642  		return opt;
3643  
3644  	switch (opt) {
3645  	case Opt_size:
3646  		size = memparse(param->string, &rest);
3647  		if (*rest == '%') {
3648  			size <<= PAGE_SHIFT;
3649  			size *= totalram_pages();
3650  			do_div(size, 100);
3651  			rest++;
3652  		}
3653  		if (*rest)
3654  			goto bad_value;
3655  		ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
3656  		ctx->seen |= SHMEM_SEEN_BLOCKS;
3657  		break;
3658  	case Opt_nr_blocks:
3659  		ctx->blocks = memparse(param->string, &rest);
3660  		if (*rest || ctx->blocks > S64_MAX)
3661  			goto bad_value;
3662  		ctx->seen |= SHMEM_SEEN_BLOCKS;
3663  		break;
3664  	case Opt_nr_inodes:
3665  		ctx->inodes = memparse(param->string, &rest);
3666  		if (*rest)
3667  			goto bad_value;
3668  		ctx->seen |= SHMEM_SEEN_INODES;
3669  		break;
3670  	case Opt_mode:
3671  		ctx->mode = result.uint_32 & 07777;
3672  		break;
3673  	case Opt_uid:
3674  		ctx->uid = make_kuid(current_user_ns(), result.uint_32);
3675  		if (!uid_valid(ctx->uid))
3676  			goto bad_value;
3677  		break;
3678  	case Opt_gid:
3679  		ctx->gid = make_kgid(current_user_ns(), result.uint_32);
3680  		if (!gid_valid(ctx->gid))
3681  			goto bad_value;
3682  		break;
3683  	case Opt_huge:
3684  		ctx->huge = result.uint_32;
3685  		if (ctx->huge != SHMEM_HUGE_NEVER &&
3686  		    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
3687  		      has_transparent_hugepage()))
3688  			goto unsupported_parameter;
3689  		ctx->seen |= SHMEM_SEEN_HUGE;
3690  		break;
3691  	case Opt_mpol:
3692  		if (IS_ENABLED(CONFIG_NUMA)) {
3693  			mpol_put(ctx->mpol);
3694  			ctx->mpol = NULL;
3695  			if (mpol_parse_str(param->string, &ctx->mpol))
3696  				goto bad_value;
3697  			break;
3698  		}
3699  		goto unsupported_parameter;
3700  	case Opt_inode32:
3701  		ctx->full_inums = false;
3702  		ctx->seen |= SHMEM_SEEN_INUMS;
3703  		break;
3704  	case Opt_inode64:
3705  		if (sizeof(ino_t) < 8) {
3706  			return invalfc(fc,
3707  				       "Cannot use inode64 with <64bit inums in kernel\n");
3708  		}
3709  		ctx->full_inums = true;
3710  		ctx->seen |= SHMEM_SEEN_INUMS;
3711  		break;
3712  	case Opt_noswap:
3713  		if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
3714  			return invalfc(fc,
3715  				       "Turning off swap in unprivileged tmpfs mounts unsupported");
3716  		}
3717  		ctx->noswap = true;
3718  		ctx->seen |= SHMEM_SEEN_NOSWAP;
3719  		break;
3720  	}
3721  	return 0;
3722  
3723  unsupported_parameter:
3724  	return invalfc(fc, "Unsupported parameter '%s'", param->key);
3725  bad_value:
3726  	return invalfc(fc, "Bad value for '%s'", param->key);
3727  }
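
/*
 * Worked example for the Opt_size '%' branch above: "size=50%" parses to 50,
 * which is shifted by PAGE_SHIFT, multiplied by totalram_pages() and divided
 * by 100, i.e. half of physical RAM in bytes; DIV_ROUND_UP() then converts
 * that back to a page count, so ctx->blocks ends up as roughly
 * totalram_pages() / 2.
 */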
3728  
3729  static int shmem_parse_options(struct fs_context *fc, void *data)
3730  {
3731  	char *options = data;
3732  
3733  	if (options) {
3734  		int err = security_sb_eat_lsm_opts(options, &fc->security);
3735  		if (err)
3736  			return err;
3737  	}
3738  
3739  	while (options != NULL) {
3740  		char *this_char = options;
3741  		for (;;) {
3742  			/*
3743  			 * NUL-terminate this option: unfortunately,
3744  			 * mount options form a comma-separated list,
3745  			 * but mpol's nodelist may also contain commas.
3746  			 */
3747  			options = strchr(options, ',');
3748  			if (options == NULL)
3749  				break;
3750  			options++;
3751  			if (!isdigit(*options)) {
3752  				options[-1] = '\0';
3753  				break;
3754  			}
3755  		}
3756  		if (*this_char) {
3757  			char *value = strchr(this_char, '=');
3758  			size_t len = 0;
3759  			int err;
3760  
3761  			if (value) {
3762  				*value++ = '\0';
3763  				len = strlen(value);
3764  			}
3765  			err = vfs_parse_fs_string(fc, this_char, value, len);
3766  			if (err < 0)
3767  				return err;
3768  		}
3769  	}
3770  	return 0;
3771  }
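
/*
 * For illustration of the splitting rule above: given the monolithic string
 * "mpol=bind:0,2,size=1G", the comma before '2' is followed by a digit and is
 * left in place as part of the mpol nodelist, while the comma before 's' is
 * overwritten with '\0'; the loop therefore hands "mpol=bind:0,2" and then
 * "size=1G" to vfs_parse_fs_string().
 */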
3772  
3773  /*
3774   * Reconfigure a shmem filesystem.
3775   *
3776   * Note that we disallow change from limited->unlimited blocks/inodes while any
3777   * are in use; but we must separately disallow unlimited->limited, because in
3778   * that case we have no record of how much is already in use.
3779   */
3780  static int shmem_reconfigure(struct fs_context *fc)
3781  {
3782  	struct shmem_options *ctx = fc->fs_private;
3783  	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
3784  	unsigned long inodes;
3785  	struct mempolicy *mpol = NULL;
3786  	const char *err;
3787  
3788  	raw_spin_lock(&sbinfo->stat_lock);
3789  	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
3790  
3791  	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
3792  		if (!sbinfo->max_blocks) {
3793  			err = "Cannot retroactively limit size";
3794  			goto out;
3795  		}
3796  		if (percpu_counter_compare(&sbinfo->used_blocks,
3797  					   ctx->blocks) > 0) {
3798  			err = "Too small a size for current use";
3799  			goto out;
3800  		}
3801  	}
3802  	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
3803  		if (!sbinfo->max_inodes) {
3804  			err = "Cannot retroactively limit inodes";
3805  			goto out;
3806  		}
3807  		if (ctx->inodes < inodes) {
3808  			err = "Too few inodes for current use";
3809  			goto out;
3810  		}
3811  	}
3812  
3813  	if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
3814  	    sbinfo->next_ino > UINT_MAX) {
3815  		err = "Current inum too high to switch to 32-bit inums";
3816  		goto out;
3817  	}
3818  	if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
3819  		err = "Cannot disable swap on remount";
3820  		goto out;
3821  	}
3822  	if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
3823  		err = "Cannot enable swap on remount if it was disabled on first mount";
3824  		goto out;
3825  	}
3826  
3827  	if (ctx->seen & SHMEM_SEEN_HUGE)
3828  		sbinfo->huge = ctx->huge;
3829  	if (ctx->seen & SHMEM_SEEN_INUMS)
3830  		sbinfo->full_inums = ctx->full_inums;
3831  	if (ctx->seen & SHMEM_SEEN_BLOCKS)
3832  		sbinfo->max_blocks  = ctx->blocks;
3833  	if (ctx->seen & SHMEM_SEEN_INODES) {
3834  		sbinfo->max_inodes  = ctx->inodes;
3835  		sbinfo->free_inodes = ctx->inodes - inodes;
3836  	}
3837  
3838  	/*
3839  	 * Preserve previous mempolicy unless mpol remount option was specified.
3840  	 */
3841  	if (ctx->mpol) {
3842  		mpol = sbinfo->mpol;
3843  		sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
3844  		ctx->mpol = NULL;
3845  	}
3846  
3847  	if (ctx->noswap)
3848  		sbinfo->noswap = true;
3849  
3850  	raw_spin_unlock(&sbinfo->stat_lock);
3851  	mpol_put(mpol);
3852  	return 0;
3853  out:
3854  	raw_spin_unlock(&sbinfo->stat_lock);
3855  	return invalfc(fc, "%s", err);
3856  }
3857  
3858  static int shmem_show_options(struct seq_file *seq, struct dentry *root)
3859  {
3860  	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
3861  	struct mempolicy *mpol;
3862  
3863  	if (sbinfo->max_blocks != shmem_default_max_blocks())
3864  		seq_printf(seq, ",size=%luk",
3865  			sbinfo->max_blocks << (PAGE_SHIFT - 10));
3866  	if (sbinfo->max_inodes != shmem_default_max_inodes())
3867  		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
3868  	if (sbinfo->mode != (0777 | S_ISVTX))
3869  		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
3870  	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
3871  		seq_printf(seq, ",uid=%u",
3872  				from_kuid_munged(&init_user_ns, sbinfo->uid));
3873  	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
3874  		seq_printf(seq, ",gid=%u",
3875  				from_kgid_munged(&init_user_ns, sbinfo->gid));
3876  
3877  	/*
3878  	 * Showing inode{64,32} might be useful even if it's the system default,
3879  	 * since then people don't have to resort to checking both here and
3880  	 * /proc/config.gz (which may not even exist if IKCONFIG_PROC isn't
3881  	 * enabled) to confirm that 64-bit inums were successfully applied.
3882  	 *
3883  	 * We hide it when inode64 isn't the default and we are using 32-bit
3884  	 * inodes, since that probably just means the feature isn't even under
3885  	 * consideration.
3886  	 *
3887  	 * As such:
3888  	 *
3889  	 *                     +-----------------+-----------------+
3890  	 *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
3891  	 *  +------------------+-----------------+-----------------+
3892  	 *  | full_inums=true  | show            | show            |
3893  	 *  | full_inums=false | show            | hide            |
3894  	 *  +------------------+-----------------+-----------------+
3895  	 *
3896  	 */
3897  	if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
3898  		seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
3899  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3900  	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
3901  	if (sbinfo->huge)
3902  		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
3903  #endif
3904  	mpol = shmem_get_sbmpol(sbinfo);
3905  	shmem_show_mpol(seq, mpol);
3906  	mpol_put(mpol);
3907  	if (sbinfo->noswap)
3908  		seq_printf(seq, ",noswap");
3909  	return 0;
3910  }
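
/*
 * For illustration (values are examples): a tmpfs mounted with
 * "size=1G,uid=1000,inode64" would typically appear in /proc/mounts with
 * ",size=1048576k,uid=1000,inode64" appended to the generic mount flags,
 * while options still at their defaults (mode, gid, nr_inodes here) are
 * suppressed by the comparisons above.
 */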
3911  
3912  #endif /* CONFIG_TMPFS */
3913  
3914  static void shmem_put_super(struct super_block *sb)
3915  {
3916  	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3917  
3918  	free_percpu(sbinfo->ino_batch);
3919  	percpu_counter_destroy(&sbinfo->used_blocks);
3920  	mpol_put(sbinfo->mpol);
3921  	kfree(sbinfo);
3922  	sb->s_fs_info = NULL;
3923  }
3924  
3925  static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
3926  {
3927  	struct shmem_options *ctx = fc->fs_private;
3928  	struct inode *inode;
3929  	struct shmem_sb_info *sbinfo;
3930  
3931  	/* Round up to L1_CACHE_BYTES to resist false sharing */
3932  	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
3933  				L1_CACHE_BYTES), GFP_KERNEL);
3934  	if (!sbinfo)
3935  		return -ENOMEM;
3936  
3937  	sb->s_fs_info = sbinfo;
3938  
3939  #ifdef CONFIG_TMPFS
3940  	/*
3941  	 * By default we only allow half of the physical RAM per
3942  	 * tmpfs instance, limiting inodes to one per page of lowmem;
3943  	 * but the internal instance is left unlimited.
3944  	 */
3945  	if (!(sb->s_flags & SB_KERNMOUNT)) {
3946  		if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
3947  			ctx->blocks = shmem_default_max_blocks();
3948  		if (!(ctx->seen & SHMEM_SEEN_INODES))
3949  			ctx->inodes = shmem_default_max_inodes();
3950  		if (!(ctx->seen & SHMEM_SEEN_INUMS))
3951  			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
3952  		sbinfo->noswap = ctx->noswap;
3953  	} else {
3954  		sb->s_flags |= SB_NOUSER;
3955  	}
3956  	sb->s_export_op = &shmem_export_ops;
3957  	sb->s_flags |= SB_NOSEC | SB_I_VERSION;
3958  #else
3959  	sb->s_flags |= SB_NOUSER;
3960  #endif
3961  	sbinfo->max_blocks = ctx->blocks;
3962  	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
3963  	if (sb->s_flags & SB_KERNMOUNT) {
3964  		sbinfo->ino_batch = alloc_percpu(ino_t);
3965  		if (!sbinfo->ino_batch)
3966  			goto failed;
3967  	}
3968  	sbinfo->uid = ctx->uid;
3969  	sbinfo->gid = ctx->gid;
3970  	sbinfo->full_inums = ctx->full_inums;
3971  	sbinfo->mode = ctx->mode;
3972  	sbinfo->huge = ctx->huge;
3973  	sbinfo->mpol = ctx->mpol;
3974  	ctx->mpol = NULL;
3975  
3976  	raw_spin_lock_init(&sbinfo->stat_lock);
3977  	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
3978  		goto failed;
3979  	spin_lock_init(&sbinfo->shrinklist_lock);
3980  	INIT_LIST_HEAD(&sbinfo->shrinklist);
3981  
3982  	sb->s_maxbytes = MAX_LFS_FILESIZE;
3983  	sb->s_blocksize = PAGE_SIZE;
3984  	sb->s_blocksize_bits = PAGE_SHIFT;
3985  	sb->s_magic = TMPFS_MAGIC;
3986  	sb->s_op = &shmem_ops;
3987  	sb->s_time_gran = 1;
3988  #ifdef CONFIG_TMPFS_XATTR
3989  	sb->s_xattr = shmem_xattr_handlers;
3990  #endif
3991  #ifdef CONFIG_TMPFS_POSIX_ACL
3992  	sb->s_flags |= SB_POSIXACL;
3993  #endif
3994  	uuid_gen(&sb->s_uuid);
3995  
3996  	inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
3997  				VM_NORESERVE);
3998  	if (!inode)
3999  		goto failed;
4000  	inode->i_uid = sbinfo->uid;
4001  	inode->i_gid = sbinfo->gid;
4002  	sb->s_root = d_make_root(inode);
4003  	if (!sb->s_root)
4004  		goto failed;
4005  	return 0;
4006  
4007  failed:
4008  	shmem_put_super(sb);
4009  	return -ENOMEM;
4010  }
4011  
4012  static int shmem_get_tree(struct fs_context *fc)
4013  {
4014  	return get_tree_nodev(fc, shmem_fill_super);
4015  }
4016  
4017  static void shmem_free_fc(struct fs_context *fc)
4018  {
4019  	struct shmem_options *ctx = fc->fs_private;
4020  
4021  	if (ctx) {
4022  		mpol_put(ctx->mpol);
4023  		kfree(ctx);
4024  	}
4025  }
4026  
4027  static const struct fs_context_operations shmem_fs_context_ops = {
4028  	.free			= shmem_free_fc,
4029  	.get_tree		= shmem_get_tree,
4030  #ifdef CONFIG_TMPFS
4031  	.parse_monolithic	= shmem_parse_options,
4032  	.parse_param		= shmem_parse_one,
4033  	.reconfigure		= shmem_reconfigure,
4034  #endif
4035  };
4036  
4037  static struct kmem_cache *shmem_inode_cachep;
4038  
4039  static struct inode *shmem_alloc_inode(struct super_block *sb)
4040  {
4041  	struct shmem_inode_info *info;
4042  	info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
4043  	if (!info)
4044  		return NULL;
4045  	return &info->vfs_inode;
4046  }
4047  
4048  static void shmem_free_in_core_inode(struct inode *inode)
4049  {
4050  	if (S_ISLNK(inode->i_mode))
4051  		kfree(inode->i_link);
4052  	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
4053  }
4054  
4055  static void shmem_destroy_inode(struct inode *inode)
4056  {
4057  	if (S_ISREG(inode->i_mode))
4058  		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
4059  }
4060  
4061  static void shmem_init_inode(void *foo)
4062  {
4063  	struct shmem_inode_info *info = foo;
4064  	inode_init_once(&info->vfs_inode);
4065  }
4066  
4067  static void shmem_init_inodecache(void)
4068  {
4069  	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
4070  				sizeof(struct shmem_inode_info),
4071  				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
4072  }
4073  
4074  static void shmem_destroy_inodecache(void)
4075  {
4076  	kmem_cache_destroy(shmem_inode_cachep);
4077  }
4078  
4079  /* Keep the page in page cache instead of truncating it */
4080  static int shmem_error_remove_page(struct address_space *mapping,
4081  				   struct page *page)
4082  {
4083  	return 0;
4084  }
4085  
4086  const struct address_space_operations shmem_aops = {
4087  	.writepage	= shmem_writepage,
4088  	.dirty_folio	= noop_dirty_folio,
4089  #ifdef CONFIG_TMPFS
4090  	.write_begin	= shmem_write_begin,
4091  	.write_end	= shmem_write_end,
4092  #endif
4093  #ifdef CONFIG_MIGRATION
4094  	.migrate_folio	= migrate_folio,
4095  #endif
4096  	.error_remove_page = shmem_error_remove_page,
4097  };
4098  EXPORT_SYMBOL(shmem_aops);
4099  
4100  static const struct file_operations shmem_file_operations = {
4101  	.mmap		= shmem_mmap,
4102  	.open		= generic_file_open,
4103  	.get_unmapped_area = shmem_get_unmapped_area,
4104  #ifdef CONFIG_TMPFS
4105  	.llseek		= shmem_file_llseek,
4106  	.read_iter	= shmem_file_read_iter,
4107  	.write_iter	= generic_file_write_iter,
4108  	.fsync		= noop_fsync,
4109  	.splice_read	= shmem_file_splice_read,
4110  	.splice_write	= iter_file_splice_write,
4111  	.fallocate	= shmem_fallocate,
4112  #endif
4113  };
4114  
4115  static const struct inode_operations shmem_inode_operations = {
4116  	.getattr	= shmem_getattr,
4117  	.setattr	= shmem_setattr,
4118  #ifdef CONFIG_TMPFS_XATTR
4119  	.listxattr	= shmem_listxattr,
4120  	.set_acl	= simple_set_acl,
4121  	.fileattr_get	= shmem_fileattr_get,
4122  	.fileattr_set	= shmem_fileattr_set,
4123  #endif
4124  };
4125  
4126  static const struct inode_operations shmem_dir_inode_operations = {
4127  #ifdef CONFIG_TMPFS
4128  	.getattr	= shmem_getattr,
4129  	.create		= shmem_create,
4130  	.lookup		= simple_lookup,
4131  	.link		= shmem_link,
4132  	.unlink		= shmem_unlink,
4133  	.symlink	= shmem_symlink,
4134  	.mkdir		= shmem_mkdir,
4135  	.rmdir		= shmem_rmdir,
4136  	.mknod		= shmem_mknod,
4137  	.rename		= shmem_rename2,
4138  	.tmpfile	= shmem_tmpfile,
4139  #endif
4140  #ifdef CONFIG_TMPFS_XATTR
4141  	.listxattr	= shmem_listxattr,
4142  	.fileattr_get	= shmem_fileattr_get,
4143  	.fileattr_set	= shmem_fileattr_set,
4144  #endif
4145  #ifdef CONFIG_TMPFS_POSIX_ACL
4146  	.setattr	= shmem_setattr,
4147  	.set_acl	= simple_set_acl,
4148  #endif
4149  };
4150  
4151  static const struct inode_operations shmem_special_inode_operations = {
4152  	.getattr	= shmem_getattr,
4153  #ifdef CONFIG_TMPFS_XATTR
4154  	.listxattr	= shmem_listxattr,
4155  #endif
4156  #ifdef CONFIG_TMPFS_POSIX_ACL
4157  	.setattr	= shmem_setattr,
4158  	.set_acl	= simple_set_acl,
4159  #endif
4160  };
4161  
4162  static const struct super_operations shmem_ops = {
4163  	.alloc_inode	= shmem_alloc_inode,
4164  	.free_inode	= shmem_free_in_core_inode,
4165  	.destroy_inode	= shmem_destroy_inode,
4166  #ifdef CONFIG_TMPFS
4167  	.statfs		= shmem_statfs,
4168  	.show_options	= shmem_show_options,
4169  #endif
4170  	.evict_inode	= shmem_evict_inode,
4171  	.drop_inode	= generic_delete_inode,
4172  	.put_super	= shmem_put_super,
4173  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4174  	.nr_cached_objects	= shmem_unused_huge_count,
4175  	.free_cached_objects	= shmem_unused_huge_scan,
4176  #endif
4177  };
4178  
4179  static const struct vm_operations_struct shmem_vm_ops = {
4180  	.fault		= shmem_fault,
4181  	.map_pages	= filemap_map_pages,
4182  #ifdef CONFIG_NUMA
4183  	.set_policy     = shmem_set_policy,
4184  	.get_policy     = shmem_get_policy,
4185  #endif
4186  };
4187  
4188  static const struct vm_operations_struct shmem_anon_vm_ops = {
4189  	.fault		= shmem_fault,
4190  	.map_pages	= filemap_map_pages,
4191  #ifdef CONFIG_NUMA
4192  	.set_policy     = shmem_set_policy,
4193  	.get_policy     = shmem_get_policy,
4194  #endif
4195  };
4196  
4197  int shmem_init_fs_context(struct fs_context *fc)
4198  {
4199  	struct shmem_options *ctx;
4200  
4201  	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
4202  	if (!ctx)
4203  		return -ENOMEM;
4204  
4205  	ctx->mode = 0777 | S_ISVTX;
4206  	ctx->uid = current_fsuid();
4207  	ctx->gid = current_fsgid();
4208  
4209  	fc->fs_private = ctx;
4210  	fc->ops = &shmem_fs_context_ops;
4211  	return 0;
4212  }
4213  
4214  static struct file_system_type shmem_fs_type = {
4215  	.owner		= THIS_MODULE,
4216  	.name		= "tmpfs",
4217  	.init_fs_context = shmem_init_fs_context,
4218  #ifdef CONFIG_TMPFS
4219  	.parameters	= shmem_fs_parameters,
4220  #endif
4221  	.kill_sb	= kill_litter_super,
4222  #ifdef CONFIG_SHMEM
4223  	.fs_flags	= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
4224  #else
4225  	.fs_flags	= FS_USERNS_MOUNT,
4226  #endif
4227  };
4228  
4229  void __init shmem_init(void)
4230  {
4231  	int error;
4232  
4233  	shmem_init_inodecache();
4234  
4235  	error = register_filesystem(&shmem_fs_type);
4236  	if (error) {
4237  		pr_err("Could not register tmpfs\n");
4238  		goto out2;
4239  	}
4240  
4241  	shm_mnt = kern_mount(&shmem_fs_type);
4242  	if (IS_ERR(shm_mnt)) {
4243  		error = PTR_ERR(shm_mnt);
4244  		pr_err("Could not kern_mount tmpfs\n");
4245  		goto out1;
4246  	}
4247  
4248  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4249  	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
4250  		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4251  	else
4252  		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
4253  #endif
4254  	return;
4255  
4256  out1:
4257  	unregister_filesystem(&shmem_fs_type);
4258  out2:
4259  	shmem_destroy_inodecache();
4260  	shm_mnt = ERR_PTR(error);
4261  }
4262  
4263  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
4264  static ssize_t shmem_enabled_show(struct kobject *kobj,
4265  				  struct kobj_attribute *attr, char *buf)
4266  {
4267  	static const int values[] = {
4268  		SHMEM_HUGE_ALWAYS,
4269  		SHMEM_HUGE_WITHIN_SIZE,
4270  		SHMEM_HUGE_ADVISE,
4271  		SHMEM_HUGE_NEVER,
4272  		SHMEM_HUGE_DENY,
4273  		SHMEM_HUGE_FORCE,
4274  	};
4275  	int len = 0;
4276  	int i;
4277  
4278  	for (i = 0; i < ARRAY_SIZE(values); i++) {
4279  		len += sysfs_emit_at(buf, len,
4280  				     shmem_huge == values[i] ? "%s[%s]" : "%s%s",
4281  				     i ? " " : "",
4282  				     shmem_format_huge(values[i]));
4283  	}
4284  
4285  	len += sysfs_emit_at(buf, len, "\n");
4286  
4287  	return len;
4288  }
4289  
4290  static ssize_t shmem_enabled_store(struct kobject *kobj,
4291  		struct kobj_attribute *attr, const char *buf, size_t count)
4292  {
4293  	char tmp[16];
4294  	int huge;
4295  
4296  	if (count + 1 > sizeof(tmp))
4297  		return -EINVAL;
4298  	memcpy(tmp, buf, count);
4299  	tmp[count] = '\0';
4300  	if (count && tmp[count - 1] == '\n')
4301  		tmp[count - 1] = '\0';
4302  
4303  	huge = shmem_parse_huge(tmp);
4304  	if (huge == -EINVAL)
4305  		return -EINVAL;
4306  	if (!has_transparent_hugepage() &&
4307  			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
4308  		return -EINVAL;
4309  
4310  	shmem_huge = huge;
4311  	if (shmem_huge > SHMEM_HUGE_DENY)
4312  		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
4313  	return count;
4314  }
4315  
4316  struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
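
/*
 * This attribute is the tmpfs counterpart of the other THP knobs, typically
 * exposed as /sys/kernel/mm/transparent_hugepage/shmem_enabled.  With
 * shmem_huge left at SHMEM_HUGE_NEVER, shmem_enabled_show() emits
 *
 *	always within_size advise [never] deny force
 *
 * and writing one of those words back updates shmem_huge via
 * shmem_enabled_store().
 */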
4317  #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
4318  
4319  #else /* !CONFIG_SHMEM */
4320  
4321  /*
4322   * tiny-shmem: simple shmemfs and tmpfs using ramfs code
4323   *
4324   * This is intended for small systems where the benefits of the full
4325   * shmem code (swap-backed and resource-limited) are outweighed by
4326   * its complexity. On systems without swap this code should be
4327   * effectively equivalent, but much lighter weight.
4328   */
4329  
4330  static struct file_system_type shmem_fs_type = {
4331  	.name		= "tmpfs",
4332  	.init_fs_context = ramfs_init_fs_context,
4333  	.parameters	= ramfs_fs_parameters,
4334  	.kill_sb	= ramfs_kill_sb,
4335  	.fs_flags	= FS_USERNS_MOUNT,
4336  };
4337  
4338  void __init shmem_init(void)
4339  {
4340  	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
4341  
4342  	shm_mnt = kern_mount(&shmem_fs_type);
4343  	BUG_ON(IS_ERR(shm_mnt));
4344  }
4345  
4346  int shmem_unuse(unsigned int type)
4347  {
4348  	return 0;
4349  }
4350  
4351  int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
4352  {
4353  	return 0;
4354  }
4355  
4356  void shmem_unlock_mapping(struct address_space *mapping)
4357  {
4358  }
4359  
4360  #ifdef CONFIG_MMU
4361  unsigned long shmem_get_unmapped_area(struct file *file,
4362  				      unsigned long addr, unsigned long len,
4363  				      unsigned long pgoff, unsigned long flags)
4364  {
4365  	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
4366  }
4367  #endif
4368  
4369  void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
4370  {
4371  	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
4372  }
4373  EXPORT_SYMBOL_GPL(shmem_truncate_range);
4374  
4375  #define shmem_vm_ops				generic_file_vm_ops
4376  #define shmem_anon_vm_ops			generic_file_vm_ops
4377  #define shmem_file_operations			ramfs_file_operations
4378  #define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
4379  #define shmem_acct_size(flags, size)		0
4380  #define shmem_unacct_size(flags, size)		do {} while (0)
4381  
4382  #endif /* CONFIG_SHMEM */
4383  
4384  /* common code */
4385  
4386  static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
4387  				       unsigned long flags, unsigned int i_flags)
4388  {
4389  	struct inode *inode;
4390  	struct file *res;
4391  
4392  	if (IS_ERR(mnt))
4393  		return ERR_CAST(mnt);
4394  
4395  	if (size < 0 || size > MAX_LFS_FILESIZE)
4396  		return ERR_PTR(-EINVAL);
4397  
4398  	if (shmem_acct_size(flags, size))
4399  		return ERR_PTR(-ENOMEM);
4400  
4401  	if (is_idmapped_mnt(mnt))
4402  		return ERR_PTR(-EINVAL);
4403  
4404  	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
4405  				S_IFREG | S_IRWXUGO, 0, flags);
4406  	if (unlikely(!inode)) {
4407  		shmem_unacct_size(flags, size);
4408  		return ERR_PTR(-ENOSPC);
4409  	}
4410  	inode->i_flags |= i_flags;
4411  	inode->i_size = size;
4412  	clear_nlink(inode);	/* It is unlinked */
4413  	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
4414  	if (!IS_ERR(res))
4415  		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
4416  				&shmem_file_operations);
4417  	if (IS_ERR(res))
4418  		iput(inode);
4419  	return res;
4420  }
4421  
4422  /**
4423   * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
4424   *	kernel internal.  There will be NO LSM permission checks against the
4425   *	underlying inode, so users of this interface must do their LSM checks
4426   *	at a higher layer.  The users are the big_key and shm implementations:
4427   *	LSM checks are provided at the key or shm level rather than at the inode.
4428   * @name: name for dentry (to be seen in /proc/<pid>/maps)
4429   * @size: size to be set for the file
4430   * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4431   */
4432  struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
4433  {
4434  	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
4435  }
4436  
4437  /**
4438   * shmem_file_setup - get an unlinked file living in tmpfs
4439   * @name: name for dentry (to be seen in /proc/<pid>/maps)
4440   * @size: size to be set for the file
4441   * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4442   */
4443  struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
4444  {
4445  	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
4446  }
4447  EXPORT_SYMBOL_GPL(shmem_file_setup);
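
/*
 * A minimal usage sketch (the name and size are arbitrary examples): an
 * in-kernel user gets an unlinked, swap-backed file and drops it with fput()
 * when finished.
 *
 *	struct file *filp;
 *
 *	filp = shmem_file_setup("example-buf", SZ_4M, VM_NORESERVE);
 *	if (IS_ERR(filp))
 *		return PTR_ERR(filp);
 *	...	use filp->f_mapping as ordinary, swappable page cache
 *	fput(filp);
 */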
4448  
4449  /**
4450   * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
4451   * @mnt: the tmpfs mount where the file will be created
4452   * @name: name for dentry (to be seen in /proc/<pid>/maps)
4453   * @size: size to be set for the file
4454   * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
4455   */
4456  struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
4457  				       loff_t size, unsigned long flags)
4458  {
4459  	return __shmem_file_setup(mnt, name, size, flags, 0);
4460  }
4461  EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
4462  
4463  /**
4464   * shmem_zero_setup - setup a shared anonymous mapping
4465   * @vma: the vma to be mmapped is prepared by do_mmap
4466   */
4467  int shmem_zero_setup(struct vm_area_struct *vma)
4468  {
4469  	struct file *file;
4470  	loff_t size = vma->vm_end - vma->vm_start;
4471  
4472  	/*
4473  	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
4474  	 * between XFS directory reading and selinux: since this file is only
4475  	 * accessible to the user through its mapping, use S_PRIVATE flag to
4476  	 * bypass file security, in the same way as shmem_kernel_file_setup().
4477  	 */
4478  	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
4479  	if (IS_ERR(file))
4480  		return PTR_ERR(file);
4481  
4482  	if (vma->vm_file)
4483  		fput(vma->vm_file);
4484  	vma->vm_file = file;
4485  	vma->vm_ops = &shmem_anon_vm_ops;
4486  
4487  	return 0;
4488  }
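
/*
 * For illustration: this is the path behind a userspace shared anonymous
 * mapping, e.g.
 *
 *	mmap(NULL, len, PROT_READ | PROT_WRITE,
 *	     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 *
 * The mmap() path calls shmem_zero_setup() for such VMAs, so the "anonymous"
 * memory is really backed by the unlinked "dev/zero" tmpfs file created above.
 */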
4489  
4490  /**
4491   * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
4492   * @mapping:	the folio's address_space
4493   * @index:	the folio index
4494   * @gfp:	the page allocator flags to use if allocating
4495   *
4496   * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
4497   * with any new page allocations done using the specified allocation flags.
4498   * But read_cache_page_gfp() uses the ->read_folio() method, which does not
4499   * suit tmpfs, since tmpfs may have pages in swapcache and needs to find those
4500   * for itself; drivers/gpu/drm i915 and ttm rely upon this support.
4501   *
4502   * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
4503   * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
4504   */
4505  struct folio *shmem_read_folio_gfp(struct address_space *mapping,
4506  		pgoff_t index, gfp_t gfp)
4507  {
4508  #ifdef CONFIG_SHMEM
4509  	struct inode *inode = mapping->host;
4510  	struct folio *folio;
4511  	int error;
4512  
4513  	BUG_ON(!shmem_mapping(mapping));
4514  	error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
4515  				  gfp, NULL, NULL, NULL);
4516  	if (error)
4517  		return ERR_PTR(error);
4518  
4519  	folio_unlock(folio);
4520  	return folio;
4521  #else
4522  	/*
4523  	 * The tiny !SHMEM case uses ramfs without swap
4524  	 */
4525  	return mapping_read_folio_gfp(mapping, index, gfp);
4526  #endif
4527  }
4528  EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
4529  
4530  struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
4531  					 pgoff_t index, gfp_t gfp)
4532  {
4533  	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
4534  	struct page *page;
4535  
4536  	if (IS_ERR(folio))
4537  		return &folio->page;
4538  
4539  	page = folio_file_page(folio, index);
4540  	if (PageHWPoison(page)) {
4541  		folio_put(folio);
4542  		return ERR_PTR(-EIO);
4543  	}
4544  
4545  	return page;
4546  }
4547  EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
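
/*
 * A minimal usage sketch in the i915/ttm style mentioned above (mapping is a
 * tmpfs file's ->f_mapping; index and error handling are illustrative):
 *
 *	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
 *	struct page *page;
 *
 *	page = shmem_read_mapping_page_gfp(mapping, index, gfp);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	...	pin, map or DMA from the page
 *	put_page(page);
 */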
4548