xref: /linux/mm/shmem.c (revision e9ef810dfee7a2227da9d423aecb0ced35faddbe)
1 /*
2  * Resizable virtual memory filesystem for Linux.
3  *
4  * Copyright (C) 2000 Linus Torvalds.
5  *		 2000 Transmeta Corp.
6  *		 2000-2001 Christoph Rohland
7  *		 2000-2001 SAP AG
8  *		 2002 Red Hat Inc.
9  * Copyright (C) 2002-2011 Hugh Dickins.
10  * Copyright (C) 2011 Google Inc.
11  * Copyright (C) 2002-2005 VERITAS Software Corporation.
12  * Copyright (C) 2004 Andi Kleen, SuSE Labs
13  *
14  * Extended attribute support for tmpfs:
15  * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
16  * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
17  *
18  * tiny-shmem:
19  * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
20  *
21  * This file is released under the GPL.
22  */
23 
24 #include <linux/fs.h>
25 #include <linux/init.h>
26 #include <linux/vfs.h>
27 #include <linux/mount.h>
28 #include <linux/ramfs.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include <linux/fileattr.h>
32 #include <linux/mm.h>
33 #include <linux/random.h>
34 #include <linux/sched/signal.h>
35 #include <linux/export.h>
36 #include <linux/shmem_fs.h>
37 #include <linux/swap.h>
38 #include <linux/uio.h>
39 #include <linux/hugetlb.h>
40 #include <linux/fs_parser.h>
41 #include <linux/swapfile.h>
42 #include <linux/iversion.h>
43 #include <linux/unicode.h>
44 #include "swap.h"
45 
46 static struct vfsmount *shm_mnt __ro_after_init;
47 
48 #ifdef CONFIG_SHMEM
49 /*
50  * This virtual memory filesystem is heavily based on the ramfs. It
51  * extends ramfs with the ability to use swap and to honor resource
52  * limits, which makes it a completely usable filesystem.
53  */
54 
55 #include <linux/xattr.h>
56 #include <linux/exportfs.h>
57 #include <linux/posix_acl.h>
58 #include <linux/posix_acl_xattr.h>
59 #include <linux/mman.h>
60 #include <linux/string.h>
61 #include <linux/slab.h>
62 #include <linux/backing-dev.h>
63 #include <linux/writeback.h>
64 #include <linux/pagevec.h>
65 #include <linux/percpu_counter.h>
66 #include <linux/falloc.h>
67 #include <linux/splice.h>
68 #include <linux/security.h>
69 #include <linux/swapops.h>
70 #include <linux/mempolicy.h>
71 #include <linux/namei.h>
72 #include <linux/ctype.h>
73 #include <linux/migrate.h>
74 #include <linux/highmem.h>
75 #include <linux/seq_file.h>
76 #include <linux/magic.h>
77 #include <linux/syscalls.h>
78 #include <linux/fcntl.h>
79 #include <uapi/linux/memfd.h>
80 #include <linux/rmap.h>
81 #include <linux/uuid.h>
82 #include <linux/quotaops.h>
83 #include <linux/rcupdate_wait.h>
84 
85 #include <linux/uaccess.h>
86 
87 #include "internal.h"
88 
89 #define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
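/*
 * Worked example, assuming the common 4KiB PAGE_SIZE: VM_ACCT(5000) is
 * PAGE_ALIGN(5000) >> PAGE_SHIFT = 8192 >> 12 = 2, i.e. a size is rounded
 * up to whole pages before being charged against the overcommit limits.
 */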
90 
91 /* Pretend that each entry is of this size in directory's i_size */
92 #define BOGO_DIRENT_SIZE 20
93 
94 /* Pretend that one inode + its dentry occupy this much memory */
95 #define BOGO_INODE_SIZE 1024
96 
97 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
98 #define SHORT_SYMLINK_LEN 128
99 
100 /*
101  * shmem_fallocate communicates with shmem_fault or shmem_writeout via
102  * inode->i_private (with i_rwsem making sure that it has only one user at
103  * a time): we would prefer not to enlarge the shmem inode just for that.
104  */
105 struct shmem_falloc {
106 	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
107 	pgoff_t start;		/* start of range currently being fallocated */
108 	pgoff_t next;		/* the next page offset to be fallocated */
109 	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
110 	pgoff_t nr_unswapped;	/* how often writeout refused to swap out */
111 };
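/*
 * Sketch of the intended handoff (simplified; the real fallocate path also
 * sets up waitq when punching a hole): the fallocating task publishes a
 * stack-allocated struct shmem_falloc through inode->i_private under
 * inode->i_lock, and clears it again when done, e.g.:
 *
 *	spin_lock(&inode->i_lock);
 *	inode->i_private = &shmem_falloc;
 *	spin_unlock(&inode->i_lock);
 *	... allocate or punch the range ...
 *	spin_lock(&inode->i_lock);
 *	inode->i_private = NULL;
 *	spin_unlock(&inode->i_lock);
 *
 * Readers such as shmem_writeout() sample inode->i_private under the same
 * i_lock; see the nr_unswapped update further down in this file.
 */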
112 
113 struct shmem_options {
114 	unsigned long long blocks;
115 	unsigned long long inodes;
116 	struct mempolicy *mpol;
117 	kuid_t uid;
118 	kgid_t gid;
119 	umode_t mode;
120 	bool full_inums;
121 	int huge;
122 	int seen;
123 	bool noswap;
124 	unsigned short quota_types;
125 	struct shmem_quota_limits qlimits;
126 #if IS_ENABLED(CONFIG_UNICODE)
127 	struct unicode_map *encoding;
128 	bool strict_encoding;
129 #endif
130 #define SHMEM_SEEN_BLOCKS 1
131 #define SHMEM_SEEN_INODES 2
132 #define SHMEM_SEEN_HUGE 4
133 #define SHMEM_SEEN_INUMS 8
134 #define SHMEM_SEEN_NOSWAP 16
135 #define SHMEM_SEEN_QUOTA 32
136 };
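/*
 * The SHMEM_SEEN_* bits accumulate in ->seen as the mount options are
 * parsed, so that remount can tell which options were given explicitly.
 * Sketch of a parser step:
 *
 *	ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
 *	ctx->seen |= SHMEM_SEEN_BLOCKS;
 */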
137 
138 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
139 static unsigned long huge_shmem_orders_always __read_mostly;
140 static unsigned long huge_shmem_orders_madvise __read_mostly;
141 static unsigned long huge_shmem_orders_inherit __read_mostly;
142 static unsigned long huge_shmem_orders_within_size __read_mostly;
143 static bool shmem_orders_configured __initdata;
144 #endif
145 
146 #ifdef CONFIG_TMPFS
147 static unsigned long shmem_default_max_blocks(void)
148 {
149 	return totalram_pages() / 2;
150 }
151 
152 static unsigned long shmem_default_max_inodes(void)
153 {
154 	unsigned long nr_pages = totalram_pages();
155 
156 	return min3(nr_pages - totalhigh_pages(), nr_pages / 2,
157 			ULONG_MAX / BOGO_INODE_SIZE);
158 }
159 #endif
160 
161 static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
162 			struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
163 			struct vm_area_struct *vma, vm_fault_t *fault_type);
164 
165 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
166 {
167 	return sb->s_fs_info;
168 }
169 
170 /*
171  * shmem_file_setup pre-accounts the whole fixed size of a VM object,
172  * for shared memory and for shared anonymous (/dev/zero) mappings
173  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
174  * consistent with the pre-accounting of private mappings ...
175  */
176 static inline int shmem_acct_size(unsigned long flags, loff_t size)
177 {
178 	return (flags & VM_NORESERVE) ?
179 		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
180 }
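/*
 * For example, with 4KiB pages a 1MiB shared mapping pre-accounts
 * VM_ACCT(1MiB) = 256 pages up front (unless VM_NORESERVE applies), and
 * gives them back via shmem_unacct_size() when the object is destroyed.
 */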
181 
182 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
183 {
184 	if (!(flags & VM_NORESERVE))
185 		vm_unacct_memory(VM_ACCT(size));
186 }
187 
188 static inline int shmem_reacct_size(unsigned long flags,
189 		loff_t oldsize, loff_t newsize)
190 {
191 	if (!(flags & VM_NORESERVE)) {
192 		if (VM_ACCT(newsize) > VM_ACCT(oldsize))
193 			return security_vm_enough_memory_mm(current->mm,
194 					VM_ACCT(newsize) - VM_ACCT(oldsize));
195 		else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
196 			vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
197 	}
198 	return 0;
199 }
200 
201 /*
202  * ... whereas tmpfs objects are accounted incrementally as
203  * pages are allocated, in order to allow large sparse files.
204  * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
205  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
206  */
207 static inline int shmem_acct_blocks(unsigned long flags, long pages)
208 {
209 	if (!(flags & VM_NORESERVE))
210 		return 0;
211 
212 	return security_vm_enough_memory_mm(current->mm,
213 			pages * VM_ACCT(PAGE_SIZE));
214 }
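/*
 * Note that VM_ACCT(PAGE_SIZE) == 1, so the charge above is exactly
 * "pages"; and because non-VM_NORESERVE objects were already accounted
 * in full by shmem_acct_size(), they return 0 here without charging.
 */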
215 
216 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
217 {
218 	if (flags & VM_NORESERVE)
219 		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
220 }
221 
222 static int shmem_inode_acct_blocks(struct inode *inode, long pages)
223 {
224 	struct shmem_inode_info *info = SHMEM_I(inode);
225 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
226 	int err = -ENOSPC;
227 
228 	if (shmem_acct_blocks(info->flags, pages))
229 		return err;
230 
231 	might_sleep();	/* when quotas */
232 	if (sbinfo->max_blocks) {
233 		if (!percpu_counter_limited_add(&sbinfo->used_blocks,
234 						sbinfo->max_blocks, pages))
235 			goto unacct;
236 
237 		err = dquot_alloc_block_nodirty(inode, pages);
238 		if (err) {
239 			percpu_counter_sub(&sbinfo->used_blocks, pages);
240 			goto unacct;
241 		}
242 	} else {
243 		err = dquot_alloc_block_nodirty(inode, pages);
244 		if (err)
245 			goto unacct;
246 	}
247 
248 	return 0;
249 
250 unacct:
251 	shmem_unacct_blocks(info->flags, pages);
252 	return err;
253 }
254 
255 static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
256 {
257 	struct shmem_inode_info *info = SHMEM_I(inode);
258 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
259 
260 	might_sleep();	/* when quotas */
261 	dquot_free_block_nodirty(inode, pages);
262 
263 	if (sbinfo->max_blocks)
264 		percpu_counter_sub(&sbinfo->used_blocks, pages);
265 	shmem_unacct_blocks(info->flags, pages);
266 }
267 
268 static const struct super_operations shmem_ops;
269 static const struct address_space_operations shmem_aops;
270 static const struct file_operations shmem_file_operations;
271 static const struct inode_operations shmem_inode_operations;
272 static const struct inode_operations shmem_dir_inode_operations;
273 static const struct inode_operations shmem_special_inode_operations;
274 static const struct vm_operations_struct shmem_vm_ops;
275 static const struct vm_operations_struct shmem_anon_vm_ops;
276 static struct file_system_type shmem_fs_type;
277 
278 bool shmem_mapping(struct address_space *mapping)
279 {
280 	return mapping->a_ops == &shmem_aops;
281 }
282 EXPORT_SYMBOL_GPL(shmem_mapping);
283 
284 bool vma_is_anon_shmem(struct vm_area_struct *vma)
285 {
286 	return vma->vm_ops == &shmem_anon_vm_ops;
287 }
288 
289 bool vma_is_shmem(struct vm_area_struct *vma)
290 {
291 	return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
292 }
293 
294 static LIST_HEAD(shmem_swaplist);
295 static DEFINE_SPINLOCK(shmem_swaplist_lock);
296 
297 #ifdef CONFIG_TMPFS_QUOTA
298 
299 static int shmem_enable_quotas(struct super_block *sb,
300 			       unsigned short quota_types)
301 {
302 	int type, err = 0;
303 
304 	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
305 	for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
306 		if (!(quota_types & (1 << type)))
307 			continue;
308 		err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
309 					  DQUOT_USAGE_ENABLED |
310 					  DQUOT_LIMITS_ENABLED);
311 		if (err)
312 			goto out_err;
313 	}
314 	return 0;
315 
316 out_err:
317 	pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
318 		type, err);
319 	for (type--; type >= 0; type--)
320 		dquot_quota_off(sb, type);
321 	return err;
322 }
323 
324 static void shmem_disable_quotas(struct super_block *sb)
325 {
326 	int type;
327 
328 	for (type = 0; type < SHMEM_MAXQUOTAS; type++)
329 		dquot_quota_off(sb, type);
330 }
331 
332 static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
333 {
334 	return SHMEM_I(inode)->i_dquot;
335 }
336 #endif /* CONFIG_TMPFS_QUOTA */
337 
338 /*
339  * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
340  * produces a novel ino for the newly allocated inode.
341  *
342  * It may also be called when making a hard link to permit the space needed by
343  * each dentry. However, in that case, no new inode number is needed since that
344  * internally draws from another pool of inode numbers (currently global
345  * get_next_ino()). This case is indicated by passing NULL as inop.
346  */
347 #define SHMEM_INO_BATCH 1024
348 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
349 {
350 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
351 	ino_t ino;
352 
353 	if (!(sb->s_flags & SB_KERNMOUNT)) {
354 		raw_spin_lock(&sbinfo->stat_lock);
355 		if (sbinfo->max_inodes) {
356 			if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
357 				raw_spin_unlock(&sbinfo->stat_lock);
358 				return -ENOSPC;
359 			}
360 			sbinfo->free_ispace -= BOGO_INODE_SIZE;
361 		}
362 		if (inop) {
363 			ino = sbinfo->next_ino++;
364 			if (unlikely(is_zero_ino(ino)))
365 				ino = sbinfo->next_ino++;
366 			if (unlikely(!sbinfo->full_inums &&
367 				     ino > UINT_MAX)) {
368 				/*
369 				 * Emulate get_next_ino uint wraparound for
370 				 * compatibility
371 				 */
372 				if (IS_ENABLED(CONFIG_64BIT))
373 					pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
374 						__func__, MINOR(sb->s_dev));
375 				sbinfo->next_ino = 1;
376 				ino = sbinfo->next_ino++;
377 			}
378 			*inop = ino;
379 		}
380 		raw_spin_unlock(&sbinfo->stat_lock);
381 	} else if (inop) {
382 		/*
383 		 * __shmem_file_setup, one of our callers, is lock-free: it
384 		 * doesn't hold stat_lock in shmem_reserve_inode since
385 		 * max_inodes is always 0, and is called from potentially
386 		 * unknown contexts. As such, use a per-cpu batched allocator
387 		 * which doesn't require the per-sb stat_lock unless we are at
388 		 * the batch boundary.
389 		 *
390 		 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
391 		 * shmem mounts are not exposed to userspace, so we don't need
392 		 * to worry about things like glibc compatibility.
393 		 */
394 		ino_t *next_ino;
395 
396 		next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
397 		ino = *next_ino;
398 		if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
399 			raw_spin_lock(&sbinfo->stat_lock);
400 			ino = sbinfo->next_ino;
401 			sbinfo->next_ino += SHMEM_INO_BATCH;
402 			raw_spin_unlock(&sbinfo->stat_lock);
403 			if (unlikely(is_zero_ino(ino)))
404 				ino++;
405 		}
406 		*inop = ino;
407 		*next_ino = ++ino;
408 		put_cpu();
409 	}
410 
411 	return 0;
412 }
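/*
 * Illustration of the SB_KERNMOUNT batching above: a CPU whose cached
 * next_ino has reached a multiple of SHMEM_INO_BATCH refills from
 * sbinfo->next_ino.  If that global counter stood at 2048, the CPU now
 * owns the range [2048, 3071], hands out 2048 for this inode and caches
 * 2049 for the next allocation on the same CPU, without taking stat_lock
 * again until the batch is exhausted.
 */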
413 
414 static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
415 {
416 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
417 	if (sbinfo->max_inodes) {
418 		raw_spin_lock(&sbinfo->stat_lock);
419 		sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
420 		raw_spin_unlock(&sbinfo->stat_lock);
421 	}
422 }
423 
424 /**
425  * shmem_recalc_inode - recalculate the block usage of an inode
426  * @inode: inode to recalc
427  * @alloced: the change in number of pages allocated to inode
428  * @swapped: the change in number of pages swapped from inode
429  *
430  * We have to calculate the free blocks since the mm can drop
431  * undirtied hole pages behind our back.
432  *
433  * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
434  * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
435  *
436  * Return: true if swapped was incremented from 0, for shmem_writeout().
437  */
438 static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
439 {
440 	struct shmem_inode_info *info = SHMEM_I(inode);
441 	bool first_swapped = false;
442 	long freed;
443 
444 	spin_lock(&info->lock);
445 	info->alloced += alloced;
446 	info->swapped += swapped;
447 	freed = info->alloced - info->swapped -
448 		READ_ONCE(inode->i_mapping->nrpages);
449 	/*
450 	 * Special case: whereas normally shmem_recalc_inode() is called
451 	 * after i_mapping->nrpages has already been adjusted (up or down),
452 	 * shmem_writeout() has to raise swapped before nrpages is lowered -
453 	 * to stop a racing shmem_recalc_inode() from thinking that a page has
454 	 * been freed.  Compensate here, to avoid the need for a followup call.
455 	 */
456 	if (swapped > 0) {
457 		if (info->swapped == swapped)
458 			first_swapped = true;
459 		freed += swapped;
460 	}
461 	if (freed > 0)
462 		info->alloced -= freed;
463 	spin_unlock(&info->lock);
464 
465 	/* The quota case may block */
466 	if (freed > 0)
467 		shmem_inode_unacct_blocks(inode, freed);
468 	return first_swapped;
469 }
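/*
 * Worked example for shmem_recalc_inode() above: if info->alloced ends up
 * at 8 with info->swapped 1 while i_mapping->nrpages is only 6, then
 * freed = 8 - 1 - 6 = 1: one undirtied hole page was dropped behind our
 * back, so alloced is trimmed to 7 and one block is unaccounted.
 */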
470 
471 bool shmem_charge(struct inode *inode, long pages)
472 {
473 	struct address_space *mapping = inode->i_mapping;
474 
475 	if (shmem_inode_acct_blocks(inode, pages))
476 		return false;
477 
478 	/* nrpages adjustment first, then shmem_recalc_inode() when balanced */
479 	xa_lock_irq(&mapping->i_pages);
480 	mapping->nrpages += pages;
481 	xa_unlock_irq(&mapping->i_pages);
482 
483 	shmem_recalc_inode(inode, pages, 0);
484 	return true;
485 }
486 
487 void shmem_uncharge(struct inode *inode, long pages)
488 {
489 	/* pages argument is currently unused: keep it to help debugging */
490 	/* nrpages adjustment done by __filemap_remove_folio() or caller */
491 
492 	shmem_recalc_inode(inode, 0, 0);
493 }
494 
495 /*
496  * Replace item expected in xarray by a new item, while holding xa_lock.
497  */
498 static int shmem_replace_entry(struct address_space *mapping,
499 			pgoff_t index, void *expected, void *replacement)
500 {
501 	XA_STATE(xas, &mapping->i_pages, index);
502 	void *item;
503 
504 	VM_BUG_ON(!expected);
505 	VM_BUG_ON(!replacement);
506 	item = xas_load(&xas);
507 	if (item != expected)
508 		return -ENOENT;
509 	xas_store(&xas, replacement);
510 	return 0;
511 }
512 
513 /*
514  * Sometimes, before we decide whether to proceed or to fail, we must check
515  * that an entry was not already brought back from swap by a racing thread.
516  *
517  * Checking folio is not enough: by the time a swapcache folio is locked, it
518  * might be reused, and again be swapcache, using the same swap as before.
519  */
520 static bool shmem_confirm_swap(struct address_space *mapping,
521 			       pgoff_t index, swp_entry_t swap)
522 {
523 	return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
524 }
525 
526 /*
527  * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
528  *
529  * SHMEM_HUGE_NEVER:
530  *	disables huge pages for the mount;
531  * SHMEM_HUGE_ALWAYS:
532  *	enables huge pages for the mount;
533  * SHMEM_HUGE_WITHIN_SIZE:
534  *	only allocate huge pages if the page will be fully within i_size,
535  *	also respect madvise() hints;
536  * SHMEM_HUGE_ADVISE:
537  *	only allocate huge pages if requested with madvise();
538  */
539 
540 #define SHMEM_HUGE_NEVER	0
541 #define SHMEM_HUGE_ALWAYS	1
542 #define SHMEM_HUGE_WITHIN_SIZE	2
543 #define SHMEM_HUGE_ADVISE	3
544 
545 /*
546  * Special values.
547  * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
548  *
549  * SHMEM_HUGE_DENY:
550  *	disables huge on shm_mnt and all mounts, for emergency use;
551  * SHMEM_HUGE_FORCE:
552  *	enables huge on shm_mnt and all mounts, w/o needing option, for testing;
553  *
554  */
555 #define SHMEM_HUGE_DENY		(-1)
556 #define SHMEM_HUGE_FORCE	(-2)
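/*
 * For example, an administrator might select these policies with
 * something like (illustrative):
 *
 *	mount -t tmpfs -o huge=within_size tmpfs /mnt/tmp
 *	echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *
 * where "deny" and "force" are accepted only through the sysfs knob,
 * as noted above.
 */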
557 
558 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
559 /* ifdef here to avoid bloating shmem.o when not necessary */
560 
561 static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
562 static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER;
563 
564 /**
565  * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
566  * @mapping: Target address_space.
567  * @index: The page index.
568  * @write_end: end of a write, could extend inode size.
569  *
570  * This returns huge orders for folios (when supported) based on the file size
571  * which the mapping currently allows at the given index. The index is relevant
572  * due to alignment considerations the mapping might have. The returned order
573  * may be less than the size passed.
574  *
575  * Return: The orders.
576  */
577 static inline unsigned int
578 shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
579 {
580 	unsigned int order;
581 	size_t size;
582 
583 	if (!mapping_large_folio_support(mapping) || !write_end)
584 		return 0;
585 
586 	/* Calculate the write size based on the write_end */
587 	size = write_end - (index << PAGE_SHIFT);
588 	order = filemap_get_order(size);
589 	if (!order)
590 		return 0;
591 
592 	/* If we're not aligned, allocate a smaller folio */
593 	if (index & ((1UL << order) - 1))
594 		order = __ffs(index);
595 
596 	order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
597 	return order > 0 ? BIT(order + 1) - 1 : 0;
598 }
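/*
 * Example for shmem_mapping_size_orders() above, with 4KiB pages: a write
 * ending 2MiB beyond an index aligned to 512 pages spans 512 pages, for
 * which filemap_get_order() gives order 9, so the function returns
 * BIT(10) - 1, i.e. a mask allowing every order from 0 up to 9 (subject to
 * the MAX_PAGECACHE_ORDER cap).  An unaligned index is instead limited to
 * __ffs(index), so the folio cannot overhang its natural alignment.
 */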
599 
600 static unsigned int shmem_get_orders_within_size(struct inode *inode,
601 		unsigned long within_size_orders, pgoff_t index,
602 		loff_t write_end)
603 {
604 	pgoff_t aligned_index;
605 	unsigned long order;
606 	loff_t i_size;
607 
608 	order = highest_order(within_size_orders);
609 	while (within_size_orders) {
610 		aligned_index = round_up(index + 1, 1 << order);
611 		i_size = max(write_end, i_size_read(inode));
612 		i_size = round_up(i_size, PAGE_SIZE);
613 		if (i_size >> PAGE_SHIFT >= aligned_index)
614 			return within_size_orders;
615 
616 		order = next_order(&within_size_orders, order);
617 	}
618 
619 	return 0;
620 }
621 
622 static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
623 					      loff_t write_end, bool shmem_huge_force,
624 					      struct vm_area_struct *vma,
625 					      vm_flags_t vm_flags)
626 {
627 	unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
628 		0 : BIT(HPAGE_PMD_ORDER);
629 	unsigned long within_size_orders;
630 
631 	if (!S_ISREG(inode->i_mode))
632 		return 0;
633 	if (shmem_huge == SHMEM_HUGE_DENY)
634 		return 0;
635 	if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
636 		return maybe_pmd_order;
637 
638 	/*
639 	 * The huge order allocation for anon shmem is controlled through
640 	 * the mTHP interface, so we still use PMD-sized huge order to
641 	 * check whether global control is enabled.
642 	 *
643 	 * For tmpfs mmap()'s huge order, we still use PMD-sized order to
644 	 * allocate huge pages due to lack of a write size hint.
645 	 *
646 	 * Otherwise, tmpfs will allow getting a highest order hint based on
647 	 * the size of write and fallocate paths, then will try each allowable
648 	 * huge orders.
649 	 */
650 	switch (SHMEM_SB(inode->i_sb)->huge) {
651 	case SHMEM_HUGE_ALWAYS:
652 		if (vma)
653 			return maybe_pmd_order;
654 
655 		return shmem_mapping_size_orders(inode->i_mapping, index, write_end);
656 	case SHMEM_HUGE_WITHIN_SIZE:
657 		if (vma)
658 			within_size_orders = maybe_pmd_order;
659 		else
660 			within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
661 								       index, write_end);
662 
663 		within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
664 								  index, write_end);
665 		if (within_size_orders > 0)
666 			return within_size_orders;
667 
668 		fallthrough;
669 	case SHMEM_HUGE_ADVISE:
670 		if (vm_flags & VM_HUGEPAGE)
671 			return maybe_pmd_order;
672 		fallthrough;
673 	default:
674 		return 0;
675 	}
676 }
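/*
 * Summary of the policy above: DENY always yields no huge orders; FORCE
 * (or an explicit shmem_huge_force) yields the PMD order when the page
 * cache supports it; ALWAYS uses the PMD order for mmap faults but a
 * write-size-based set of orders for the write/fallocate paths;
 * WITHIN_SIZE additionally filters those orders against i_size; and
 * ADVISE grants the PMD order only when the vma is VM_HUGEPAGE.
 */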
677 
678 static int shmem_parse_huge(const char *str)
679 {
680 	int huge;
681 
682 	if (!str)
683 		return -EINVAL;
684 
685 	if (!strcmp(str, "never"))
686 		huge = SHMEM_HUGE_NEVER;
687 	else if (!strcmp(str, "always"))
688 		huge = SHMEM_HUGE_ALWAYS;
689 	else if (!strcmp(str, "within_size"))
690 		huge = SHMEM_HUGE_WITHIN_SIZE;
691 	else if (!strcmp(str, "advise"))
692 		huge = SHMEM_HUGE_ADVISE;
693 	else if (!strcmp(str, "deny"))
694 		huge = SHMEM_HUGE_DENY;
695 	else if (!strcmp(str, "force"))
696 		huge = SHMEM_HUGE_FORCE;
697 	else
698 		return -EINVAL;
699 
700 	if (!has_transparent_hugepage() &&
701 	    huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
702 		return -EINVAL;
703 
704 	/* Do not override huge allocation policy with non-PMD sized mTHP */
705 	if (huge == SHMEM_HUGE_FORCE &&
706 	    huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
707 		return -EINVAL;
708 
709 	return huge;
710 }
711 
712 #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
713 static const char *shmem_format_huge(int huge)
714 {
715 	switch (huge) {
716 	case SHMEM_HUGE_NEVER:
717 		return "never";
718 	case SHMEM_HUGE_ALWAYS:
719 		return "always";
720 	case SHMEM_HUGE_WITHIN_SIZE:
721 		return "within_size";
722 	case SHMEM_HUGE_ADVISE:
723 		return "advise";
724 	case SHMEM_HUGE_DENY:
725 		return "deny";
726 	case SHMEM_HUGE_FORCE:
727 		return "force";
728 	default:
729 		VM_BUG_ON(1);
730 		return "bad_val";
731 	}
732 }
733 #endif
734 
735 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
736 		struct shrink_control *sc, unsigned long nr_to_free)
737 {
738 	LIST_HEAD(list), *pos, *next;
739 	struct inode *inode;
740 	struct shmem_inode_info *info;
741 	struct folio *folio;
742 	unsigned long batch = sc ? sc->nr_to_scan : 128;
743 	unsigned long split = 0, freed = 0;
744 
745 	if (list_empty(&sbinfo->shrinklist))
746 		return SHRINK_STOP;
747 
748 	spin_lock(&sbinfo->shrinklist_lock);
749 	list_for_each_safe(pos, next, &sbinfo->shrinklist) {
750 		info = list_entry(pos, struct shmem_inode_info, shrinklist);
751 
752 		/* pin the inode */
753 		inode = igrab(&info->vfs_inode);
754 
755 		/* inode is about to be evicted */
756 		if (!inode) {
757 			list_del_init(&info->shrinklist);
758 			goto next;
759 		}
760 
761 		list_move(&info->shrinklist, &list);
762 next:
763 		sbinfo->shrinklist_len--;
764 		if (!--batch)
765 			break;
766 	}
767 	spin_unlock(&sbinfo->shrinklist_lock);
768 
769 	list_for_each_safe(pos, next, &list) {
770 		pgoff_t next, end;
771 		loff_t i_size;
772 		int ret;
773 
774 		info = list_entry(pos, struct shmem_inode_info, shrinklist);
775 		inode = &info->vfs_inode;
776 
777 		if (nr_to_free && freed >= nr_to_free)
778 			goto move_back;
779 
780 		i_size = i_size_read(inode);
781 		folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE);
782 		if (!folio || xa_is_value(folio))
783 			goto drop;
784 
785 		/* No large folio at the end of the file: nothing to split */
786 		if (!folio_test_large(folio)) {
787 			folio_put(folio);
788 			goto drop;
789 		}
790 
791 		/* Check if there is anything to gain from splitting */
792 		next = folio_next_index(folio);
793 		end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
794 		if (end <= folio->index || end >= next) {
795 			folio_put(folio);
796 			goto drop;
797 		}
798 
799 		/*
800 		 * Move the inode on the list back to shrinklist if we failed
801 		 * to lock the page at this time.
802 		 *
803 		 * Waiting for the lock may lead to deadlock in the
804 		 * reclaim path.
805 		 */
806 		if (!folio_trylock(folio)) {
807 			folio_put(folio);
808 			goto move_back;
809 		}
810 
811 		ret = split_folio(folio);
812 		folio_unlock(folio);
813 		folio_put(folio);
814 
815 		/* If split failed move the inode on the list back to shrinklist */
816 		if (ret)
817 			goto move_back;
818 
819 		freed += next - end;
820 		split++;
821 drop:
822 		list_del_init(&info->shrinklist);
823 		goto put;
824 move_back:
825 		/*
826 		 * Make sure the inode is either on the global list or deleted
827 		 * from any local list before iput() since it could be deleted
828 		 * in another thread once we put the inode (then the local list
829 		 * is corrupted).
830 		 */
831 		spin_lock(&sbinfo->shrinklist_lock);
832 		list_move(&info->shrinklist, &sbinfo->shrinklist);
833 		sbinfo->shrinklist_len++;
834 		spin_unlock(&sbinfo->shrinklist_lock);
835 put:
836 		iput(inode);
837 	}
838 
839 	return split;
840 }
841 
842 static long shmem_unused_huge_scan(struct super_block *sb,
843 		struct shrink_control *sc)
844 {
845 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
846 
847 	if (!READ_ONCE(sbinfo->shrinklist_len))
848 		return SHRINK_STOP;
849 
850 	return shmem_unused_huge_shrink(sbinfo, sc, 0);
851 }
852 
853 static long shmem_unused_huge_count(struct super_block *sb,
854 		struct shrink_control *sc)
855 {
856 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
857 	return READ_ONCE(sbinfo->shrinklist_len);
858 }
859 #else /* !CONFIG_TRANSPARENT_HUGEPAGE */
860 
861 #define shmem_huge SHMEM_HUGE_DENY
862 
863 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
864 		struct shrink_control *sc, unsigned long nr_to_free)
865 {
866 	return 0;
867 }
868 
869 static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
870 					      loff_t write_end, bool shmem_huge_force,
871 					      struct vm_area_struct *vma,
872 					      vm_flags_t vm_flags)
873 {
874 	return 0;
875 }
876 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
877 
878 static void shmem_update_stats(struct folio *folio, int nr_pages)
879 {
880 	if (folio_test_pmd_mappable(folio))
881 		__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
882 	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
883 	__lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
884 }
885 
886 /*
887  * Somewhat like filemap_add_folio, but error if expected item has gone.
888  */
889 static int shmem_add_to_page_cache(struct folio *folio,
890 				   struct address_space *mapping,
891 				   pgoff_t index, void *expected, gfp_t gfp)
892 {
893 	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
894 	long nr = folio_nr_pages(folio);
895 
896 	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
897 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
898 	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
899 
900 	folio_ref_add(folio, nr);
901 	folio->mapping = mapping;
902 	folio->index = index;
903 
904 	gfp &= GFP_RECLAIM_MASK;
905 	folio_throttle_swaprate(folio, gfp);
906 
907 	do {
908 		xas_lock_irq(&xas);
909 		if (expected != xas_find_conflict(&xas)) {
910 			xas_set_err(&xas, -EEXIST);
911 			goto unlock;
912 		}
913 		if (expected && xas_find_conflict(&xas)) {
914 			xas_set_err(&xas, -EEXIST);
915 			goto unlock;
916 		}
917 		xas_store(&xas, folio);
918 		if (xas_error(&xas))
919 			goto unlock;
920 		shmem_update_stats(folio, nr);
921 		mapping->nrpages += nr;
922 unlock:
923 		xas_unlock_irq(&xas);
924 	} while (xas_nomem(&xas, gfp));
925 
926 	if (xas_error(&xas)) {
927 		folio->mapping = NULL;
928 		folio_ref_sub(folio, nr);
929 		return xas_error(&xas);
930 	}
931 
932 	return 0;
933 }
934 
935 /*
936  * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
937  */
938 static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
939 {
940 	struct address_space *mapping = folio->mapping;
941 	long nr = folio_nr_pages(folio);
942 	int error;
943 
944 	xa_lock_irq(&mapping->i_pages);
945 	error = shmem_replace_entry(mapping, folio->index, folio, radswap);
946 	folio->mapping = NULL;
947 	mapping->nrpages -= nr;
948 	shmem_update_stats(folio, -nr);
949 	xa_unlock_irq(&mapping->i_pages);
950 	folio_put_refs(folio, nr);
951 	BUG_ON(error);
952 }
953 
954 /*
955  * Remove swap entry from page cache, free the swap and its page cache. Returns
956  * the number of pages being freed. 0 means entry not found in XArray (0 pages
957  * being freed).
958  */
959 static long shmem_free_swap(struct address_space *mapping,
960 			    pgoff_t index, void *radswap)
961 {
962 	int order = xa_get_order(&mapping->i_pages, index);
963 	void *old;
964 
965 	old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
966 	if (old != radswap)
967 		return 0;
968 	free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
969 
970 	return 1 << order;
971 }
972 
973 /*
974  * Determine (in bytes) how many of the shmem object's pages mapped by the
975  * given offsets are swapped out.
976  *
977  * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
978  * as long as the inode doesn't go away and racy results are not a problem.
979  */
980 unsigned long shmem_partial_swap_usage(struct address_space *mapping,
981 						pgoff_t start, pgoff_t end)
982 {
983 	XA_STATE(xas, &mapping->i_pages, start);
984 	struct page *page;
985 	unsigned long swapped = 0;
986 	unsigned long max = end - 1;
987 
988 	rcu_read_lock();
989 	xas_for_each(&xas, page, max) {
990 		if (xas_retry(&xas, page))
991 			continue;
992 		if (xa_is_value(page))
993 			swapped += 1 << xas_get_order(&xas);
994 		if (xas.xa_index == max)
995 			break;
996 		if (need_resched()) {
997 			xas_pause(&xas);
998 			cond_resched_rcu();
999 		}
1000 	}
1001 	rcu_read_unlock();
1002 
1003 	return swapped << PAGE_SHIFT;
1004 }
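/*
 * Example: two order-2 swap entries within [start, end) count as
 * 2 * 4 = 8 swapped pages, so with 4KiB pages the function above
 * reports 8 << PAGE_SHIFT = 32KiB of swap for that range.
 */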
1005 
1006 /*
1007  * Determine (in bytes) how many of the shmem object's pages mapped by the
1008  * given vma is swapped out.
1009  *
1010  * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
1011  * as long as the inode doesn't go away and racy results are not a problem.
1012  */
1013 unsigned long shmem_swap_usage(struct vm_area_struct *vma)
1014 {
1015 	struct inode *inode = file_inode(vma->vm_file);
1016 	struct shmem_inode_info *info = SHMEM_I(inode);
1017 	struct address_space *mapping = inode->i_mapping;
1018 	unsigned long swapped;
1019 
1020 	/* Be careful as we don't hold info->lock */
1021 	swapped = READ_ONCE(info->swapped);
1022 
1023 	/*
1024 	 * The easier cases are when the shmem object has nothing in swap, or
1025 	 * the vma maps it whole. Then we can simply use the stats that we
1026 	 * already track.
1027 	 */
1028 	if (!swapped)
1029 		return 0;
1030 
1031 	if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
1032 		return swapped << PAGE_SHIFT;
1033 
1034 	/* Here comes the more involved part */
1035 	return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
1036 					vma->vm_pgoff + vma_pages(vma));
1037 }
1038 
1039 /*
1040  * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
1041  */
1042 void shmem_unlock_mapping(struct address_space *mapping)
1043 {
1044 	struct folio_batch fbatch;
1045 	pgoff_t index = 0;
1046 
1047 	folio_batch_init(&fbatch);
1048 	/*
1049 	 * Minor point, but we might as well stop if someone else SHM_LOCKs it.
1050 	 */
1051 	while (!mapping_unevictable(mapping) &&
1052 	       filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
1053 		check_move_unevictable_folios(&fbatch);
1054 		folio_batch_release(&fbatch);
1055 		cond_resched();
1056 	}
1057 }
1058 
1059 static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
1060 {
1061 	struct folio *folio;
1062 
1063 	/*
1064 	 * At first avoid shmem_get_folio(,,,SGP_READ): that fails
1065 	 * beyond i_size, and reports fallocated folios as holes.
1066 	 */
1067 	folio = filemap_get_entry(inode->i_mapping, index);
1068 	if (!folio)
1069 		return folio;
1070 	if (!xa_is_value(folio)) {
1071 		folio_lock(folio);
1072 		if (folio->mapping == inode->i_mapping)
1073 			return folio;
1074 		/* The folio has been swapped out */
1075 		folio_unlock(folio);
1076 		folio_put(folio);
1077 	}
1078 	/*
1079 	 * But read a folio back from swap if any of it is within i_size
1080 	 * (although in some cases this is just a waste of time).
1081 	 */
1082 	folio = NULL;
1083 	shmem_get_folio(inode, index, 0, &folio, SGP_READ);
1084 	return folio;
1085 }
1086 
1087 /*
1088  * Remove range of pages and swap entries from page cache, and free them.
1089  * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
1090  */
1091 static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
1092 								 bool unfalloc)
1093 {
1094 	struct address_space *mapping = inode->i_mapping;
1095 	struct shmem_inode_info *info = SHMEM_I(inode);
1096 	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
1097 	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
1098 	struct folio_batch fbatch;
1099 	pgoff_t indices[PAGEVEC_SIZE];
1100 	struct folio *folio;
1101 	bool same_folio;
1102 	long nr_swaps_freed = 0;
1103 	pgoff_t index;
1104 	int i;
1105 
1106 	if (lend == -1)
1107 		end = -1;	/* unsigned, so actually very big */
1108 
1109 	if (info->fallocend > start && info->fallocend <= end && !unfalloc)
1110 		info->fallocend = start;
1111 
1112 	folio_batch_init(&fbatch);
1113 	index = start;
1114 	while (index < end && find_lock_entries(mapping, &index, end - 1,
1115 			&fbatch, indices)) {
1116 		for (i = 0; i < folio_batch_count(&fbatch); i++) {
1117 			folio = fbatch.folios[i];
1118 
1119 			if (xa_is_value(folio)) {
1120 				if (unfalloc)
1121 					continue;
1122 				nr_swaps_freed += shmem_free_swap(mapping,
1123 							indices[i], folio);
1124 				continue;
1125 			}
1126 
1127 			if (!unfalloc || !folio_test_uptodate(folio))
1128 				truncate_inode_folio(mapping, folio);
1129 			folio_unlock(folio);
1130 		}
1131 		folio_batch_remove_exceptionals(&fbatch);
1132 		folio_batch_release(&fbatch);
1133 		cond_resched();
1134 	}
1135 
1136 	/*
1137 	 * When undoing a failed fallocate, we want none of the partial folio
1138 	 * zeroing and splitting below, but shall want to truncate the whole
1139 	 * folio when !uptodate indicates that it was added by this fallocate,
1140 	 * even when [lstart, lend] covers only a part of the folio.
1141 	 */
1142 	if (unfalloc)
1143 		goto whole_folios;
1144 
1145 	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
1146 	folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
1147 	if (folio) {
1148 		same_folio = lend < folio_pos(folio) + folio_size(folio);
1149 		folio_mark_dirty(folio);
1150 		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
1151 			start = folio_next_index(folio);
1152 			if (same_folio)
1153 				end = folio->index;
1154 		}
1155 		folio_unlock(folio);
1156 		folio_put(folio);
1157 		folio = NULL;
1158 	}
1159 
1160 	if (!same_folio)
1161 		folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
1162 	if (folio) {
1163 		folio_mark_dirty(folio);
1164 		if (!truncate_inode_partial_folio(folio, lstart, lend))
1165 			end = folio->index;
1166 		folio_unlock(folio);
1167 		folio_put(folio);
1168 	}
1169 
1170 whole_folios:
1171 
1172 	index = start;
1173 	while (index < end) {
1174 		cond_resched();
1175 
1176 		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
1177 				indices)) {
1178 			/* If all gone or hole-punch or unfalloc, we're done */
1179 			if (index == start || end != -1)
1180 				break;
1181 			/* But if truncating, restart to make sure all gone */
1182 			index = start;
1183 			continue;
1184 		}
1185 		for (i = 0; i < folio_batch_count(&fbatch); i++) {
1186 			folio = fbatch.folios[i];
1187 
1188 			if (xa_is_value(folio)) {
1189 				long swaps_freed;
1190 
1191 				if (unfalloc)
1192 					continue;
1193 				swaps_freed = shmem_free_swap(mapping, indices[i], folio);
1194 				if (!swaps_freed) {
1195 					/* Swap was replaced by page: retry */
1196 					index = indices[i];
1197 					break;
1198 				}
1199 				nr_swaps_freed += swaps_freed;
1200 				continue;
1201 			}
1202 
1203 			folio_lock(folio);
1204 
1205 			if (!unfalloc || !folio_test_uptodate(folio)) {
1206 				if (folio_mapping(folio) != mapping) {
1207 					/* Page was replaced by swap: retry */
1208 					folio_unlock(folio);
1209 					index = indices[i];
1210 					break;
1211 				}
1212 				VM_BUG_ON_FOLIO(folio_test_writeback(folio),
1213 						folio);
1214 
1215 				if (!folio_test_large(folio)) {
1216 					truncate_inode_folio(mapping, folio);
1217 				} else if (truncate_inode_partial_folio(folio, lstart, lend)) {
1218 					/*
1219 					 * If we split a page, reset the loop so
1220 					 * that we pick up the new sub pages.
1221 					 * Otherwise the THP was entirely
1222 					 * dropped or the target range was
1223 					 * zeroed, so just continue the loop as
1224 					 * is.
1225 					 */
1226 					if (!folio_test_large(folio)) {
1227 						folio_unlock(folio);
1228 						index = start;
1229 						break;
1230 					}
1231 				}
1232 			}
1233 			folio_unlock(folio);
1234 		}
1235 		folio_batch_remove_exceptionals(&fbatch);
1236 		folio_batch_release(&fbatch);
1237 	}
1238 
1239 	shmem_recalc_inode(inode, 0, -nr_swaps_freed);
1240 }
1241 
1242 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
1243 {
1244 	shmem_undo_range(inode, lstart, lend, false);
1245 	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
1246 	inode_inc_iversion(inode);
1247 }
1248 EXPORT_SYMBOL_GPL(shmem_truncate_range);
1249 
1250 static int shmem_getattr(struct mnt_idmap *idmap,
1251 			 const struct path *path, struct kstat *stat,
1252 			 u32 request_mask, unsigned int query_flags)
1253 {
1254 	struct inode *inode = path->dentry->d_inode;
1255 	struct shmem_inode_info *info = SHMEM_I(inode);
1256 
1257 	if (info->alloced - info->swapped != inode->i_mapping->nrpages)
1258 		shmem_recalc_inode(inode, 0, 0);
1259 
1260 	if (info->fsflags & FS_APPEND_FL)
1261 		stat->attributes |= STATX_ATTR_APPEND;
1262 	if (info->fsflags & FS_IMMUTABLE_FL)
1263 		stat->attributes |= STATX_ATTR_IMMUTABLE;
1264 	if (info->fsflags & FS_NODUMP_FL)
1265 		stat->attributes |= STATX_ATTR_NODUMP;
1266 	stat->attributes_mask |= (STATX_ATTR_APPEND |
1267 			STATX_ATTR_IMMUTABLE |
1268 			STATX_ATTR_NODUMP);
1269 	generic_fillattr(idmap, request_mask, inode, stat);
1270 
1271 	if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
1272 		stat->blksize = HPAGE_PMD_SIZE;
1273 
1274 	if (request_mask & STATX_BTIME) {
1275 		stat->result_mask |= STATX_BTIME;
1276 		stat->btime.tv_sec = info->i_crtime.tv_sec;
1277 		stat->btime.tv_nsec = info->i_crtime.tv_nsec;
1278 	}
1279 
1280 	return 0;
1281 }
1282 
1283 static int shmem_setattr(struct mnt_idmap *idmap,
1284 			 struct dentry *dentry, struct iattr *attr)
1285 {
1286 	struct inode *inode = d_inode(dentry);
1287 	struct shmem_inode_info *info = SHMEM_I(inode);
1288 	int error;
1289 	bool update_mtime = false;
1290 	bool update_ctime = true;
1291 
1292 	error = setattr_prepare(idmap, dentry, attr);
1293 	if (error)
1294 		return error;
1295 
1296 	if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
1297 		if ((inode->i_mode ^ attr->ia_mode) & 0111) {
1298 			return -EPERM;
1299 		}
1300 	}
1301 
1302 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
1303 		loff_t oldsize = inode->i_size;
1304 		loff_t newsize = attr->ia_size;
1305 
1306 		/* protected by i_rwsem */
1307 		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
1308 		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
1309 			return -EPERM;
1310 
1311 		if (newsize != oldsize) {
1312 			error = shmem_reacct_size(SHMEM_I(inode)->flags,
1313 					oldsize, newsize);
1314 			if (error)
1315 				return error;
1316 			i_size_write(inode, newsize);
1317 			update_mtime = true;
1318 		} else {
1319 			update_ctime = false;
1320 		}
1321 		if (newsize <= oldsize) {
1322 			loff_t holebegin = round_up(newsize, PAGE_SIZE);
1323 			if (oldsize > holebegin)
1324 				unmap_mapping_range(inode->i_mapping,
1325 							holebegin, 0, 1);
1326 			if (info->alloced)
1327 				shmem_truncate_range(inode,
1328 							newsize, (loff_t)-1);
1329 			/* unmap again to remove racily COWed private pages */
1330 			if (oldsize > holebegin)
1331 				unmap_mapping_range(inode->i_mapping,
1332 							holebegin, 0, 1);
1333 		}
1334 	}
1335 
1336 	if (is_quota_modification(idmap, inode, attr)) {
1337 		error = dquot_initialize(inode);
1338 		if (error)
1339 			return error;
1340 	}
1341 
1342 	/* Transfer quota accounting */
1343 	if (i_uid_needs_update(idmap, attr, inode) ||
1344 	    i_gid_needs_update(idmap, attr, inode)) {
1345 		error = dquot_transfer(idmap, inode, attr);
1346 		if (error)
1347 			return error;
1348 	}
1349 
1350 	setattr_copy(idmap, inode, attr);
1351 	if (attr->ia_valid & ATTR_MODE)
1352 		error = posix_acl_chmod(idmap, dentry, inode->i_mode);
1353 	if (!error && update_ctime) {
1354 		inode_set_ctime_current(inode);
1355 		if (update_mtime)
1356 			inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
1357 		inode_inc_iversion(inode);
1358 	}
1359 	return error;
1360 }
1361 
1362 static void shmem_evict_inode(struct inode *inode)
1363 {
1364 	struct shmem_inode_info *info = SHMEM_I(inode);
1365 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1366 	size_t freed = 0;
1367 
1368 	if (shmem_mapping(inode->i_mapping)) {
1369 		shmem_unacct_size(info->flags, inode->i_size);
1370 		inode->i_size = 0;
1371 		mapping_set_exiting(inode->i_mapping);
1372 		shmem_truncate_range(inode, 0, (loff_t)-1);
1373 		if (!list_empty(&info->shrinklist)) {
1374 			spin_lock(&sbinfo->shrinklist_lock);
1375 			if (!list_empty(&info->shrinklist)) {
1376 				list_del_init(&info->shrinklist);
1377 				sbinfo->shrinklist_len--;
1378 			}
1379 			spin_unlock(&sbinfo->shrinklist_lock);
1380 		}
1381 		while (!list_empty(&info->swaplist)) {
1382 			/* Wait while shmem_unuse() is scanning this inode... */
1383 			wait_var_event(&info->stop_eviction,
1384 				       !atomic_read(&info->stop_eviction));
1385 			spin_lock(&shmem_swaplist_lock);
1386 			/* ...but beware of the race if we peeked too early */
1387 			if (!atomic_read(&info->stop_eviction))
1388 				list_del_init(&info->swaplist);
1389 			spin_unlock(&shmem_swaplist_lock);
1390 		}
1391 	}
1392 
1393 	simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
1394 	shmem_free_inode(inode->i_sb, freed);
1395 	WARN_ON(inode->i_blocks);
1396 	clear_inode(inode);
1397 #ifdef CONFIG_TMPFS_QUOTA
1398 	dquot_free_inode(inode);
1399 	dquot_drop(inode);
1400 #endif
1401 }
1402 
1403 static unsigned int shmem_find_swap_entries(struct address_space *mapping,
1404 				pgoff_t start, struct folio_batch *fbatch,
1405 				pgoff_t *indices, unsigned int type)
1406 {
1407 	XA_STATE(xas, &mapping->i_pages, start);
1408 	struct folio *folio;
1409 	swp_entry_t entry;
1410 
1411 	rcu_read_lock();
1412 	xas_for_each(&xas, folio, ULONG_MAX) {
1413 		if (xas_retry(&xas, folio))
1414 			continue;
1415 
1416 		if (!xa_is_value(folio))
1417 			continue;
1418 
1419 		entry = radix_to_swp_entry(folio);
1420 		/*
1421 		 * swapin error entries can be found in the mapping. But they're
1422 		 * deliberately ignored here as we've done everything we can do.
1423 		 */
1424 		if (swp_type(entry) != type)
1425 			continue;
1426 
1427 		indices[folio_batch_count(fbatch)] = xas.xa_index;
1428 		if (!folio_batch_add(fbatch, folio))
1429 			break;
1430 
1431 		if (need_resched()) {
1432 			xas_pause(&xas);
1433 			cond_resched_rcu();
1434 		}
1435 	}
1436 	rcu_read_unlock();
1437 
1438 	return folio_batch_count(fbatch);
1439 }
1440 
1441 /*
1442  * Move the swapped pages for an inode to page cache. Returns the count
1443  * of pages swapped in, or the error in case of failure.
1444  */
1445 static int shmem_unuse_swap_entries(struct inode *inode,
1446 		struct folio_batch *fbatch, pgoff_t *indices)
1447 {
1448 	int i = 0;
1449 	int ret = 0;
1450 	int error = 0;
1451 	struct address_space *mapping = inode->i_mapping;
1452 
1453 	for (i = 0; i < folio_batch_count(fbatch); i++) {
1454 		struct folio *folio = fbatch->folios[i];
1455 
1456 		error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
1457 					mapping_gfp_mask(mapping), NULL, NULL);
1458 		if (error == 0) {
1459 			folio_unlock(folio);
1460 			folio_put(folio);
1461 			ret++;
1462 		}
1463 		if (error == -ENOMEM)
1464 			break;
1465 		error = 0;
1466 	}
1467 	return error ? error : ret;
1468 }
1469 
1470 /*
1471  * If swap found in inode, free it and move page from swapcache to filecache.
1472  */
1473 static int shmem_unuse_inode(struct inode *inode, unsigned int type)
1474 {
1475 	struct address_space *mapping = inode->i_mapping;
1476 	pgoff_t start = 0;
1477 	struct folio_batch fbatch;
1478 	pgoff_t indices[PAGEVEC_SIZE];
1479 	int ret = 0;
1480 
1481 	do {
1482 		folio_batch_init(&fbatch);
1483 		if (!shmem_find_swap_entries(mapping, start, &fbatch,
1484 					     indices, type)) {
1485 			ret = 0;
1486 			break;
1487 		}
1488 
1489 		ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
1490 		if (ret < 0)
1491 			break;
1492 
1493 		start = indices[folio_batch_count(&fbatch) - 1];
1494 	} while (true);
1495 
1496 	return ret;
1497 }
1498 
1499 /*
1500  * Read all the shared memory data that resides in the swap
1501  * device 'type' back into memory, so the swap device can be
1502  * unused.
1503  */
1504 int shmem_unuse(unsigned int type)
1505 {
1506 	struct shmem_inode_info *info, *next;
1507 	int error = 0;
1508 
1509 	if (list_empty(&shmem_swaplist))
1510 		return 0;
1511 
1512 	spin_lock(&shmem_swaplist_lock);
1513 start_over:
1514 	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
1515 		if (!info->swapped) {
1516 			list_del_init(&info->swaplist);
1517 			continue;
1518 		}
1519 		/*
1520 		 * Drop the swaplist lock while searching the inode for swap;
1521 		 * but before doing so, make sure shmem_evict_inode() will not
1522 		 * remove placeholder inode from swaplist, nor let it be freed
1523 		 * (igrab() would protect from unlink, but not from unmount).
1524 		 */
1525 		atomic_inc(&info->stop_eviction);
1526 		spin_unlock(&shmem_swaplist_lock);
1527 
1528 		error = shmem_unuse_inode(&info->vfs_inode, type);
1529 		cond_resched();
1530 
1531 		spin_lock(&shmem_swaplist_lock);
1532 		if (atomic_dec_and_test(&info->stop_eviction))
1533 			wake_up_var(&info->stop_eviction);
1534 		if (error)
1535 			break;
1536 		if (list_empty(&info->swaplist))
1537 			goto start_over;
1538 		next = list_next_entry(info, swaplist);
1539 		if (!info->swapped)
1540 			list_del_init(&info->swaplist);
1541 	}
1542 	spin_unlock(&shmem_swaplist_lock);
1543 
1544 	return error;
1545 }
1546 
1547 /**
1548  * shmem_writeout - Write the folio to swap
1549  * @folio: The folio to write
1550  * @plug: swap plug
1551  * @folio_list: list to put back folios on split
1552  *
1553  * Move the folio from the page cache to the swap cache.
1554  */
1555 int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
1556 		struct list_head *folio_list)
1557 {
1558 	struct address_space *mapping = folio->mapping;
1559 	struct inode *inode = mapping->host;
1560 	struct shmem_inode_info *info = SHMEM_I(inode);
1561 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1562 	pgoff_t index;
1563 	int nr_pages;
1564 	bool split = false;
1565 
1566 	if ((info->flags & VM_LOCKED) || sbinfo->noswap)
1567 		goto redirty;
1568 
1569 	if (!total_swap_pages)
1570 		goto redirty;
1571 
1572 	/*
1573 	 * If CONFIG_THP_SWAP is not enabled, the large folio should be
1574 	 * split when swapping.
1575 	 *
1576 	 * And shrinkage of pages beyond i_size does not split swap, so
1577 	 * swapout of a large folio crossing i_size needs to split too
1578 	 * (unless fallocate has been used to preallocate beyond EOF).
1579 	 */
1580 	if (folio_test_large(folio)) {
1581 		index = shmem_fallocend(inode,
1582 			DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
1583 		if ((index > folio->index && index < folio_next_index(folio)) ||
1584 		    !IS_ENABLED(CONFIG_THP_SWAP))
1585 			split = true;
1586 	}
1587 
1588 	if (split) {
1589 try_split:
1590 		/* Ensure the subpages are still dirty */
1591 		folio_test_set_dirty(folio);
1592 		if (split_folio_to_list(folio, folio_list))
1593 			goto redirty;
1594 		folio_clear_dirty(folio);
1595 	}
1596 
1597 	index = folio->index;
1598 	nr_pages = folio_nr_pages(folio);
1599 
1600 	/*
1601 	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
1602 	 * value into swapfile.c, the only way we can correctly account for a
1603 	 * fallocated folio arriving here is now to initialize it and write it.
1604 	 *
1605 	 * That's okay for a folio already fallocated earlier, but if we have
1606 	 * not yet completed the fallocation, then (a) we want to keep track
1607 	 * of this folio in case we have to undo it, and (b) it may not be a
1608 	 * good idea to continue anyway, once we're pushing into swap.  So
1609 	 * reactivate the folio, and let shmem_fallocate() quit when too many.
1610 	 */
1611 	if (!folio_test_uptodate(folio)) {
1612 		if (inode->i_private) {
1613 			struct shmem_falloc *shmem_falloc;
1614 			spin_lock(&inode->i_lock);
1615 			shmem_falloc = inode->i_private;
1616 			if (shmem_falloc &&
1617 			    !shmem_falloc->waitq &&
1618 			    index >= shmem_falloc->start &&
1619 			    index < shmem_falloc->next)
1620 				shmem_falloc->nr_unswapped += nr_pages;
1621 			else
1622 				shmem_falloc = NULL;
1623 			spin_unlock(&inode->i_lock);
1624 			if (shmem_falloc)
1625 				goto redirty;
1626 		}
1627 		folio_zero_range(folio, 0, folio_size(folio));
1628 		flush_dcache_folio(folio);
1629 		folio_mark_uptodate(folio);
1630 	}
1631 
1632 	if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
1633 		bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
1634 		int error;
1635 
1636 		/*
1637 		 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1638 		 * if it's not already there.  Do it now before the folio is
1639 		 * removed from page cache, when its pagelock no longer
1640 		 * protects the inode from eviction.  And do it now, after
1641 		 * we've incremented swapped, because shmem_unuse() will
1642 		 * prune a !swapped inode from the swaplist.
1643 		 */
1644 		if (first_swapped) {
1645 			spin_lock(&shmem_swaplist_lock);
1646 			if (list_empty(&info->swaplist))
1647 				list_add(&info->swaplist, &shmem_swaplist);
1648 			spin_unlock(&shmem_swaplist_lock);
1649 		}
1650 
1651 		swap_shmem_alloc(folio->swap, nr_pages);
1652 		shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
1653 
1654 		BUG_ON(folio_mapped(folio));
1655 		error = swap_writeout(folio, plug);
1656 		if (error != AOP_WRITEPAGE_ACTIVATE) {
1657 			/* folio has been unlocked */
1658 			return error;
1659 		}
1660 
1661 		/*
1662 		 * The intention here is to avoid holding on to the swap when
1663 		 * zswap was unable to compress and unable to writeback; but
1664 		 * it will be appropriate if other reactivate cases are added.
1665 		 */
1666 		error = shmem_add_to_page_cache(folio, mapping, index,
1667 				swp_to_radix_entry(folio->swap),
1668 				__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
1669 		/* Swap entry might be erased by racing shmem_free_swap() */
1670 		if (!error) {
1671 			shmem_recalc_inode(inode, 0, -nr_pages);
1672 			swap_free_nr(folio->swap, nr_pages);
1673 		}
1674 
1675 		/*
1676 		 * The delete_from_swap_cache() below could be left for
1677 		 * shrink_folio_list()'s folio_free_swap() to dispose of;
1678 		 * but I'm a little nervous about letting this folio out of
1679 		 * shmem_writeout() in a hybrid half-tmpfs-half-swap state
1680 		 * e.g. folio_mapping(folio) might give an unexpected answer.
1681 		 */
1682 		delete_from_swap_cache(folio);
1683 		goto redirty;
1684 	}
1685 	if (nr_pages > 1)
1686 		goto try_split;
1687 redirty:
1688 	folio_mark_dirty(folio);
1689 	return AOP_WRITEPAGE_ACTIVATE;	/* Return with folio locked */
1690 }
1691 EXPORT_SYMBOL_GPL(shmem_writeout);
1692 
1693 #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
1694 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1695 {
1696 	char buffer[64];
1697 
1698 	if (!mpol || mpol->mode == MPOL_DEFAULT)
1699 		return;		/* show nothing */
1700 
1701 	mpol_to_str(buffer, sizeof(buffer), mpol);
1702 
1703 	seq_printf(seq, ",mpol=%s", buffer);
1704 }
1705 
1706 static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1707 {
1708 	struct mempolicy *mpol = NULL;
1709 	if (sbinfo->mpol) {
1710 		raw_spin_lock(&sbinfo->stat_lock);	/* prevent replace/use races */
1711 		mpol = sbinfo->mpol;
1712 		mpol_get(mpol);
1713 		raw_spin_unlock(&sbinfo->stat_lock);
1714 	}
1715 	return mpol;
1716 }
1717 #else /* !CONFIG_NUMA || !CONFIG_TMPFS */
1718 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1719 {
1720 }
1721 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1722 {
1723 	return NULL;
1724 }
1725 #endif /* CONFIG_NUMA && CONFIG_TMPFS */
1726 
1727 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
1728 			pgoff_t index, unsigned int order, pgoff_t *ilx);
1729 
1730 static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
1731 			struct shmem_inode_info *info, pgoff_t index)
1732 {
1733 	struct mempolicy *mpol;
1734 	pgoff_t ilx;
1735 	struct folio *folio;
1736 
1737 	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
1738 	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
1739 	mpol_cond_put(mpol);
1740 
1741 	return folio;
1742 }
1743 
1744 /*
1745  * Make sure huge_gfp is always more limited than limit_gfp.
1746  * Some of the flags set permissions, while others set limitations.
1747  */
1748 static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
1749 {
1750 	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
1751 	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
1752 	gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
1753 	gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
1754 
1755 	/* Allow allocations only from the originally specified zones. */
1756 	result |= zoneflags;
1757 
1758 	/*
1759 	 * Minimize the result gfp by taking the union with the deny flags,
1760 	 * and the intersection of the allow flags.
1761 	 */
1762 	result |= (limit_gfp & denyflags);
1763 	result |= (huge_gfp & limit_gfp) & allowflags;
1764 
1765 	return result;
1766 }
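/*
 * Illustrative note (editorial sketch, not part of the original source):
 * given a huge_gfp from vma_thp_gfp_mask() and a caller limit such as
 * GFP_NOFS, the result takes its zone bits from the limit, keeps
 * __GFP_NOWARN/__GFP_NORETRY if either mask set them, and keeps
 * __GFP_IO/__GFP_FS/__GFP_RECLAIM only where both masks agree -- so a
 * GFP_NOFS limit guarantees __GFP_FS stays clear in the result.
 */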
1767 
1768 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1769 bool shmem_hpage_pmd_enabled(void)
1770 {
1771 	if (shmem_huge == SHMEM_HUGE_DENY)
1772 		return false;
1773 	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always))
1774 		return true;
1775 	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise))
1776 		return true;
1777 	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
1778 		return true;
1779 	if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
1780 	    shmem_huge != SHMEM_HUGE_NEVER)
1781 		return true;
1782 
1783 	return false;
1784 }
1785 
1786 unsigned long shmem_allowable_huge_orders(struct inode *inode,
1787 				struct vm_area_struct *vma, pgoff_t index,
1788 				loff_t write_end, bool shmem_huge_force)
1789 {
1790 	unsigned long mask = READ_ONCE(huge_shmem_orders_always);
1791 	unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
1792 	vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
1793 	unsigned int global_orders;
1794 
1795 	if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
1796 		return 0;
1797 
1798 	global_orders = shmem_huge_global_enabled(inode, index, write_end,
1799 						  shmem_huge_force, vma, vm_flags);
1800 	/* Tmpfs huge pages allocation */
1801 	if (!vma || !vma_is_anon_shmem(vma))
1802 		return global_orders;
1803 
1804 	/*
1805 	 * Following the 'deny' semantics of the top level, force the huge
1806 	 * option off for all mounts.
1807 	 */
1808 	if (shmem_huge == SHMEM_HUGE_DENY)
1809 		return 0;
1810 
1811 	/*
1812 	 * Only allow inherit orders if the top-level value is 'force', which
1813 	 * means non-PMD sized THP cannot override the 'huge' mount option now.
1814 	 */
1815 	if (shmem_huge == SHMEM_HUGE_FORCE)
1816 		return READ_ONCE(huge_shmem_orders_inherit);
1817 
1818 	/* Allow mTHP that will be fully within i_size. */
1819 	mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
1820 
1821 	if (vm_flags & VM_HUGEPAGE)
1822 		mask |= READ_ONCE(huge_shmem_orders_madvise);
1823 
1824 	if (global_orders > 0)
1825 		mask |= READ_ONCE(huge_shmem_orders_inherit);
1826 
1827 	return THP_ORDERS_ALL_FILE_DEFAULT & mask;
1828 }
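/*
 * Editorial note (not from the original source): the return value is a
 * bitmask of allowed folio orders -- a set bit at HPAGE_PMD_ORDER (9 with
 * 4K pages on x86_64) permits PMD-sized THP, lower set bits permit the
 * corresponding mTHP sizes, and 0 disables huge allocation for this fault.
 */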
1829 
1830 static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
1831 					   struct address_space *mapping, pgoff_t index,
1832 					   unsigned long orders)
1833 {
1834 	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
1835 	pgoff_t aligned_index;
1836 	unsigned long pages;
1837 	int order;
1838 
1839 	if (vma) {
1840 		orders = thp_vma_suitable_orders(vma, vmf->address, orders);
1841 		if (!orders)
1842 			return 0;
1843 	}
1844 
1845 	/* Find the highest order that can add into the page cache */
1846 	order = highest_order(orders);
1847 	while (orders) {
1848 		pages = 1UL << order;
1849 		aligned_index = round_down(index, pages);
1850 		/*
1851 		 * Check for conflict before waiting on a huge allocation.
1852 		 * Conflict might be that a huge page has just been allocated
1853 		 * and added to page cache by a racing thread, or that there
1854 		 * is already at least one small page in the huge extent.
1855 		 * Be careful to retry when appropriate, but not forever!
1856 		 * Elsewhere -EEXIST would be the right code, but not here.
1857 		 */
1858 		if (!xa_find(&mapping->i_pages, &aligned_index,
1859 			     aligned_index + pages - 1, XA_PRESENT))
1860 			break;
1861 		order = next_order(&orders, order);
1862 	}
1863 
1864 	return orders;
1865 }
1866 #else
1867 static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
1868 					   struct address_space *mapping, pgoff_t index,
1869 					   unsigned long orders)
1870 {
1871 	return 0;
1872 }
1873 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1874 
1875 static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
1876 		struct shmem_inode_info *info, pgoff_t index)
1877 {
1878 	struct mempolicy *mpol;
1879 	pgoff_t ilx;
1880 	struct folio *folio;
1881 
1882 	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
1883 	folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
1884 	mpol_cond_put(mpol);
1885 
1886 	return folio;
1887 }
1888 
1889 static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
1890 		gfp_t gfp, struct inode *inode, pgoff_t index,
1891 		struct mm_struct *fault_mm, unsigned long orders)
1892 {
1893 	struct address_space *mapping = inode->i_mapping;
1894 	struct shmem_inode_info *info = SHMEM_I(inode);
1895 	unsigned long suitable_orders = 0;
1896 	struct folio *folio = NULL;
1897 	long pages;
1898 	int error, order;
1899 
1900 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1901 		orders = 0;
1902 
1903 	if (orders > 0) {
1904 		suitable_orders = shmem_suitable_orders(inode, vmf,
1905 							mapping, index, orders);
1906 
1907 		order = highest_order(suitable_orders);
1908 		while (suitable_orders) {
1909 			pages = 1UL << order;
1910 			index = round_down(index, pages);
1911 			folio = shmem_alloc_folio(gfp, order, info, index);
1912 			if (folio)
1913 				goto allocated;
1914 
1915 			if (pages == HPAGE_PMD_NR)
1916 				count_vm_event(THP_FILE_FALLBACK);
1917 			count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
1918 			order = next_order(&suitable_orders, order);
1919 		}
1920 	} else {
1921 		pages = 1;
1922 		folio = shmem_alloc_folio(gfp, 0, info, index);
1923 	}
1924 	if (!folio)
1925 		return ERR_PTR(-ENOMEM);
1926 
1927 allocated:
1928 	__folio_set_locked(folio);
1929 	__folio_set_swapbacked(folio);
1930 
1931 	gfp &= GFP_RECLAIM_MASK;
1932 	error = mem_cgroup_charge(folio, fault_mm, gfp);
1933 	if (error) {
1934 		if (xa_find(&mapping->i_pages, &index,
1935 				index + pages - 1, XA_PRESENT)) {
1936 			error = -EEXIST;
1937 		} else if (pages > 1) {
1938 			if (pages == HPAGE_PMD_NR) {
1939 				count_vm_event(THP_FILE_FALLBACK);
1940 				count_vm_event(THP_FILE_FALLBACK_CHARGE);
1941 			}
1942 			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
1943 			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
1944 		}
1945 		goto unlock;
1946 	}
1947 
1948 	error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
1949 	if (error)
1950 		goto unlock;
1951 
1952 	error = shmem_inode_acct_blocks(inode, pages);
1953 	if (error) {
1954 		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1955 		long freed;
1956 		/*
1957 		 * Try to reclaim some space by splitting a few
1958 		 * large folios beyond i_size on the filesystem.
1959 		 */
1960 		shmem_unused_huge_shrink(sbinfo, NULL, pages);
1961 		/*
1962 		 * And do a shmem_recalc_inode() to account for freed pages:
1963 		 * except our folio is there in cache, so not quite balanced.
1964 		 */
1965 		spin_lock(&info->lock);
1966 		freed = pages + info->alloced - info->swapped -
1967 			READ_ONCE(mapping->nrpages);
1968 		if (freed > 0)
1969 			info->alloced -= freed;
1970 		spin_unlock(&info->lock);
1971 		if (freed > 0)
1972 			shmem_inode_unacct_blocks(inode, freed);
1973 		error = shmem_inode_acct_blocks(inode, pages);
1974 		if (error) {
1975 			filemap_remove_folio(folio);
1976 			goto unlock;
1977 		}
1978 	}
1979 
1980 	shmem_recalc_inode(inode, pages, 0);
1981 	folio_add_lru(folio);
1982 	return folio;
1983 
1984 unlock:
1985 	folio_unlock(folio);
1986 	folio_put(folio);
1987 	return ERR_PTR(error);
1988 }
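/*
 * Editorial summary (not from the original source): allocation above walks
 * from the highest suitable order down to order 0, counting THP/mTHP
 * fallback events on each miss; the successful folio is then charged to the
 * memcg, inserted into the page cache, and block-accounted, with a single
 * shrink-and-retry pass if block accounting initially fails.
 */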
1989 
1990 static struct folio *shmem_swap_alloc_folio(struct inode *inode,
1991 		struct vm_area_struct *vma, pgoff_t index,
1992 		swp_entry_t entry, int order, gfp_t gfp)
1993 {
1994 	struct shmem_inode_info *info = SHMEM_I(inode);
1995 	struct folio *new;
1996 	void *shadow;
1997 	int nr_pages;
1998 
1999 	/*
2000 	 * We have arrived here because our zones are constrained, so don't
2001 	 * limit chance of success with further cpuset and node constraints.
2002 	 */
2003 	gfp &= ~GFP_CONSTRAINT_MASK;
2004 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
2005 		gfp_t huge_gfp = vma_thp_gfp_mask(vma);
2006 
2007 		gfp = limit_gfp_mask(huge_gfp, gfp);
2008 	}
2009 
2010 	new = shmem_alloc_folio(gfp, order, info, index);
2011 	if (!new)
2012 		return ERR_PTR(-ENOMEM);
2013 
2014 	nr_pages = folio_nr_pages(new);
2015 	if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
2016 					   gfp, entry)) {
2017 		folio_put(new);
2018 		return ERR_PTR(-ENOMEM);
2019 	}
2020 
2021 	/*
2022 	 * Prevent parallel swapin from proceeding with the swap cache flag.
2023 	 *
2024 	 * Of course there is another possible concurrent scenario as well,
2025 	 * that is to say, the swap cache flag of a large folio has already
2026 	 * been set by swapcache_prepare(), while another thread may have
2027 	 * already split the large swap entry stored in the shmem mapping.
2028 	 * In this case, shmem_add_to_page_cache() will help identify the
2029 	 * concurrent swapin and return -EEXIST.
2030 	 */
2031 	if (swapcache_prepare(entry, nr_pages)) {
2032 		folio_put(new);
2033 		return ERR_PTR(-EEXIST);
2034 	}
2035 
2036 	__folio_set_locked(new);
2037 	__folio_set_swapbacked(new);
2038 	new->swap = entry;
2039 
2040 	memcg1_swapin(entry, nr_pages);
2041 	shadow = get_shadow_from_swap_cache(entry);
2042 	if (shadow)
2043 		workingset_refault(new, shadow);
2044 	folio_add_lru(new);
2045 	swap_read_folio(new, NULL);
2046 	return new;
2047 }
2048 
2049 /*
2050  * When a page is moved from swapcache to shmem filecache (either by the
2051  * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
2052  * shmem_unuse_inode()), it may have been read in earlier from swap, in
2053  * ignorance of the mapping it belongs to.  If that mapping has special
2054  * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
2055  * we may need to copy to a suitable page before moving to filecache.
2056  *
2057  * In a future release, this may well be extended to respect cpuset and
2058  * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
2059  * but for now it is a simple matter of zone.
2060  */
2061 static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
2062 {
2063 	return folio_zonenum(folio) > gfp_zone(gfp);
2064 }
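/*
 * Illustrative example (editorial, not from the original source): if the
 * mapping's gfp mask restricts allocations to ZONE_DMA32 (as for gma500 GEM
 * objects) but readahead placed the folio in ZONE_NORMAL, then
 * folio_zonenum(folio) > gfp_zone(gfp) and the folio must be copied before
 * being moved into the shmem page cache.
 */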
2065 
2066 static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
2067 				struct shmem_inode_info *info, pgoff_t index,
2068 				struct vm_area_struct *vma)
2069 {
2070 	struct folio *new, *old = *foliop;
2071 	swp_entry_t entry = old->swap;
2072 	struct address_space *swap_mapping = swap_address_space(entry);
2073 	pgoff_t swap_index = swap_cache_index(entry);
2074 	XA_STATE(xas, &swap_mapping->i_pages, swap_index);
2075 	int nr_pages = folio_nr_pages(old);
2076 	int error = 0, i;
2077 
2078 	/*
2079 	 * We have arrived here because our zones are constrained, so don't
2080 	 * limit chance of success by further cpuset and node constraints.
2081 	 */
2082 	gfp &= ~GFP_CONSTRAINT_MASK;
2083 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2084 	if (nr_pages > 1) {
2085 		gfp_t huge_gfp = vma_thp_gfp_mask(vma);
2086 
2087 		gfp = limit_gfp_mask(huge_gfp, gfp);
2088 	}
2089 #endif
2090 
2091 	new = shmem_alloc_folio(gfp, folio_order(old), info, index);
2092 	if (!new)
2093 		return -ENOMEM;
2094 
2095 	folio_ref_add(new, nr_pages);
2096 	folio_copy(new, old);
2097 	flush_dcache_folio(new);
2098 
2099 	__folio_set_locked(new);
2100 	__folio_set_swapbacked(new);
2101 	folio_mark_uptodate(new);
2102 	new->swap = entry;
2103 	folio_set_swapcache(new);
2104 
2105 	/* Swap cache still stores N entries instead of a high-order entry */
2106 	xa_lock_irq(&swap_mapping->i_pages);
2107 	for (i = 0; i < nr_pages; i++) {
2108 		void *item = xas_load(&xas);
2109 
2110 		if (item != old) {
2111 			error = -ENOENT;
2112 			break;
2113 		}
2114 
2115 		xas_store(&xas, new);
2116 		xas_next(&xas);
2117 	}
2118 	if (!error) {
2119 		mem_cgroup_replace_folio(old, new);
2120 		shmem_update_stats(new, nr_pages);
2121 		shmem_update_stats(old, -nr_pages);
2122 	}
2123 	xa_unlock_irq(&swap_mapping->i_pages);
2124 
2125 	if (unlikely(error)) {
2126 		/*
2127 		 * Is this possible?  I think not, now that our callers
2128 		 * check both the swapcache flag and folio->private
2129 		 * after getting the folio lock; but be defensive.
2130 		 * Point 'old' at the new folio, so it is the one cleared and freed.
2131 		 */
2132 		old = new;
2133 	} else {
2134 		folio_add_lru(new);
2135 		*foliop = new;
2136 	}
2137 
2138 	folio_clear_swapcache(old);
2139 	old->private = NULL;
2140 
2141 	folio_unlock(old);
2142 	/*
2143 	 * The old folio has been removed from the swap cache: drop its
2144 	 * 'nr_pages' references, as well as the one temporary reference
2145 	 * obtained from the swap cache lookup.
2146 	 */
2147 	folio_put_refs(old, nr_pages + 1);
2148 	return error;
2149 }
2150 
2151 static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
2152 					 struct folio *folio, swp_entry_t swap,
2153 					 bool skip_swapcache)
2154 {
2155 	struct address_space *mapping = inode->i_mapping;
2156 	swp_entry_t swapin_error;
2157 	void *old;
2158 	int nr_pages;
2159 
2160 	swapin_error = make_poisoned_swp_entry();
2161 	old = xa_cmpxchg_irq(&mapping->i_pages, index,
2162 			     swp_to_radix_entry(swap),
2163 			     swp_to_radix_entry(swapin_error), 0);
2164 	if (old != swp_to_radix_entry(swap))
2165 		return;
2166 
2167 	nr_pages = folio_nr_pages(folio);
2168 	folio_wait_writeback(folio);
2169 	if (!skip_swapcache)
2170 		delete_from_swap_cache(folio);
2171 	/*
2172 	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
2173 	 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
2174 	 * in shmem_evict_inode().
2175 	 */
2176 	shmem_recalc_inode(inode, -nr_pages, -nr_pages);
2177 	swap_free_nr(swap, nr_pages);
2178 }
2179 
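/*
 * Editorial summary (not from the original source): split the large swap
 * entry stored in the shmem mapping so that the entry covering @index ends
 * up order-0, re-storing each remaining piece with its matching swap offset.
 * Returns the order of the entry originally found (0 if it was already
 * small), or a negative xarray error.
 */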
2180 static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
2181 				   swp_entry_t swap, gfp_t gfp)
2182 {
2183 	struct address_space *mapping = inode->i_mapping;
2184 	XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
2185 	int split_order = 0, entry_order;
2186 	int i;
2187 
2188 	/* Convert user data gfp flags to xarray node gfp flags */
2189 	gfp &= GFP_RECLAIM_MASK;
2190 
2191 	for (;;) {
2192 		void *old = NULL;
2193 		int cur_order;
2194 		pgoff_t swap_index;
2195 
2196 		xas_lock_irq(&xas);
2197 		old = xas_load(&xas);
2198 		if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
2199 			xas_set_err(&xas, -EEXIST);
2200 			goto unlock;
2201 		}
2202 
2203 		entry_order = xas_get_order(&xas);
2204 
2205 		if (!entry_order)
2206 			goto unlock;
2207 
2208 		/* Try to split large swap entry in pagecache */
2209 		cur_order = entry_order;
2210 		swap_index = round_down(index, 1 << entry_order);
2211 
2212 		split_order = xas_try_split_min_order(cur_order);
2213 
2214 		while (cur_order > 0) {
2215 			pgoff_t aligned_index =
2216 				round_down(index, 1 << cur_order);
2217 			pgoff_t swap_offset = aligned_index - swap_index;
2218 
2219 			xas_set_order(&xas, index, split_order);
2220 			xas_try_split(&xas, old, cur_order);
2221 			if (xas_error(&xas))
2222 				goto unlock;
2223 
2224 			/*
2225 			 * Re-set the swap entries after splitting; the swap
2226 			 * offsets within the original large entry are contiguous.
2227 			 */
2228 			for (i = 0; i < 1 << cur_order;
2229 			     i += (1 << split_order)) {
2230 				swp_entry_t tmp;
2231 
2232 				tmp = swp_entry(swp_type(swap),
2233 						swp_offset(swap) + swap_offset +
2234 							i);
2235 				__xa_store(&mapping->i_pages, aligned_index + i,
2236 					   swp_to_radix_entry(tmp), 0);
2237 			}
2238 			cur_order = split_order;
2239 			split_order = xas_try_split_min_order(split_order);
2240 		}
2241 
2242 unlock:
2243 		xas_unlock_irq(&xas);
2244 
2245 		if (!xas_nomem(&xas, gfp))
2246 			break;
2247 	}
2248 
2249 	if (xas_error(&xas))
2250 		return xas_error(&xas);
2251 
2252 	return entry_order;
2253 }
2254 
2255 /*
2256  * Swap in the folio pointed to by *foliop.
2257  * Caller has to make sure that *foliop contains a valid swapped folio.
2258  * Returns 0 and the folio in *foliop on success. On failure, returns the
2259  * error code and NULL in *foliop.
2260  */
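/*
 * Editorial overview (not from the original source): three paths follow.
 * The folio may already be in the swap cache; on a SWP_SYNCHRONOUS_IO device
 * a fresh folio is allocated and read directly, bypassing the swap cache
 * (skip_swapcache); otherwise the large swap entry is split if necessary and
 * the folio is read through order-0 swap_cluster_readahead().
 */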
2261 static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
2262 			     struct folio **foliop, enum sgp_type sgp,
2263 			     gfp_t gfp, struct vm_area_struct *vma,
2264 			     vm_fault_t *fault_type)
2265 {
2266 	struct address_space *mapping = inode->i_mapping;
2267 	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
2268 	struct shmem_inode_info *info = SHMEM_I(inode);
2269 	struct swap_info_struct *si;
2270 	struct folio *folio = NULL;
2271 	bool skip_swapcache = false;
2272 	swp_entry_t swap;
2273 	int error, nr_pages, order, split_order;
2274 
2275 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
2276 	swap = radix_to_swp_entry(*foliop);
2277 	*foliop = NULL;
2278 
2279 	if (is_poisoned_swp_entry(swap))
2280 		return -EIO;
2281 
2282 	si = get_swap_device(swap);
2283 	if (!si) {
2284 		if (!shmem_confirm_swap(mapping, index, swap))
2285 			return -EEXIST;
2286 		else
2287 			return -EINVAL;
2288 	}
2289 
2290 	/* Look it up and read it in.. */
2291 	folio = swap_cache_get_folio(swap, NULL, 0);
2292 	order = xa_get_order(&mapping->i_pages, index);
2293 	if (!folio) {
2294 		int nr_pages = 1 << order;
2295 		bool fallback_order0 = false;
2296 
2297 		/* Or update major stats only when swapin succeeds?? */
2298 		if (fault_type) {
2299 			*fault_type |= VM_FAULT_MAJOR;
2300 			count_vm_event(PGMAJFAULT);
2301 			count_memcg_event_mm(fault_mm, PGMAJFAULT);
2302 		}
2303 
2304 		/*
2305 		 * If uffd is active for the vma, we need per-page fault
2306 		 * fidelity to maintain the uffd semantics, so fall back to
2307 		 * swapping in an order-0 folio; the same goes for the zswap case.
2308 		 * Any existing sub folio in the swap cache also blocks
2309 		 * mTHP swapin.
2310 		 */
2311 		if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
2312 				  !zswap_never_enabled() ||
2313 				  non_swapcache_batch(swap, nr_pages) != nr_pages))
2314 			fallback_order0 = true;
2315 
2316 		/* Skip swapcache for synchronous device. */
2317 		if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
2318 			folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
2319 			if (!IS_ERR(folio)) {
2320 				skip_swapcache = true;
2321 				goto alloced;
2322 			}
2323 
2324 			/*
2325 			 * Fall back to swapping in an order-0 folio, unless the swap entry
2326 			 * already exists.
2327 			 */
2328 			error = PTR_ERR(folio);
2329 			folio = NULL;
2330 			if (error == -EEXIST)
2331 				goto failed;
2332 		}
2333 
2334 		/*
2335 		 * Now the swap device can only swap in order-0 folios, so we
2336 		 * should split the large swap entry stored in the page cache
2337 		 * if necessary.
2338 		 */
2339 		split_order = shmem_split_large_entry(inode, index, swap, gfp);
2340 		if (split_order < 0) {
2341 			error = split_order;
2342 			goto failed;
2343 		}
2344 
2345 		/*
2346 		 * If the large swap entry has already been split, it is
2347 		 * necessary to recalculate the new swap entry based on
2348 		 * the old order alignment.
2349 		 */
2350 		if (split_order > 0) {
2351 			pgoff_t offset = index - round_down(index, 1 << split_order);
2352 
2353 			swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
2354 		}
2355 
2356 		/* Here we actually start the io */
2357 		folio = shmem_swapin_cluster(swap, gfp, info, index);
2358 		if (!folio) {
2359 			error = -ENOMEM;
2360 			goto failed;
2361 		}
2362 	} else if (order != folio_order(folio)) {
2363 		/*
2364 		 * Swap readahead may swap in order 0 folios into swapcache
2365 		 * asynchronously, while the shmem mapping can still store
2366 		 * large swap entries. In such cases, we should split the
2367 		 * large swap entry to prevent possible data corruption.
2368 		 */
2369 		split_order = shmem_split_large_entry(inode, index, swap, gfp);
2370 		if (split_order < 0) {
2371 			folio_put(folio);
2372 			folio = NULL;
2373 			error = split_order;
2374 			goto failed;
2375 		}
2376 
2377 		/*
2378 		 * If the large swap entry has already been split, it is
2379 		 * necessary to recalculate the new swap entry based on
2380 		 * the old order alignment.
2381 		 */
2382 		if (split_order > 0) {
2383 			pgoff_t offset = index - round_down(index, 1 << split_order);
2384 
2385 			swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
2386 		}
2387 	}
2388 
2389 alloced:
2390 	/* We have to do this with folio locked to prevent races */
2391 	folio_lock(folio);
2392 	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
2393 	    folio->swap.val != swap.val ||
2394 	    !shmem_confirm_swap(mapping, index, swap) ||
2395 	    xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
2396 		error = -EEXIST;
2397 		goto unlock;
2398 	}
2399 	if (!folio_test_uptodate(folio)) {
2400 		error = -EIO;
2401 		goto failed;
2402 	}
2403 	folio_wait_writeback(folio);
2404 	nr_pages = folio_nr_pages(folio);
2405 
2406 	/*
2407 	 * Some architectures may have to restore extra metadata to the
2408 	 * folio after reading from swap.
2409 	 */
2410 	arch_swap_restore(folio_swap(swap, folio), folio);
2411 
2412 	if (shmem_should_replace_folio(folio, gfp)) {
2413 		error = shmem_replace_folio(&folio, gfp, info, index, vma);
2414 		if (error)
2415 			goto failed;
2416 	}
2417 
2418 	error = shmem_add_to_page_cache(folio, mapping,
2419 					round_down(index, nr_pages),
2420 					swp_to_radix_entry(swap), gfp);
2421 	if (error)
2422 		goto failed;
2423 
2424 	shmem_recalc_inode(inode, 0, -nr_pages);
2425 
2426 	if (sgp == SGP_WRITE)
2427 		folio_mark_accessed(folio);
2428 
2429 	if (skip_swapcache) {
2430 		folio->swap.val = 0;
2431 		swapcache_clear(si, swap, nr_pages);
2432 	} else {
2433 		delete_from_swap_cache(folio);
2434 	}
2435 	folio_mark_dirty(folio);
2436 	swap_free_nr(swap, nr_pages);
2437 	put_swap_device(si);
2438 
2439 	*foliop = folio;
2440 	return 0;
2441 failed:
2442 	if (!shmem_confirm_swap(mapping, index, swap))
2443 		error = -EEXIST;
2444 	if (error == -EIO)
2445 		shmem_set_folio_swapin_error(inode, index, folio, swap,
2446 					     skip_swapcache);
2447 unlock:
2448 	if (skip_swapcache)
2449 		swapcache_clear(si, swap, folio_nr_pages(folio));
2450 	if (folio) {
2451 		folio_unlock(folio);
2452 		folio_put(folio);
2453 	}
2454 	put_swap_device(si);
2455 
2456 	return error;
2457 }
2458 
2459 /*
2460  * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
2461  *
2462  * If we allocate a new one we do not mark it dirty. That's up to the
2463  * vm. If we swap it in we mark it dirty since we also free the swap
2464  * vm. If we swap it in we mark it dirty, since we also free the swap
2465  * entry: a page cannot live in both the swap and page cache.
2466  * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
2467  */
2468 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
2469 		loff_t write_end, struct folio **foliop, enum sgp_type sgp,
2470 		gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
2471 {
2472 	struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
2473 	struct mm_struct *fault_mm;
2474 	struct folio *folio;
2475 	int error;
2476 	bool alloced;
2477 	unsigned long orders = 0;
2478 
2479 	if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
2480 		return -EINVAL;
2481 
2482 	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
2483 		return -EFBIG;
2484 repeat:
2485 	if (sgp <= SGP_CACHE &&
2486 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
2487 		return -EINVAL;
2488 
2489 	alloced = false;
2490 	fault_mm = vma ? vma->vm_mm : NULL;
2491 
2492 	folio = filemap_get_entry(inode->i_mapping, index);
2493 	if (folio && vma && userfaultfd_minor(vma)) {
2494 		if (!xa_is_value(folio))
2495 			folio_put(folio);
2496 		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
2497 		return 0;
2498 	}
2499 
2500 	if (xa_is_value(folio)) {
2501 		error = shmem_swapin_folio(inode, index, &folio,
2502 					   sgp, gfp, vma, fault_type);
2503 		if (error == -EEXIST)
2504 			goto repeat;
2505 
2506 		*foliop = folio;
2507 		return error;
2508 	}
2509 
2510 	if (folio) {
2511 		folio_lock(folio);
2512 
2513 		/* Has the folio been truncated or swapped out? */
2514 		if (unlikely(folio->mapping != inode->i_mapping)) {
2515 			folio_unlock(folio);
2516 			folio_put(folio);
2517 			goto repeat;
2518 		}
2519 		if (sgp == SGP_WRITE)
2520 			folio_mark_accessed(folio);
2521 		if (folio_test_uptodate(folio))
2522 			goto out;
2523 		/* fallocated folio */
2524 		if (sgp != SGP_READ)
2525 			goto clear;
2526 		folio_unlock(folio);
2527 		folio_put(folio);
2528 	}
2529 
2530 	/*
2531 	 * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
2532 	 * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
2533 	 */
2534 	*foliop = NULL;
2535 	if (sgp == SGP_READ)
2536 		return 0;
2537 	if (sgp == SGP_NOALLOC)
2538 		return -ENOENT;
2539 
2540 	/*
2541 	 * Fast cache lookup and swap lookup did not find it: allocate.
2542 	 */
2543 
2544 	if (vma && userfaultfd_missing(vma)) {
2545 		*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
2546 		return 0;
2547 	}
2548 
2549 	/* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
2550 	orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
2551 	if (orders > 0) {
2552 		gfp_t huge_gfp;
2553 
2554 		huge_gfp = vma_thp_gfp_mask(vma);
2555 		huge_gfp = limit_gfp_mask(huge_gfp, gfp);
2556 		folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
2557 				inode, index, fault_mm, orders);
2558 		if (!IS_ERR(folio)) {
2559 			if (folio_test_pmd_mappable(folio))
2560 				count_vm_event(THP_FILE_ALLOC);
2561 			count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
2562 			goto alloced;
2563 		}
2564 		if (PTR_ERR(folio) == -EEXIST)
2565 			goto repeat;
2566 	}
2567 
2568 	folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0);
2569 	if (IS_ERR(folio)) {
2570 		error = PTR_ERR(folio);
2571 		if (error == -EEXIST)
2572 			goto repeat;
2573 		folio = NULL;
2574 		goto unlock;
2575 	}
2576 
2577 alloced:
2578 	alloced = true;
2579 	if (folio_test_large(folio) &&
2580 	    DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
2581 					folio_next_index(folio)) {
2582 		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
2583 		struct shmem_inode_info *info = SHMEM_I(inode);
2584 		/*
2585 		 * Part of the large folio is beyond i_size: subject
2586 		 * to shrink under memory pressure.
2587 		 */
2588 		spin_lock(&sbinfo->shrinklist_lock);
2589 		/*
2590 		 * _careful to defend against unlocked access to
2591 		 * list_empty_careful() defends against unlocked access to
2592 		 * ->shrinklist in shmem_unused_huge_shrink()
2593 		if (list_empty_careful(&info->shrinklist)) {
2594 			list_add_tail(&info->shrinklist,
2595 				      &sbinfo->shrinklist);
2596 			sbinfo->shrinklist_len++;
2597 		}
2598 		spin_unlock(&sbinfo->shrinklist_lock);
2599 	}
2600 
2601 	if (sgp == SGP_WRITE)
2602 		folio_set_referenced(folio);
2603 	/*
2604 	 * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
2605 	 */
2606 	if (sgp == SGP_FALLOC)
2607 		sgp = SGP_WRITE;
2608 clear:
2609 	/*
2610 	 * Let SGP_WRITE caller clear ends if write does not fill folio;
2611 	 * but SGP_FALLOC on a folio fallocated earlier must initialize
2612 	 * it now, lest undo on failure cancel our earlier guarantee.
2613 	 */
2614 	if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
2615 		long i, n = folio_nr_pages(folio);
2616 
2617 		for (i = 0; i < n; i++)
2618 			clear_highpage(folio_page(folio, i));
2619 		flush_dcache_folio(folio);
2620 		folio_mark_uptodate(folio);
2621 	}
2622 
2623 	/* Perhaps the file has been truncated since we checked */
2624 	if (sgp <= SGP_CACHE &&
2625 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
2626 		error = -EINVAL;
2627 		goto unlock;
2628 	}
2629 out:
2630 	*foliop = folio;
2631 	return 0;
2632 
2633 	/*
2634 	 * Error recovery.
2635 	 */
2636 unlock:
2637 	if (alloced)
2638 		filemap_remove_folio(folio);
2639 	shmem_recalc_inode(inode, 0, 0);
2640 	if (folio) {
2641 		folio_unlock(folio);
2642 		folio_put(folio);
2643 	}
2644 	return error;
2645 }
2646 
2647 /**
2648  * shmem_get_folio - find and lock a shmem folio.
2649  * @inode:	inode to search
2650  * @index:	the page index.
2651  * @write_end:	end of a write, could extend inode size
2652  * @foliop:	pointer to the folio if found
2653  * @sgp:	SGP_* flags to control behavior
2654  *
2655  * Looks up the page cache entry at @inode & @index.  If a folio is
2656  * present, it is returned locked with an increased refcount.
2657  *
2658  * If the caller modifies data in the folio, it must call folio_mark_dirty()
2659  * before unlocking the folio to ensure that the folio is not reclaimed.
2660  * There is no need to reserve space before calling folio_mark_dirty().
2661  *
2662  * When no folio is found, the behavior depends on @sgp:
2663  *  - for SGP_READ, *@foliop is %NULL and 0 is returned
2664  *  - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
2665  *  - for all other flags a new folio is allocated, inserted into the
2666  *    page cache and returned locked in @foliop.
2667  *
2668  * Context: May sleep.
2669  * Return: 0 if successful, else a negative error code.
2670  */
2671 int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
2672 		    struct folio **foliop, enum sgp_type sgp)
2673 {
2674 	return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
2675 			mapping_gfp_mask(inode->i_mapping), NULL, NULL);
2676 }
2677 EXPORT_SYMBOL_GPL(shmem_get_folio);
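/*
 * Illustrative usage sketch (editorial, not part of the original source),
 * mirroring what shmem_write_begin() below does:
 *
 *	struct folio *folio;
 *	int err = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
 *	if (!err && folio) {
 *		... copy data into the folio ...
 *		folio_mark_dirty(folio);
 *		folio_unlock(folio);
 *		folio_put(folio);
 *	}
 *
 * Per the kernel-doc above, the folio comes back locked with an elevated
 * refcount and must be marked dirty before unlock if it was modified.
 */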
2678 
2679 /*
2680  * This is like autoremove_wake_function, but it removes the wait queue
2681  * entry unconditionally - even if something else had already woken the
2682  * target.
2683  */
2684 static int synchronous_wake_function(wait_queue_entry_t *wait,
2685 			unsigned int mode, int sync, void *key)
2686 {
2687 	int ret = default_wake_function(wait, mode, sync, key);
2688 	list_del_init(&wait->entry);
2689 	return ret;
2690 }
2691 
2692 /*
2693  * Trinity finds that probing a hole which tmpfs is punching can
2694  * prevent the hole-punch from ever completing: which in turn
2695  * locks writers out with its hold on i_rwsem.  So refrain from
2696  * faulting pages into the hole while it's being punched.  Although
2697  * shmem_undo_range() does remove the additions, it may be unable to
2698  * keep up, as each new page needs its own unmap_mapping_range() call,
2699  * and the i_mmap tree grows ever slower to scan if new vmas are added.
2700  *
2701  * It does not matter if we sometimes reach this check just before the
2702  * hole-punch begins, so that one fault then races with the punch:
2703  * we just need to make racing faults a rare case.
2704  *
2705  * The implementation below would be much simpler if we just used a
2706  * standard mutex or completion: but we cannot take i_rwsem in fault,
2707  * and bloating every shmem inode for this unlikely case would be sad.
2708  */
2709 static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
2710 {
2711 	struct shmem_falloc *shmem_falloc;
2712 	struct file *fpin = NULL;
2713 	vm_fault_t ret = 0;
2714 
2715 	spin_lock(&inode->i_lock);
2716 	shmem_falloc = inode->i_private;
2717 	if (shmem_falloc &&
2718 	    shmem_falloc->waitq &&
2719 	    vmf->pgoff >= shmem_falloc->start &&
2720 	    vmf->pgoff < shmem_falloc->next) {
2721 		wait_queue_head_t *shmem_falloc_waitq;
2722 		DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
2723 
2724 		ret = VM_FAULT_NOPAGE;
2725 		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
2726 		shmem_falloc_waitq = shmem_falloc->waitq;
2727 		prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
2728 				TASK_UNINTERRUPTIBLE);
2729 		spin_unlock(&inode->i_lock);
2730 		schedule();
2731 
2732 		/*
2733 		 * shmem_falloc_waitq points into the shmem_fallocate()
2734 		 * stack of the hole-punching task: shmem_falloc_waitq
2735 		 * is usually invalid by the time we reach here, but
2736 		 * finish_wait() does not dereference it in that case;
2737 		 * though i_lock needed lest racing with wake_up_all().
2738 		 */
2739 		spin_lock(&inode->i_lock);
2740 		finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
2741 	}
2742 	spin_unlock(&inode->i_lock);
2743 	if (fpin) {
2744 		fput(fpin);
2745 		ret = VM_FAULT_RETRY;
2746 	}
2747 	return ret;
2748 }
2749 
2750 static vm_fault_t shmem_fault(struct vm_fault *vmf)
2751 {
2752 	struct inode *inode = file_inode(vmf->vma->vm_file);
2753 	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
2754 	struct folio *folio = NULL;
2755 	vm_fault_t ret = 0;
2756 	int err;
2757 
2758 	/*
2759 	 * Trinity finds that probing a hole which tmpfs is punching can
2760 	 * prevent the hole-punch from ever completing: noted in i_private.
2761 	 */
2762 	if (unlikely(inode->i_private)) {
2763 		ret = shmem_falloc_wait(vmf, inode);
2764 		if (ret)
2765 			return ret;
2766 	}
2767 
2768 	WARN_ON_ONCE(vmf->page != NULL);
2769 	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
2770 				  gfp, vmf, &ret);
2771 	if (err)
2772 		return vmf_error(err);
2773 	if (folio) {
2774 		vmf->page = folio_file_page(folio, vmf->pgoff);
2775 		ret |= VM_FAULT_LOCKED;
2776 	}
2777 	return ret;
2778 }
2779 
2780 unsigned long shmem_get_unmapped_area(struct file *file,
2781 				      unsigned long uaddr, unsigned long len,
2782 				      unsigned long pgoff, unsigned long flags)
2783 {
2784 	unsigned long addr;
2785 	unsigned long offset;
2786 	unsigned long inflated_len;
2787 	unsigned long inflated_addr;
2788 	unsigned long inflated_offset;
2789 	unsigned long hpage_size;
2790 
2791 	if (len > TASK_SIZE)
2792 		return -ENOMEM;
2793 
2794 	addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
2795 				    flags);
2796 
2797 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
2798 		return addr;
2799 	if (IS_ERR_VALUE(addr))
2800 		return addr;
2801 	if (addr & ~PAGE_MASK)
2802 		return addr;
2803 	if (addr > TASK_SIZE - len)
2804 		return addr;
2805 
2806 	if (shmem_huge == SHMEM_HUGE_DENY)
2807 		return addr;
2808 	if (flags & MAP_FIXED)
2809 		return addr;
2810 	/*
2811 	 * Our priority is to support MAP_SHARED mapped hugely;
2812 	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
2813 	 * But if caller specified an address hint and we allocated area there
2814 	 * successfully, respect that as before.
2815 	 */
2816 	if (uaddr == addr)
2817 		return addr;
2818 
2819 	hpage_size = HPAGE_PMD_SIZE;
2820 	if (shmem_huge != SHMEM_HUGE_FORCE) {
2821 		struct super_block *sb;
2822 		unsigned long __maybe_unused hpage_orders;
2823 		int order = 0;
2824 
2825 		if (file) {
2826 			VM_BUG_ON(file->f_op != &shmem_file_operations);
2827 			sb = file_inode(file)->i_sb;
2828 		} else {
2829 			/*
2830 			 * Called directly from mm/mmap.c, or drivers/char/mem.c
2831 			 * for "/dev/zero", to create a shared anonymous object.
2832 			 */
2833 			if (IS_ERR(shm_mnt))
2834 				return addr;
2835 			sb = shm_mnt->mnt_sb;
2836 
2837 			/*
2838 			 * Find the highest mTHP order used for anonymous shmem to
2839 			 * provide a suitable alignment address.
2840 			 */
2841 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2842 			hpage_orders = READ_ONCE(huge_shmem_orders_always);
2843 			hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
2844 			hpage_orders |= READ_ONCE(huge_shmem_orders_madvise);
2845 			if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
2846 				hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);
2847 
2848 			if (hpage_orders > 0) {
2849 				order = highest_order(hpage_orders);
2850 				hpage_size = PAGE_SIZE << order;
2851 			}
2852 #endif
2853 		}
2854 		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
2855 			return addr;
2856 	}
2857 
2858 	if (len < hpage_size)
2859 		return addr;
2860 
2861 	offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1);
2862 	if (offset && offset + len < 2 * hpage_size)
2863 		return addr;
2864 	if ((addr & (hpage_size - 1)) == offset)
2865 		return addr;
2866 
2867 	inflated_len = len + hpage_size - PAGE_SIZE;
2868 	if (inflated_len > TASK_SIZE)
2869 		return addr;
2870 	if (inflated_len < len)
2871 		return addr;
2872 
2873 	inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
2874 					     inflated_len, 0, flags);
2875 	if (IS_ERR_VALUE(inflated_addr))
2876 		return addr;
2877 	if (inflated_addr & ~PAGE_MASK)
2878 		return addr;
2879 
2880 	inflated_offset = inflated_addr & (hpage_size - 1);
2881 	inflated_addr += offset - inflated_offset;
2882 	if (inflated_offset > offset)
2883 		inflated_addr += hpage_size;
2884 
2885 	if (inflated_addr > TASK_SIZE - len)
2886 		return addr;
2887 	return inflated_addr;
2888 }
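/*
 * Illustrative example (editorial, not from the original source): with 2MB
 * PMD pages, a 4MB MAP_SHARED request at file offset 0 is retried with
 * inflated_len = 4MB + 2MB - 4KB; the start of whatever area the allocator
 * returns is then bumped so that (addr & (2MB - 1)) matches the file
 * offset's position within a huge page, letting faults install PMD mappings.
 */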
2889 
2890 #ifdef CONFIG_NUMA
2891 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
2892 {
2893 	struct inode *inode = file_inode(vma->vm_file);
2894 	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
2895 }
2896 
2897 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
2898 					  unsigned long addr, pgoff_t *ilx)
2899 {
2900 	struct inode *inode = file_inode(vma->vm_file);
2901 	pgoff_t index;
2902 
2903 	/*
2904 	 * Bias interleave by inode number to distribute better across nodes;
2905 	 * but this interface is independent of which page order is used, so
2906 	 * supplies only that bias, letting caller apply the offset (adjusted
2907 	 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
2908 	 */
2909 	*ilx = inode->i_ino;
2910 	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2911 	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
2912 }
2913 
2914 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
2915 			pgoff_t index, unsigned int order, pgoff_t *ilx)
2916 {
2917 	struct mempolicy *mpol;
2918 
2919 	/* Bias interleave by inode number to distribute better across nodes */
2920 	*ilx = info->vfs_inode.i_ino + (index >> order);
2921 
2922 	mpol = mpol_shared_policy_lookup(&info->policy, index);
2923 	return mpol ? mpol : get_task_policy(current);
2924 }
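/*
 * Illustrative note (editorial, not from the original source): for an
 * order-9 (PMD-sized) allocation the bias is i_ino + (index >> 9), so
 * consecutive huge pages of one file get consecutive interleave indices,
 * while different files start their interleave at different nodes.
 */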
2925 #else
2926 static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
2927 			pgoff_t index, unsigned int order, pgoff_t *ilx)
2928 {
2929 	*ilx = 0;
2930 	return NULL;
2931 }
2932 #endif /* CONFIG_NUMA */
2933 
2934 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
2935 {
2936 	struct inode *inode = file_inode(file);
2937 	struct shmem_inode_info *info = SHMEM_I(inode);
2938 	int retval = -ENOMEM;
2939 
2940 	/*
2941 	 * What serializes the accesses to info->flags?
2942 	 * ipc_lock_object() when called from shmctl_do_lock(),
2943 	 * no serialization needed when called from shm_destroy().
2944 	 */
2945 	if (lock && !(info->flags & VM_LOCKED)) {
2946 		if (!user_shm_lock(inode->i_size, ucounts))
2947 			goto out_nomem;
2948 		info->flags |= VM_LOCKED;
2949 		mapping_set_unevictable(file->f_mapping);
2950 	}
2951 	if (!lock && (info->flags & VM_LOCKED) && ucounts) {
2952 		user_shm_unlock(inode->i_size, ucounts);
2953 		info->flags &= ~VM_LOCKED;
2954 		mapping_clear_unevictable(file->f_mapping);
2955 	}
2956 	retval = 0;
2957 
2958 out_nomem:
2959 	return retval;
2960 }
2961 
2962 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
2963 {
2964 	struct inode *inode = file_inode(file);
2965 
2966 	file_accessed(file);
2967 	/* This is anonymous shared memory if it is unlinked at the time of mmap */
2968 	if (inode->i_nlink)
2969 		vma->vm_ops = &shmem_vm_ops;
2970 	else
2971 		vma->vm_ops = &shmem_anon_vm_ops;
2972 	return 0;
2973 }
2974 
2975 static int shmem_file_open(struct inode *inode, struct file *file)
2976 {
2977 	file->f_mode |= FMODE_CAN_ODIRECT;
2978 	return generic_file_open(inode, file);
2979 }
2980 
2981 #ifdef CONFIG_TMPFS_XATTR
2982 static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
2983 
2984 #if IS_ENABLED(CONFIG_UNICODE)
2985 /*
2986  * shmem_inode_casefold_flags - Deal with casefold file attribute flag
2987  *
2988  * The casefold file attribute needs some special checks. It can only be added to
2989  * an empty dir, and can't be removed from a non-empty dir.
2990  */
2991 static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
2992 				      struct dentry *dentry, unsigned int *i_flags)
2993 {
2994 	unsigned int old = inode->i_flags;
2995 	struct super_block *sb = inode->i_sb;
2996 
2997 	if (fsflags & FS_CASEFOLD_FL) {
2998 		if (!(old & S_CASEFOLD)) {
2999 			if (!sb->s_encoding)
3000 				return -EOPNOTSUPP;
3001 
3002 			if (!S_ISDIR(inode->i_mode))
3003 				return -ENOTDIR;
3004 
3005 			if (dentry && !simple_empty(dentry))
3006 				return -ENOTEMPTY;
3007 		}
3008 
3009 		*i_flags = *i_flags | S_CASEFOLD;
3010 	} else if (old & S_CASEFOLD) {
3011 		if (dentry && !simple_empty(dentry))
3012 			return -ENOTEMPTY;
3013 	}
3014 
3015 	return 0;
3016 }
3017 #else
3018 static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
3019 				      struct dentry *dentry, unsigned int *i_flags)
3020 {
3021 	if (fsflags & FS_CASEFOLD_FL)
3022 		return -EOPNOTSUPP;
3023 
3024 	return 0;
3025 }
3026 #endif
3027 
3028 /*
3029  * chattr's fsflags are unrelated to extended attributes,
3030  * but tmpfs has chosen to enable them under the same config option.
3031  */
3032 static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
3033 {
3034 	unsigned int i_flags = 0;
3035 	int ret;
3036 
3037 	ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags);
3038 	if (ret)
3039 		return ret;
3040 
3041 	if (fsflags & FS_NOATIME_FL)
3042 		i_flags |= S_NOATIME;
3043 	if (fsflags & FS_APPEND_FL)
3044 		i_flags |= S_APPEND;
3045 	if (fsflags & FS_IMMUTABLE_FL)
3046 		i_flags |= S_IMMUTABLE;
3047 	/*
3048 	 * But FS_NODUMP_FL does not require any action in i_flags.
3049 	 */
3050 	inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD);
3051 
3052 	return 0;
3053 }
3054 #else
3055 static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
3056 {
3057 }
3058 #define shmem_initxattrs NULL
3059 #endif
3060 
3061 static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
3062 {
3063 	return &SHMEM_I(inode)->dir_offsets;
3064 }
3065 
3066 static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
3067 					     struct super_block *sb,
3068 					     struct inode *dir, umode_t mode,
3069 					     dev_t dev, unsigned long flags)
3070 {
3071 	struct inode *inode;
3072 	struct shmem_inode_info *info;
3073 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
3074 	ino_t ino;
3075 	int err;
3076 
3077 	err = shmem_reserve_inode(sb, &ino);
3078 	if (err)
3079 		return ERR_PTR(err);
3080 
3081 	inode = new_inode(sb);
3082 	if (!inode) {
3083 		shmem_free_inode(sb, 0);
3084 		return ERR_PTR(-ENOSPC);
3085 	}
3086 
3087 	inode->i_ino = ino;
3088 	inode_init_owner(idmap, inode, dir, mode);
3089 	inode->i_blocks = 0;
3090 	simple_inode_init_ts(inode);
3091 	inode->i_generation = get_random_u32();
3092 	info = SHMEM_I(inode);
3093 	memset(info, 0, (char *)inode - (char *)info);
3094 	spin_lock_init(&info->lock);
3095 	atomic_set(&info->stop_eviction, 0);
3096 	info->seals = F_SEAL_SEAL;
3097 	info->flags = flags & VM_NORESERVE;
3098 	info->i_crtime = inode_get_mtime(inode);
3099 	info->fsflags = (dir == NULL) ? 0 :
3100 		SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
3101 	if (info->fsflags)
3102 		shmem_set_inode_flags(inode, info->fsflags, NULL);
3103 	INIT_LIST_HEAD(&info->shrinklist);
3104 	INIT_LIST_HEAD(&info->swaplist);
3105 	simple_xattrs_init(&info->xattrs);
3106 	cache_no_acl(inode);
3107 	if (sbinfo->noswap)
3108 		mapping_set_unevictable(inode->i_mapping);
3109 
3110 	/* Don't consider 'deny' for emergencies and 'force' for testing */
3111 	if (sbinfo->huge)
3112 		mapping_set_large_folios(inode->i_mapping);
3113 
3114 	switch (mode & S_IFMT) {
3115 	default:
3116 		inode->i_op = &shmem_special_inode_operations;
3117 		init_special_inode(inode, mode, dev);
3118 		break;
3119 	case S_IFREG:
3120 		inode->i_mapping->a_ops = &shmem_aops;
3121 		inode->i_op = &shmem_inode_operations;
3122 		inode->i_fop = &shmem_file_operations;
3123 		mpol_shared_policy_init(&info->policy,
3124 					 shmem_get_sbmpol(sbinfo));
3125 		break;
3126 	case S_IFDIR:
3127 		inc_nlink(inode);
3128 		/* Some things misbehave if size == 0 on a directory */
3129 		inode->i_size = 2 * BOGO_DIRENT_SIZE;
3130 		inode->i_op = &shmem_dir_inode_operations;
3131 		inode->i_fop = &simple_offset_dir_operations;
3132 		simple_offset_init(shmem_get_offset_ctx(inode));
3133 		break;
3134 	case S_IFLNK:
3135 		/*
3136 		 * Must not load anything in the rbtree,
3137 		 * mpol_free_shared_policy will not be called.
3138 		 */
3139 		mpol_shared_policy_init(&info->policy, NULL);
3140 		break;
3141 	}
3142 
3143 	lockdep_annotate_inode_mutex_key(inode);
3144 	return inode;
3145 }
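/*
 * Editorial note (not from the original source): the S_IFMT switch above
 * wires each inode type to its operations -- regular files get the shmem
 * address_space and file ops plus a shared mempolicy seeded from the
 * superblock, directories use the simple_offset infrastructure for readdir,
 * symlinks get an empty shared mempolicy so there is nothing to free later,
 * and anything else is set up as a special inode.
 */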
3146 
3147 #ifdef CONFIG_TMPFS_QUOTA
3148 static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
3149 				     struct super_block *sb, struct inode *dir,
3150 				     umode_t mode, dev_t dev, unsigned long flags)
3151 {
3152 	int err;
3153 	struct inode *inode;
3154 
3155 	inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
3156 	if (IS_ERR(inode))
3157 		return inode;
3158 
3159 	err = dquot_initialize(inode);
3160 	if (err)
3161 		goto errout;
3162 
3163 	err = dquot_alloc_inode(inode);
3164 	if (err) {
3165 		dquot_drop(inode);
3166 		goto errout;
3167 	}
3168 	return inode;
3169 
3170 errout:
3171 	inode->i_flags |= S_NOQUOTA;
3172 	iput(inode);
3173 	return ERR_PTR(err);
3174 }
3175 #else
3176 static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
3177 				     struct super_block *sb, struct inode *dir,
3178 				     umode_t mode, dev_t dev, unsigned long flags)
3179 {
3180 	return __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
3181 }
3182 #endif /* CONFIG_TMPFS_QUOTA */
3183 
3184 #ifdef CONFIG_USERFAULTFD
3185 int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
3186 			   struct vm_area_struct *dst_vma,
3187 			   unsigned long dst_addr,
3188 			   unsigned long src_addr,
3189 			   uffd_flags_t flags,
3190 			   struct folio **foliop)
3191 {
3192 	struct inode *inode = file_inode(dst_vma->vm_file);
3193 	struct shmem_inode_info *info = SHMEM_I(inode);
3194 	struct address_space *mapping = inode->i_mapping;
3195 	gfp_t gfp = mapping_gfp_mask(mapping);
3196 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
3197 	void *page_kaddr;
3198 	struct folio *folio;
3199 	int ret;
3200 	pgoff_t max_off;
3201 
3202 	if (shmem_inode_acct_blocks(inode, 1)) {
3203 		/*
3204 		 * We may have got a page, returned -ENOENT triggering a retry,
3205 		 * and now we find ourselves with -ENOMEM. Release the page, to
3206 		 * avoid a BUG_ON in our caller.
3207 		 */
3208 		if (unlikely(*foliop)) {
3209 			folio_put(*foliop);
3210 			*foliop = NULL;
3211 		}
3212 		return -ENOMEM;
3213 	}
3214 
3215 	if (!*foliop) {
3216 		ret = -ENOMEM;
3217 		folio = shmem_alloc_folio(gfp, 0, info, pgoff);
3218 		if (!folio)
3219 			goto out_unacct_blocks;
3220 
3221 		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
3222 			page_kaddr = kmap_local_folio(folio, 0);
3223 			/*
3224 			 * The read mmap_lock is held here.  Despite the
3225 			 * mmap_lock being read recursive a deadlock is still
3226 			 * possible if a writer has taken a lock.  For example:
3227 			 *
3228 			 * process A thread 1 takes read lock on own mmap_lock
3229 			 * process A thread 2 calls mmap, blocks taking write lock
3230 			 * process B thread 1 takes page fault, read lock on own mmap lock
3231 			 * process B thread 2 calls mmap, blocks taking write lock
3232 			 * process A thread 1 blocks taking read lock on process B
3233 			 * process B thread 1 blocks taking read lock on process A
3234 			 *
3235 			 * Disable page faults to prevent potential deadlock
3236 			 * and retry the copy outside the mmap_lock.
3237 			 */
3238 			pagefault_disable();
3239 			ret = copy_from_user(page_kaddr,
3240 					     (const void __user *)src_addr,
3241 					     PAGE_SIZE);
3242 			pagefault_enable();
3243 			kunmap_local(page_kaddr);
3244 
3245 			/* fallback to copy_from_user outside mmap_lock */
3246 			if (unlikely(ret)) {
3247 				*foliop = folio;
3248 				ret = -ENOENT;
3249 				/* don't free the page */
3250 				goto out_unacct_blocks;
3251 			}
3252 
3253 			flush_dcache_folio(folio);
3254 		} else {		/* ZEROPAGE */
3255 			clear_user_highpage(&folio->page, dst_addr);
3256 		}
3257 	} else {
3258 		folio = *foliop;
3259 		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
3260 		*foliop = NULL;
3261 	}
3262 
3263 	VM_BUG_ON(folio_test_locked(folio));
3264 	VM_BUG_ON(folio_test_swapbacked(folio));
3265 	__folio_set_locked(folio);
3266 	__folio_set_swapbacked(folio);
3267 	__folio_mark_uptodate(folio);
3268 
3269 	ret = -EFAULT;
3270 	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
3271 	if (unlikely(pgoff >= max_off))
3272 		goto out_release;
3273 
3274 	ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
3275 	if (ret)
3276 		goto out_release;
3277 	ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
3278 	if (ret)
3279 		goto out_release;
3280 
3281 	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
3282 				       &folio->page, true, flags);
3283 	if (ret)
3284 		goto out_delete_from_cache;
3285 
3286 	shmem_recalc_inode(inode, 1, 0);
3287 	folio_unlock(folio);
3288 	return 0;
3289 out_delete_from_cache:
3290 	filemap_remove_folio(folio);
3291 out_release:
3292 	folio_unlock(folio);
3293 	folio_put(folio);
3294 out_unacct_blocks:
3295 	shmem_inode_unacct_blocks(inode, 1);
3296 	return ret;
3297 }
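/*
 * Editorial note (not from the original source): the *foliop handling above
 * implements the mfill retry protocol -- if copy_from_user() faults while
 * page faults are disabled, the freshly allocated folio is handed back via
 * *foliop with -ENOENT, the caller retries the copy with mmap_lock dropped,
 * and the next call finds the folio already present in *foliop.
 */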
3298 #endif /* CONFIG_USERFAULTFD */
3299 
3300 #ifdef CONFIG_TMPFS
3301 static const struct inode_operations shmem_symlink_inode_operations;
3302 static const struct inode_operations shmem_short_symlink_operations;
3303 
3304 static int
3305 shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
3306 		  loff_t pos, unsigned len,
3307 		  struct folio **foliop, void **fsdata)
3308 {
3309 	struct inode *inode = mapping->host;
3310 	struct shmem_inode_info *info = SHMEM_I(inode);
3311 	pgoff_t index = pos >> PAGE_SHIFT;
3312 	struct folio *folio;
3313 	int ret = 0;
3314 
3315 	/* i_rwsem is held by caller */
3316 	if (unlikely(info->seals & (F_SEAL_GROW |
3317 				   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
3318 		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
3319 			return -EPERM;
3320 		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
3321 			return -EPERM;
3322 	}
3323 
3324 	ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
3325 	if (ret)
3326 		return ret;
3327 
3328 	if (folio_contain_hwpoisoned_page(folio)) {
3329 		folio_unlock(folio);
3330 		folio_put(folio);
3331 		return -EIO;
3332 	}
3333 
3334 	*foliop = folio;
3335 	return 0;
3336 }
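/*
 * Illustrative sketch, not part of this file: the seal checks above are
 * what make memfd sealing bite for buffered writes.  With a hypothetical
 * sealed memfd ("data" being an arbitrary 4KiB buffer):
 *
 *	int fd = memfd_create("buf", MFD_ALLOW_SEALING);
 *	ftruncate(fd, 4096);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_GROW);
 *	pwrite(fd, data, 4096, 4096);	// fails with EPERM: would grow the file
 *
 * F_SEAL_WRITE and F_SEAL_FUTURE_WRITE reject the buffered write outright,
 * whatever the offset.
 */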
3337 
3338 static int
3339 shmem_write_end(const struct kiocb *iocb, struct address_space *mapping,
3340 		loff_t pos, unsigned len, unsigned copied,
3341 		struct folio *folio, void *fsdata)
3342 {
3343 	struct inode *inode = mapping->host;
3344 
3345 	if (pos + copied > inode->i_size)
3346 		i_size_write(inode, pos + copied);
3347 
3348 	if (!folio_test_uptodate(folio)) {
3349 		if (copied < folio_size(folio)) {
3350 			size_t from = offset_in_folio(folio, pos);
3351 			folio_zero_segments(folio, 0, from,
3352 					from + copied, folio_size(folio));
3353 		}
3354 		folio_mark_uptodate(folio);
3355 	}
3356 	folio_mark_dirty(folio);
3357 	folio_unlock(folio);
3358 	folio_put(folio);
3359 
3360 	return copied;
3361 }
3362 
3363 static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3364 {
3365 	struct file *file = iocb->ki_filp;
3366 	struct inode *inode = file_inode(file);
3367 	struct address_space *mapping = inode->i_mapping;
3368 	pgoff_t index;
3369 	unsigned long offset;
3370 	int error = 0;
3371 	ssize_t retval = 0;
3372 
3373 	for (;;) {
3374 		struct folio *folio = NULL;
3375 		struct page *page = NULL;
3376 		unsigned long nr, ret;
3377 		loff_t end_offset, i_size = i_size_read(inode);
3378 		bool fallback_page_copy = false;
3379 		size_t fsize;
3380 
3381 		if (unlikely(iocb->ki_pos >= i_size))
3382 			break;
3383 
3384 		index = iocb->ki_pos >> PAGE_SHIFT;
3385 		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
3386 		if (error) {
3387 			if (error == -EINVAL)
3388 				error = 0;
3389 			break;
3390 		}
3391 		if (folio) {
3392 			folio_unlock(folio);
3393 
3394 			page = folio_file_page(folio, index);
3395 			if (PageHWPoison(page)) {
3396 				folio_put(folio);
3397 				error = -EIO;
3398 				break;
3399 			}
3400 
3401 			if (folio_test_large(folio) &&
3402 			    folio_test_has_hwpoisoned(folio))
3403 				fallback_page_copy = true;
3404 		}
3405 
3406 		/*
3407 		 * We must re-evaluate i_size after the lookup: reads (unlike
3408 		 * writes) are called without i_rwsem protection against truncate
3409 		 */
3410 		i_size = i_size_read(inode);
3411 		if (unlikely(iocb->ki_pos >= i_size)) {
3412 			if (folio)
3413 				folio_put(folio);
3414 			break;
3415 		}
3416 		end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count);
3417 		if (folio && likely(!fallback_page_copy))
3418 			fsize = folio_size(folio);
3419 		else
3420 			fsize = PAGE_SIZE;
3421 		offset = iocb->ki_pos & (fsize - 1);
3422 		nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);
3423 
3424 		if (folio) {
3425 			/*
3426 			 * If users can be writing to this page using arbitrary
3427 			 * virtual addresses, take care about potential aliasing
3428 			 * before reading the page on the kernel side.
3429 			 */
3430 			if (mapping_writably_mapped(mapping)) {
3431 				if (likely(!fallback_page_copy))
3432 					flush_dcache_folio(folio);
3433 				else
3434 					flush_dcache_page(page);
3435 			}
3436 
3437 			/*
3438 			 * Mark the folio accessed if we read the beginning.
3439 			 */
3440 			if (!offset)
3441 				folio_mark_accessed(folio);
3442 			/*
3443 			 * Ok, we have the page, and it's up-to-date, so
3444 			 * now we can copy it to user space...
3445 			 */
3446 			if (likely(!fallback_page_copy))
3447 				ret = copy_folio_to_iter(folio, offset, nr, to);
3448 			else
3449 				ret = copy_page_to_iter(page, offset, nr, to);
3450 			folio_put(folio);
3451 		} else if (user_backed_iter(to)) {
3452 			/*
3453 			 * Copying to userspace tends to be well optimized, and
3454 			 * clear_user() not so much, so it is noticeably faster
3455 			 * to copy the zero page than to clear the buffer.
3456 			 */
3457 			ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
3458 		} else {
3459 			/*
3460 			 * But submitting the same page twice in a row to
3461 			 * splice() - or others? - can result in confusion:
3462 			 * so don't attempt that optimization on pipes etc.
3463 			 */
3464 			ret = iov_iter_zero(nr, to);
3465 		}
3466 
3467 		retval += ret;
3468 		iocb->ki_pos += ret;
3469 
3470 		if (!iov_iter_count(to))
3471 			break;
3472 		if (ret < nr) {
3473 			error = -EFAULT;
3474 			break;
3475 		}
3476 		cond_resched();
3477 	}
3478 
3479 	file_accessed(file);
3480 	return retval ? retval : error;
3481 }
3482 
3483 static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
3484 {
3485 	struct file *file = iocb->ki_filp;
3486 	struct inode *inode = file->f_mapping->host;
3487 	ssize_t ret;
3488 
3489 	inode_lock(inode);
3490 	ret = generic_write_checks(iocb, from);
3491 	if (ret <= 0)
3492 		goto unlock;
3493 	ret = file_remove_privs(file);
3494 	if (ret)
3495 		goto unlock;
3496 	ret = file_update_time(file);
3497 	if (ret)
3498 		goto unlock;
3499 	ret = generic_perform_write(iocb, from);
3500 unlock:
3501 	inode_unlock(inode);
3502 	return ret;
3503 }
3504 
3505 static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
3506 			      struct pipe_buffer *buf)
3507 {
3508 	return true;
3509 }
3510 
3511 static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
3512 				  struct pipe_buffer *buf)
3513 {
3514 }
3515 
3516 static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
3517 				    struct pipe_buffer *buf)
3518 {
3519 	return false;
3520 }
3521 
3522 static const struct pipe_buf_operations zero_pipe_buf_ops = {
3523 	.release	= zero_pipe_buf_release,
3524 	.try_steal	= zero_pipe_buf_try_steal,
3525 	.get		= zero_pipe_buf_get,
3526 };
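/*
 * The three callbacks above are deliberately no-ops: the buffer always
 * points at the shared ZERO_PAGE(0), which must never be freed, stolen
 * into the page cache, or have a per-buffer reference taken on it.
 */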
3527 
3528 static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
3529 					loff_t fpos, size_t size)
3530 {
3531 	size_t offset = fpos & ~PAGE_MASK;
3532 
3533 	size = min_t(size_t, size, PAGE_SIZE - offset);
3534 
3535 	if (!pipe_is_full(pipe)) {
3536 		struct pipe_buffer *buf = pipe_head_buf(pipe);
3537 
3538 		*buf = (struct pipe_buffer) {
3539 			.ops	= &zero_pipe_buf_ops,
3540 			.page	= ZERO_PAGE(0),
3541 			.offset	= offset,
3542 			.len	= size,
3543 		};
3544 		pipe->head++;
3545 	}
3546 
3547 	return size;
3548 }
3549 
3550 static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
3551 				      struct pipe_inode_info *pipe,
3552 				      size_t len, unsigned int flags)
3553 {
3554 	struct inode *inode = file_inode(in);
3555 	struct address_space *mapping = inode->i_mapping;
3556 	struct folio *folio = NULL;
3557 	size_t total_spliced = 0, used, npages, n, part;
3558 	loff_t isize;
3559 	int error = 0;
3560 
3561 	/* Work out how much data we can actually add into the pipe */
3562 	used = pipe_buf_usage(pipe);
3563 	npages = max_t(ssize_t, pipe->max_usage - used, 0);
3564 	len = min_t(size_t, len, npages * PAGE_SIZE);
3565 
3566 	do {
3567 		bool fallback_page_splice = false;
3568 		struct page *page = NULL;
3569 		pgoff_t index;
3570 		size_t size;
3571 
3572 		if (*ppos >= i_size_read(inode))
3573 			break;
3574 
3575 		index = *ppos >> PAGE_SHIFT;
3576 		error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
3577 		if (error) {
3578 			if (error == -EINVAL)
3579 				error = 0;
3580 			break;
3581 		}
3582 		if (folio) {
3583 			folio_unlock(folio);
3584 
3585 			page = folio_file_page(folio, index);
3586 			if (PageHWPoison(page)) {
3587 				error = -EIO;
3588 				break;
3589 			}
3590 
3591 			if (folio_test_large(folio) &&
3592 			    folio_test_has_hwpoisoned(folio))
3593 				fallback_page_splice = true;
3594 		}
3595 
3596 		/*
3597 		 * i_size must be checked after we know the pages are Uptodate.
3598 		 *
3599 		 * Re-checking i_size after the folio lookup lets us calculate
3600 		 * the correct value for "part", so that the zero-filled tail
3601 		 * of a page is not spliced out to the pipe (unless the file is
3602 		 * extended again concurrently - which is fine).
3603 		 */
3604 		isize = i_size_read(inode);
3605 		if (unlikely(*ppos >= isize))
3606 			break;
3607 		/*
3608 		 * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned
3609 		 * pages.
3610 		 */
3611 		size = len;
3612 		if (unlikely(fallback_page_splice)) {
3613 			size_t offset = *ppos & ~PAGE_MASK;
3614 
3615 			size = umin(size, PAGE_SIZE - offset);
3616 		}
3617 		part = min_t(loff_t, isize - *ppos, size);
3618 
3619 		if (folio) {
3620 			/*
3621 			 * If users can be writing to this page using arbitrary
3622 			 * virtual addresses, take care about potential aliasing
3623 			 * before reading the page on the kernel side.
3624 			 */
3625 			if (mapping_writably_mapped(mapping)) {
3626 				if (likely(!fallback_page_splice))
3627 					flush_dcache_folio(folio);
3628 				else
3629 					flush_dcache_page(page);
3630 			}
3631 			folio_mark_accessed(folio);
3632 			/*
3633 			 * Ok, we have the page, and it's up-to-date, so we can
3634 			 * now splice it into the pipe.
3635 			 */
3636 			n = splice_folio_into_pipe(pipe, folio, *ppos, part);
3637 			folio_put(folio);
3638 			folio = NULL;
3639 		} else {
3640 			n = splice_zeropage_into_pipe(pipe, *ppos, part);
3641 		}
3642 
3643 		if (!n)
3644 			break;
3645 		len -= n;
3646 		total_spliced += n;
3647 		*ppos += n;
3648 		in->f_ra.prev_pos = *ppos;
3649 		if (pipe_is_full(pipe))
3650 			break;
3651 
3652 		cond_resched();
3653 	} while (len);
3654 
3655 	if (folio)
3656 		folio_put(folio);
3657 
3658 	file_accessed(in);
3659 	return total_spliced ? total_spliced : error;
3660 }
3661 
3662 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
3663 {
3664 	struct address_space *mapping = file->f_mapping;
3665 	struct inode *inode = mapping->host;
3666 
3667 	if (whence != SEEK_DATA && whence != SEEK_HOLE)
3668 		return generic_file_llseek_size(file, offset, whence,
3669 					MAX_LFS_FILESIZE, i_size_read(inode));
3670 	if (offset < 0)
3671 		return -ENXIO;
3672 
3673 	inode_lock(inode);
3674 	/* We're holding i_rwsem so we can access i_size directly */
3675 	offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
3676 	if (offset >= 0)
3677 		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
3678 	inode_unlock(inode);
3679 	return offset;
3680 }
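/*
 * Illustrative sketch, not part of this file: the SEEK_DATA/SEEK_HOLE
 * handling above lets userspace walk the allocated extents of a sparse
 * tmpfs file ("fd" being a hypothetical open file):
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);		// first non-hole offset
 *	off_t hole = lseek(fd, data, SEEK_HOLE);	// end of that extent
 *
 * An offset past the end of the file yields ENXIO, as lseek(2) documents.
 */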
3681 
3682 static long shmem_fallocate(struct file *file, int mode, loff_t offset,
3683 							 loff_t len)
3684 {
3685 	struct inode *inode = file_inode(file);
3686 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
3687 	struct shmem_inode_info *info = SHMEM_I(inode);
3688 	struct shmem_falloc shmem_falloc;
3689 	pgoff_t start, index, end, undo_fallocend;
3690 	int error;
3691 
3692 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3693 		return -EOPNOTSUPP;
3694 
3695 	inode_lock(inode);
3696 
3697 	if (mode & FALLOC_FL_PUNCH_HOLE) {
3698 		struct address_space *mapping = file->f_mapping;
3699 		loff_t unmap_start = round_up(offset, PAGE_SIZE);
3700 		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
3701 		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
3702 
3703 		/* protected by i_rwsem */
3704 		if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
3705 			error = -EPERM;
3706 			goto out;
3707 		}
3708 
3709 		shmem_falloc.waitq = &shmem_falloc_waitq;
3710 		shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
3711 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
3712 		spin_lock(&inode->i_lock);
3713 		inode->i_private = &shmem_falloc;
3714 		spin_unlock(&inode->i_lock);
3715 
3716 		if ((u64)unmap_end > (u64)unmap_start)
3717 			unmap_mapping_range(mapping, unmap_start,
3718 					    1 + unmap_end - unmap_start, 0);
3719 		shmem_truncate_range(inode, offset, offset + len - 1);
3720 		/* No need to unmap again: hole-punching leaves COWed pages */
3721 
3722 		spin_lock(&inode->i_lock);
3723 		inode->i_private = NULL;
3724 		wake_up_all(&shmem_falloc_waitq);
3725 		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
3726 		spin_unlock(&inode->i_lock);
3727 		error = 0;
3728 		goto out;
3729 	}
3730 
3731 	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
3732 	error = inode_newsize_ok(inode, offset + len);
3733 	if (error)
3734 		goto out;
3735 
3736 	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
3737 		error = -EPERM;
3738 		goto out;
3739 	}
3740 
3741 	start = offset >> PAGE_SHIFT;
3742 	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
3743 	/* Try to avoid a swapstorm if len is impossible to satisfy */
3744 	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
3745 		error = -ENOSPC;
3746 		goto out;
3747 	}
3748 
3749 	shmem_falloc.waitq = NULL;
3750 	shmem_falloc.start = start;
3751 	shmem_falloc.next  = start;
3752 	shmem_falloc.nr_falloced = 0;
3753 	shmem_falloc.nr_unswapped = 0;
3754 	spin_lock(&inode->i_lock);
3755 	inode->i_private = &shmem_falloc;
3756 	spin_unlock(&inode->i_lock);
3757 
3758 	/*
3759 	 * info->fallocend is only relevant when huge pages might be
3760 	 * involved: to prevent split_huge_page() freeing fallocated
3761 	 * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
3762 	 */
3763 	undo_fallocend = info->fallocend;
3764 	if (info->fallocend < end)
3765 		info->fallocend = end;
3766 
3767 	for (index = start; index < end; ) {
3768 		struct folio *folio;
3769 
3770 		/*
3771 		 * Check for fatal signal so that we abort early in OOM
3772 		 * situations. We don't want to abort in case of non-fatal
3773 		 * signals as large fallocate can take noticeable time and
3774 		 * e.g. periodic timers may result in fallocate constantly
3775 		 * restarting.
3776 		 */
3777 		if (fatal_signal_pending(current))
3778 			error = -EINTR;
3779 		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
3780 			error = -ENOMEM;
3781 		else
3782 			error = shmem_get_folio(inode, index, offset + len,
3783 						&folio, SGP_FALLOC);
3784 		if (error) {
3785 			info->fallocend = undo_fallocend;
3786 			/* Remove the !uptodate folios we added */
3787 			if (index > start) {
3788 				shmem_undo_range(inode,
3789 				    (loff_t)start << PAGE_SHIFT,
3790 				    ((loff_t)index << PAGE_SHIFT) - 1, true);
3791 			}
3792 			goto undone;
3793 		}
3794 
3795 		/*
3796 		 * Here is a more important optimization than it appears:
3797 		 * a second SGP_FALLOC on the same large folio will clear it,
3798 		 * making it uptodate and un-undoable if we fail later.
3799 		 */
3800 		index = folio_next_index(folio);
3801 		/* Beware 32-bit wraparound */
3802 		if (!index)
3803 			index--;
3804 
3805 		/*
3806 		 * Inform shmem_writeout() how far we have reached.
3807 		 * No need for lock or barrier: we have the page lock.
3808 		 */
3809 		if (!folio_test_uptodate(folio))
3810 			shmem_falloc.nr_falloced += index - shmem_falloc.next;
3811 		shmem_falloc.next = index;
3812 
3813 		/*
3814 		 * If !uptodate, leave it that way so that freeable folios
3815 		 * can be recognized if we need to rollback on error later.
3816 		 * But mark it dirty so that memory pressure will swap rather
3817 		 * than free the folios we are allocating (and SGP_CACHE folios
3818 		 * might still be clean: we now need to mark those dirty too).
3819 		 */
3820 		folio_mark_dirty(folio);
3821 		folio_unlock(folio);
3822 		folio_put(folio);
3823 		cond_resched();
3824 	}
3825 
3826 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
3827 		i_size_write(inode, offset + len);
3828 undone:
3829 	spin_lock(&inode->i_lock);
3830 	inode->i_private = NULL;
3831 	spin_unlock(&inode->i_lock);
3832 out:
3833 	if (!error)
3834 		file_modified(file);
3835 	inode_unlock(inode);
3836 	return error;
3837 }
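/*
 * Illustrative sketch, not part of this file: the two modes handled above
 * correspond to the usual fallocate(2) calls ("fd" being a hypothetical
 * open tmpfs file):
 *
 *	// preallocate 1MiB of pages without changing i_size
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
 *	// punch a hole over the first 64KiB, releasing its pages and swap
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 64 << 10);
 *
 * Any other mode bits fail with EOPNOTSUPP, as checked at the top.
 */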
3838 
3839 static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
3840 {
3841 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
3842 
3843 	buf->f_type = TMPFS_MAGIC;
3844 	buf->f_bsize = PAGE_SIZE;
3845 	buf->f_namelen = NAME_MAX;
3846 	if (sbinfo->max_blocks) {
3847 		buf->f_blocks = sbinfo->max_blocks;
3848 		buf->f_bavail =
3849 		buf->f_bfree  = sbinfo->max_blocks -
3850 				percpu_counter_sum(&sbinfo->used_blocks);
3851 	}
3852 	if (sbinfo->max_inodes) {
3853 		buf->f_files = sbinfo->max_inodes;
3854 		buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
3855 	}
3856 	/* else leave those fields 0 like simple_statfs */
3857 
3858 	buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
3859 
3860 	return 0;
3861 }
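/*
 * Note: f_bsize is PAGE_SIZE, so the block counts reported here (and shown
 * by df(1)) are in pages; an unlimited instance leaves f_blocks and
 * f_bfree at 0, just as simple_statfs() would.
 */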
3862 
3863 /*
3864  * File creation. Allocate an inode, and we're done..
3865  */
3866 static int
3867 shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
3868 	    struct dentry *dentry, umode_t mode, dev_t dev)
3869 {
3870 	struct inode *inode;
3871 	int error;
3872 
3873 	if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
3874 		return -EINVAL;
3875 
3876 	inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
3877 	if (IS_ERR(inode))
3878 		return PTR_ERR(inode);
3879 
3880 	error = simple_acl_create(dir, inode);
3881 	if (error)
3882 		goto out_iput;
3883 	error = security_inode_init_security(inode, dir, &dentry->d_name,
3884 					     shmem_initxattrs, NULL);
3885 	if (error && error != -EOPNOTSUPP)
3886 		goto out_iput;
3887 
3888 	error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3889 	if (error)
3890 		goto out_iput;
3891 
3892 	dir->i_size += BOGO_DIRENT_SIZE;
3893 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
3894 	inode_inc_iversion(dir);
3895 
3896 	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
3897 		d_add(dentry, inode);
3898 	else
3899 		d_instantiate(dentry, inode);
3900 
3901 	dget(dentry); /* Extra count - pin the dentry in core */
3902 	return error;
3903 
3904 out_iput:
3905 	iput(inode);
3906 	return error;
3907 }
3908 
3909 static int
3910 shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
3911 	      struct file *file, umode_t mode)
3912 {
3913 	struct inode *inode;
3914 	int error;
3915 
3916 	inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
3917 	if (IS_ERR(inode)) {
3918 		error = PTR_ERR(inode);
3919 		goto err_out;
3920 	}
3921 	error = security_inode_init_security(inode, dir, NULL,
3922 					     shmem_initxattrs, NULL);
3923 	if (error && error != -EOPNOTSUPP)
3924 		goto out_iput;
3925 	error = simple_acl_create(dir, inode);
3926 	if (error)
3927 		goto out_iput;
3928 	d_tmpfile(file, inode);
3929 
3930 err_out:
3931 	return finish_open_simple(file, error);
3932 out_iput:
3933 	iput(inode);
3934 	return error;
3935 }
3936 
3937 static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
3938 				  struct dentry *dentry, umode_t mode)
3939 {
3940 	int error;
3941 
3942 	error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
3943 	if (error)
3944 		return ERR_PTR(error);
3945 	inc_nlink(dir);
3946 	return NULL;
3947 }
3948 
3949 static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
3950 			struct dentry *dentry, umode_t mode, bool excl)
3951 {
3952 	return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
3953 }
3954 
3955 /*
3956  * Link a file..
3957  */
3958 static int shmem_link(struct dentry *old_dentry, struct inode *dir,
3959 		      struct dentry *dentry)
3960 {
3961 	struct inode *inode = d_inode(old_dentry);
3962 	int ret = 0;
3963 
3964 	/*
3965 	 * No ordinary (disk based) filesystem counts links as inodes;
3966 	 * but each new link needs a new dentry, pinning lowmem, and
3967 	 * tmpfs dentries cannot be pruned until they are unlinked.
3968 	 * But if an O_TMPFILE file is linked into the tmpfs, the
3969 	 * first link must skip that, to get the accounting right.
3970 	 */
3971 	if (inode->i_nlink) {
3972 		ret = shmem_reserve_inode(inode->i_sb, NULL);
3973 		if (ret)
3974 			goto out;
3975 	}
3976 
3977 	ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
3978 	if (ret) {
3979 		if (inode->i_nlink)
3980 			shmem_free_inode(inode->i_sb, 0);
3981 		goto out;
3982 	}
3983 
3984 	dir->i_size += BOGO_DIRENT_SIZE;
3985 	inode_set_mtime_to_ts(dir,
3986 			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
3987 	inode_inc_iversion(dir);
3988 	inc_nlink(inode);
3989 	ihold(inode);	/* New dentry reference */
3990 	dget(dentry);	/* Extra pinning count for the created dentry */
3991 	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
3992 		d_add(dentry, inode);
3993 	else
3994 		d_instantiate(dentry, inode);
3995 out:
3996 	return ret;
3997 }
3998 
3999 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
4000 {
4001 	struct inode *inode = d_inode(dentry);
4002 
4003 	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
4004 		shmem_free_inode(inode->i_sb, 0);
4005 
4006 	simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
4007 
4008 	dir->i_size -= BOGO_DIRENT_SIZE;
4009 	inode_set_mtime_to_ts(dir,
4010 			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
4011 	inode_inc_iversion(dir);
4012 	drop_nlink(inode);
4013 	dput(dentry);	/* Undo the count from "create" - does all the work */
4014 
4015 	/*
4016 	 * For now, VFS can't deal with case-insensitive negative dentries, so
4017 	 * we invalidate them
4018 	 */
4019 	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
4020 		d_invalidate(dentry);
4021 
4022 	return 0;
4023 }
4024 
4025 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
4026 {
4027 	if (!simple_empty(dentry))
4028 		return -ENOTEMPTY;
4029 
4030 	drop_nlink(d_inode(dentry));
4031 	drop_nlink(dir);
4032 	return shmem_unlink(dir, dentry);
4033 }
4034 
4035 static int shmem_whiteout(struct mnt_idmap *idmap,
4036 			  struct inode *old_dir, struct dentry *old_dentry)
4037 {
4038 	struct dentry *whiteout;
4039 	int error;
4040 
4041 	whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
4042 	if (!whiteout)
4043 		return -ENOMEM;
4044 
4045 	error = shmem_mknod(idmap, old_dir, whiteout,
4046 			    S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
4047 	dput(whiteout);
4048 	if (error)
4049 		return error;
4050 
4051 	/*
4052 	 * Cheat and hash the whiteout while the old dentry is still in
4053 	 * place, instead of playing games with FS_RENAME_DOES_D_MOVE.
4054 	 *
4055 	 * d_lookup() will consistently find one of them at this point,
4056 	 * not sure which one, but that isn't even important.
4057 	 */
4058 	d_rehash(whiteout);
4059 	return 0;
4060 }
4061 
4062 /*
4063  * The VFS layer already does all the dentry stuff for rename,
4064  * we just have to decrement the usage count for the target if
4065  * it exists so that the VFS layer correctly frees it when it
4066  * gets overwritten.
4067  */
4068 static int shmem_rename2(struct mnt_idmap *idmap,
4069 			 struct inode *old_dir, struct dentry *old_dentry,
4070 			 struct inode *new_dir, struct dentry *new_dentry,
4071 			 unsigned int flags)
4072 {
4073 	struct inode *inode = d_inode(old_dentry);
4074 	int they_are_dirs = S_ISDIR(inode->i_mode);
4075 	int error;
4076 
4077 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
4078 		return -EINVAL;
4079 
4080 	if (flags & RENAME_EXCHANGE)
4081 		return simple_offset_rename_exchange(old_dir, old_dentry,
4082 						     new_dir, new_dentry);
4083 
4084 	if (!simple_empty(new_dentry))
4085 		return -ENOTEMPTY;
4086 
4087 	if (flags & RENAME_WHITEOUT) {
4088 		error = shmem_whiteout(idmap, old_dir, old_dentry);
4089 		if (error)
4090 			return error;
4091 	}
4092 
4093 	error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
4094 	if (error)
4095 		return error;
4096 
4097 	if (d_really_is_positive(new_dentry)) {
4098 		(void) shmem_unlink(new_dir, new_dentry);
4099 		if (they_are_dirs) {
4100 			drop_nlink(d_inode(new_dentry));
4101 			drop_nlink(old_dir);
4102 		}
4103 	} else if (they_are_dirs) {
4104 		drop_nlink(old_dir);
4105 		inc_nlink(new_dir);
4106 	}
4107 
4108 	old_dir->i_size -= BOGO_DIRENT_SIZE;
4109 	new_dir->i_size += BOGO_DIRENT_SIZE;
4110 	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
4111 	inode_inc_iversion(old_dir);
4112 	inode_inc_iversion(new_dir);
4113 	return 0;
4114 }
4115 
4116 static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
4117 			 struct dentry *dentry, const char *symname)
4118 {
4119 	int error;
4120 	int len;
4121 	struct inode *inode;
4122 	struct folio *folio;
4123 	char *link;
4124 
4125 	len = strlen(symname) + 1;
4126 	if (len > PAGE_SIZE)
4127 		return -ENAMETOOLONG;
4128 
4129 	inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
4130 				VM_NORESERVE);
4131 	if (IS_ERR(inode))
4132 		return PTR_ERR(inode);
4133 
4134 	error = security_inode_init_security(inode, dir, &dentry->d_name,
4135 					     shmem_initxattrs, NULL);
4136 	if (error && error != -EOPNOTSUPP)
4137 		goto out_iput;
4138 
4139 	error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
4140 	if (error)
4141 		goto out_iput;
4142 
4143 	inode->i_size = len-1;
4144 	if (len <= SHORT_SYMLINK_LEN) {
4145 		link = kmemdup(symname, len, GFP_KERNEL);
4146 		if (!link) {
4147 			error = -ENOMEM;
4148 			goto out_remove_offset;
4149 		}
4150 		inode->i_op = &shmem_short_symlink_operations;
4151 		inode_set_cached_link(inode, link, len - 1);
4152 	} else {
4153 		inode_nohighmem(inode);
4154 		inode->i_mapping->a_ops = &shmem_aops;
4155 		error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
4156 		if (error)
4157 			goto out_remove_offset;
4158 		inode->i_op = &shmem_symlink_inode_operations;
4159 		memcpy(folio_address(folio), symname, len);
4160 		folio_mark_uptodate(folio);
4161 		folio_mark_dirty(folio);
4162 		folio_unlock(folio);
4163 		folio_put(folio);
4164 	}
4165 	dir->i_size += BOGO_DIRENT_SIZE;
4166 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
4167 	inode_inc_iversion(dir);
4168 	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
4169 		d_add(dentry, inode);
4170 	else
4171 		d_instantiate(dentry, inode);
4172 	dget(dentry);
4173 	return 0;
4174 
4175 out_remove_offset:
4176 	simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
4177 out_iput:
4178 	iput(inode);
4179 	return error;
4180 }
4181 
4182 static void shmem_put_link(void *arg)
4183 {
4184 	folio_mark_accessed(arg);
4185 	folio_put(arg);
4186 }
4187 
4188 static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
4189 				  struct delayed_call *done)
4190 {
4191 	struct folio *folio = NULL;
4192 	int error;
4193 
4194 	if (!dentry) {
4195 		folio = filemap_get_folio(inode->i_mapping, 0);
4196 		if (IS_ERR(folio))
4197 			return ERR_PTR(-ECHILD);
4198 		if (PageHWPoison(folio_page(folio, 0)) ||
4199 		    !folio_test_uptodate(folio)) {
4200 			folio_put(folio);
4201 			return ERR_PTR(-ECHILD);
4202 		}
4203 	} else {
4204 		error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
4205 		if (error)
4206 			return ERR_PTR(error);
4207 		if (!folio)
4208 			return ERR_PTR(-ECHILD);
4209 		if (PageHWPoison(folio_page(folio, 0))) {
4210 			folio_unlock(folio);
4211 			folio_put(folio);
4212 			return ERR_PTR(-ECHILD);
4213 		}
4214 		folio_unlock(folio);
4215 	}
4216 	set_delayed_call(done, shmem_put_link, folio);
4217 	return folio_address(folio);
4218 }
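/*
 * Note on the !dentry case above: a NULL dentry means we were called in
 * RCU-walk mode and must not sleep, so only an already-cached, uptodate
 * folio is acceptable; returning -ECHILD makes the VFS retry the lookup
 * in ref-walk mode.
 */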
4219 
4220 #ifdef CONFIG_TMPFS_XATTR
4221 
4222 static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
4223 {
4224 	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
4225 
4226 	fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
4227 
4228 	return 0;
4229 }
4230 
4231 static int shmem_fileattr_set(struct mnt_idmap *idmap,
4232 			      struct dentry *dentry, struct file_kattr *fa)
4233 {
4234 	struct inode *inode = d_inode(dentry);
4235 	struct shmem_inode_info *info = SHMEM_I(inode);
4236 	int ret, flags;
4237 
4238 	if (fileattr_has_fsx(fa))
4239 		return -EOPNOTSUPP;
4240 	if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
4241 		return -EOPNOTSUPP;
4242 
4243 	flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
4244 		(fa->flags & SHMEM_FL_USER_MODIFIABLE);
4245 
4246 	ret = shmem_set_inode_flags(inode, flags, dentry);
4247 
4248 	if (ret)
4249 		return ret;
4250 
4251 	info->fsflags = flags;
4252 
4253 	inode_set_ctime_current(inode);
4254 	inode_inc_iversion(inode);
4255 	return 0;
4256 }
4257 
4258 /*
4259  * Superblocks without xattr inode operations may get some security.* xattr
4260  * support from the LSM "for free". As soon as we have any other xattrs
4261  * like ACLs, we also need to implement the security.* handlers at
4262  * filesystem level, though.
4263  */
4264 
4265 /*
4266  * Callback for security_inode_init_security() for acquiring xattrs.
4267  */
4268 static int shmem_initxattrs(struct inode *inode,
4269 			    const struct xattr *xattr_array, void *fs_info)
4270 {
4271 	struct shmem_inode_info *info = SHMEM_I(inode);
4272 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
4273 	const struct xattr *xattr;
4274 	struct simple_xattr *new_xattr;
4275 	size_t ispace = 0;
4276 	size_t len;
4277 
4278 	if (sbinfo->max_inodes) {
4279 		for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4280 			ispace += simple_xattr_space(xattr->name,
4281 				xattr->value_len + XATTR_SECURITY_PREFIX_LEN);
4282 		}
4283 		if (ispace) {
4284 			raw_spin_lock(&sbinfo->stat_lock);
4285 			if (sbinfo->free_ispace < ispace)
4286 				ispace = 0;
4287 			else
4288 				sbinfo->free_ispace -= ispace;
4289 			raw_spin_unlock(&sbinfo->stat_lock);
4290 			if (!ispace)
4291 				return -ENOSPC;
4292 		}
4293 	}
4294 
4295 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
4296 		new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
4297 		if (!new_xattr)
4298 			break;
4299 
4300 		len = strlen(xattr->name) + 1;
4301 		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
4302 					  GFP_KERNEL_ACCOUNT);
4303 		if (!new_xattr->name) {
4304 			kvfree(new_xattr);
4305 			break;
4306 		}
4307 
4308 		memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
4309 		       XATTR_SECURITY_PREFIX_LEN);
4310 		memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
4311 		       xattr->name, len);
4312 
4313 		simple_xattr_add(&info->xattrs, new_xattr);
4314 	}
4315 
4316 	if (xattr->name != NULL) {
4317 		if (ispace) {
4318 			raw_spin_lock(&sbinfo->stat_lock);
4319 			sbinfo->free_ispace += ispace;
4320 			raw_spin_unlock(&sbinfo->stat_lock);
4321 		}
4322 		simple_xattrs_free(&info->xattrs, NULL);
4323 		return -ENOMEM;
4324 	}
4325 
4326 	return 0;
4327 }
4328 
4329 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
4330 				   struct dentry *unused, struct inode *inode,
4331 				   const char *name, void *buffer, size_t size)
4332 {
4333 	struct shmem_inode_info *info = SHMEM_I(inode);
4334 
4335 	name = xattr_full_name(handler, name);
4336 	return simple_xattr_get(&info->xattrs, name, buffer, size);
4337 }
4338 
4339 static int shmem_xattr_handler_set(const struct xattr_handler *handler,
4340 				   struct mnt_idmap *idmap,
4341 				   struct dentry *unused, struct inode *inode,
4342 				   const char *name, const void *value,
4343 				   size_t size, int flags)
4344 {
4345 	struct shmem_inode_info *info = SHMEM_I(inode);
4346 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
4347 	struct simple_xattr *old_xattr;
4348 	size_t ispace = 0;
4349 
4350 	name = xattr_full_name(handler, name);
4351 	if (value && sbinfo->max_inodes) {
4352 		ispace = simple_xattr_space(name, size);
4353 		raw_spin_lock(&sbinfo->stat_lock);
4354 		if (sbinfo->free_ispace < ispace)
4355 			ispace = 0;
4356 		else
4357 			sbinfo->free_ispace -= ispace;
4358 		raw_spin_unlock(&sbinfo->stat_lock);
4359 		if (!ispace)
4360 			return -ENOSPC;
4361 	}
4362 
4363 	old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags);
4364 	if (!IS_ERR(old_xattr)) {
4365 		ispace = 0;
4366 		if (old_xattr && sbinfo->max_inodes)
4367 			ispace = simple_xattr_space(old_xattr->name,
4368 						    old_xattr->size);
4369 		simple_xattr_free(old_xattr);
4370 		old_xattr = NULL;
4371 		inode_set_ctime_current(inode);
4372 		inode_inc_iversion(inode);
4373 	}
4374 	if (ispace) {
4375 		raw_spin_lock(&sbinfo->stat_lock);
4376 		sbinfo->free_ispace += ispace;
4377 		raw_spin_unlock(&sbinfo->stat_lock);
4378 	}
4379 	return PTR_ERR(old_xattr);
4380 }
4381 
4382 static const struct xattr_handler shmem_security_xattr_handler = {
4383 	.prefix = XATTR_SECURITY_PREFIX,
4384 	.get = shmem_xattr_handler_get,
4385 	.set = shmem_xattr_handler_set,
4386 };
4387 
4388 static const struct xattr_handler shmem_trusted_xattr_handler = {
4389 	.prefix = XATTR_TRUSTED_PREFIX,
4390 	.get = shmem_xattr_handler_get,
4391 	.set = shmem_xattr_handler_set,
4392 };
4393 
4394 static const struct xattr_handler shmem_user_xattr_handler = {
4395 	.prefix = XATTR_USER_PREFIX,
4396 	.get = shmem_xattr_handler_get,
4397 	.set = shmem_xattr_handler_set,
4398 };
4399 
4400 static const struct xattr_handler * const shmem_xattr_handlers[] = {
4401 	&shmem_security_xattr_handler,
4402 	&shmem_trusted_xattr_handler,
4403 	&shmem_user_xattr_handler,
4404 	NULL
4405 };
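/*
 * Illustrative sketch, not part of this file: with the handlers above,
 * security.*, trusted.* and user.* attributes work through the normal
 * xattr syscalls ("path" being a hypothetical tmpfs file):
 *
 *	setxattr(path, "user.origin", "builder", 7, 0);
 *	char buf[16];
 *	ssize_t n = getxattr(path, "user.origin", buf, sizeof(buf));
 *
 * Values are kept in info->xattrs and, when max_inodes is set, charged
 * against the superblock's inode space (see shmem_xattr_handler_set()).
 */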
4406 
4407 static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
4408 {
4409 	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
4410 	return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
4411 }
4412 #endif /* CONFIG_TMPFS_XATTR */
4413 
4414 static const struct inode_operations shmem_short_symlink_operations = {
4415 	.getattr	= shmem_getattr,
4416 	.setattr	= shmem_setattr,
4417 	.get_link	= simple_get_link,
4418 #ifdef CONFIG_TMPFS_XATTR
4419 	.listxattr	= shmem_listxattr,
4420 #endif
4421 };
4422 
4423 static const struct inode_operations shmem_symlink_inode_operations = {
4424 	.getattr	= shmem_getattr,
4425 	.setattr	= shmem_setattr,
4426 	.get_link	= shmem_get_link,
4427 #ifdef CONFIG_TMPFS_XATTR
4428 	.listxattr	= shmem_listxattr,
4429 #endif
4430 };
4431 
4432 static struct dentry *shmem_get_parent(struct dentry *child)
4433 {
4434 	return ERR_PTR(-ESTALE);
4435 }
4436 
4437 static int shmem_match(struct inode *ino, void *vfh)
4438 {
4439 	__u32 *fh = vfh;
4440 	__u64 inum = fh[2];
4441 	inum = (inum << 32) | fh[1];
4442 	return ino->i_ino == inum && fh[0] == ino->i_generation;
4443 }
4444 
4445 /* Find any alias of inode, but prefer a hashed alias */
4446 static struct dentry *shmem_find_alias(struct inode *inode)
4447 {
4448 	struct dentry *alias = d_find_alias(inode);
4449 
4450 	return alias ?: d_find_any_alias(inode);
4451 }
4452 
4453 static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
4454 		struct fid *fid, int fh_len, int fh_type)
4455 {
4456 	struct inode *inode;
4457 	struct dentry *dentry = NULL;
4458 	u64 inum;
4459 
4460 	if (fh_len < 3)
4461 		return NULL;
4462 
4463 	inum = fid->raw[2];
4464 	inum = (inum << 32) | fid->raw[1];
4465 
4466 	inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
4467 			shmem_match, fid->raw);
4468 	if (inode) {
4469 		dentry = shmem_find_alias(inode);
4470 		iput(inode);
4471 	}
4472 
4473 	return dentry;
4474 }
4475 
4476 static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
4477 				struct inode *parent)
4478 {
4479 	if (*len < 3) {
4480 		*len = 3;
4481 		return FILEID_INVALID;
4482 	}
4483 
4484 	if (inode_unhashed(inode)) {
4485 		/* Unfortunately insert_inode_hash is not idempotent,
4486 		 * so as we hash inodes here rather than at creation
4487 		 * time, we need a lock to ensure we only try
4488 		 * to do it once
4489 		 */
4490 		static DEFINE_SPINLOCK(lock);
4491 		spin_lock(&lock);
4492 		if (inode_unhashed(inode))
4493 			__insert_inode_hash(inode,
4494 					    inode->i_ino + inode->i_generation);
4495 		spin_unlock(&lock);
4496 	}
4497 
4498 	fh[0] = inode->i_generation;
4499 	fh[1] = inode->i_ino;
4500 	fh[2] = ((__u64)inode->i_ino) >> 32;
4501 
4502 	*len = 3;
4503 	return 1;
4504 }
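/*
 * The handle produced above is three 32-bit words: fh[0] is i_generation,
 * and fh[1]/fh[2] are the low/high halves of the inode number, which is
 * exactly what shmem_match() and shmem_fh_to_dentry() reassemble.
 */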
4505 
4506 static const struct export_operations shmem_export_ops = {
4507 	.get_parent     = shmem_get_parent,
4508 	.encode_fh      = shmem_encode_fh,
4509 	.fh_to_dentry	= shmem_fh_to_dentry,
4510 };
4511 
4512 enum shmem_param {
4513 	Opt_gid,
4514 	Opt_huge,
4515 	Opt_mode,
4516 	Opt_mpol,
4517 	Opt_nr_blocks,
4518 	Opt_nr_inodes,
4519 	Opt_size,
4520 	Opt_uid,
4521 	Opt_inode32,
4522 	Opt_inode64,
4523 	Opt_noswap,
4524 	Opt_quota,
4525 	Opt_usrquota,
4526 	Opt_grpquota,
4527 	Opt_usrquota_block_hardlimit,
4528 	Opt_usrquota_inode_hardlimit,
4529 	Opt_grpquota_block_hardlimit,
4530 	Opt_grpquota_inode_hardlimit,
4531 	Opt_casefold_version,
4532 	Opt_casefold,
4533 	Opt_strict_encoding,
4534 };
4535 
4536 static const struct constant_table shmem_param_enums_huge[] = {
4537 	{"never",	SHMEM_HUGE_NEVER },
4538 	{"always",	SHMEM_HUGE_ALWAYS },
4539 	{"within_size",	SHMEM_HUGE_WITHIN_SIZE },
4540 	{"advise",	SHMEM_HUGE_ADVISE },
4541 	{}
4542 };
4543 
4544 const struct fs_parameter_spec shmem_fs_parameters[] = {
4545 	fsparam_gid   ("gid",		Opt_gid),
4546 	fsparam_enum  ("huge",		Opt_huge,  shmem_param_enums_huge),
4547 	fsparam_u32oct("mode",		Opt_mode),
4548 	fsparam_string("mpol",		Opt_mpol),
4549 	fsparam_string("nr_blocks",	Opt_nr_blocks),
4550 	fsparam_string("nr_inodes",	Opt_nr_inodes),
4551 	fsparam_string("size",		Opt_size),
4552 	fsparam_uid   ("uid",		Opt_uid),
4553 	fsparam_flag  ("inode32",	Opt_inode32),
4554 	fsparam_flag  ("inode64",	Opt_inode64),
4555 	fsparam_flag  ("noswap",	Opt_noswap),
4556 #ifdef CONFIG_TMPFS_QUOTA
4557 	fsparam_flag  ("quota",		Opt_quota),
4558 	fsparam_flag  ("usrquota",	Opt_usrquota),
4559 	fsparam_flag  ("grpquota",	Opt_grpquota),
4560 	fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
4561 	fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
4562 	fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
4563 	fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
4564 #endif
4565 	fsparam_string("casefold",	Opt_casefold_version),
4566 	fsparam_flag  ("casefold",	Opt_casefold),
4567 	fsparam_flag  ("strict_encoding", Opt_strict_encoding),
4568 	{}
4569 };
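/*
 * Illustrative sketch, not part of this file: these parameters are what a
 * mount like the following (option values chosen arbitrarily) feeds into
 * shmem_parse_one() below:
 *
 *	mount("tmpfs", "/mnt/scratch", "tmpfs", 0,
 *	      "size=1G,nr_inodes=100k,mode=1777,huge=within_size,noswap");
 */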
4570 
4571 #if IS_ENABLED(CONFIG_UNICODE)
4572 static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
4573 				    bool latest_version)
4574 {
4575 	struct shmem_options *ctx = fc->fs_private;
4576 	int version = UTF8_LATEST;
4577 	struct unicode_map *encoding;
4578 	char *version_str = param->string + 5;
4579 
4580 	if (!latest_version) {
4581 		if (strncmp(param->string, "utf8-", 5))
4582 			return invalfc(fc, "Only UTF-8 encodings are supported "
4583 				       "in the format: utf8-<version number>");
4584 
4585 		version = utf8_parse_version(version_str);
4586 		if (version < 0)
4587 			return invalfc(fc, "Invalid UTF-8 version: %s", version_str);
4588 	}
4589 
4590 	encoding = utf8_load(version);
4591 
4592 	if (IS_ERR(encoding)) {
4593 		return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n",
4594 			       unicode_major(version), unicode_minor(version),
4595 			       unicode_rev(version));
4596 	}
4597 
4598 	pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n",
4599 		unicode_major(version), unicode_minor(version), unicode_rev(version));
4600 
4601 	ctx->encoding = encoding;
4602 
4603 	return 0;
4604 }
4605 #else
4606 static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
4607 				    bool latest_version)
4608 {
4609 	return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
4610 }
4611 #endif
4612 
4613 static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
4614 {
4615 	struct shmem_options *ctx = fc->fs_private;
4616 	struct fs_parse_result result;
4617 	unsigned long long size;
4618 	char *rest;
4619 	int opt;
4620 	kuid_t kuid;
4621 	kgid_t kgid;
4622 
4623 	opt = fs_parse(fc, shmem_fs_parameters, param, &result);
4624 	if (opt < 0)
4625 		return opt;
4626 
4627 	switch (opt) {
4628 	case Opt_size:
4629 		size = memparse(param->string, &rest);
4630 		if (*rest == '%') {
4631 			size <<= PAGE_SHIFT;
4632 			size *= totalram_pages();
4633 			do_div(size, 100);
4634 			rest++;
4635 		}
4636 		if (*rest)
4637 			goto bad_value;
4638 		ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
4639 		ctx->seen |= SHMEM_SEEN_BLOCKS;
4640 		break;
4641 	case Opt_nr_blocks:
4642 		ctx->blocks = memparse(param->string, &rest);
4643 		if (*rest || ctx->blocks > LONG_MAX)
4644 			goto bad_value;
4645 		ctx->seen |= SHMEM_SEEN_BLOCKS;
4646 		break;
4647 	case Opt_nr_inodes:
4648 		ctx->inodes = memparse(param->string, &rest);
4649 		if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
4650 			goto bad_value;
4651 		ctx->seen |= SHMEM_SEEN_INODES;
4652 		break;
4653 	case Opt_mode:
4654 		ctx->mode = result.uint_32 & 07777;
4655 		break;
4656 	case Opt_uid:
4657 		kuid = result.uid;
4658 
4659 		/*
4660 		 * The requested uid must be representable in the
4661 		 * filesystem's idmapping.
4662 		 */
4663 		if (!kuid_has_mapping(fc->user_ns, kuid))
4664 			goto bad_value;
4665 
4666 		ctx->uid = kuid;
4667 		break;
4668 	case Opt_gid:
4669 		kgid = result.gid;
4670 
4671 		/*
4672 		 * The requested gid must be representable in the
4673 		 * filesystem's idmapping.
4674 		 */
4675 		if (!kgid_has_mapping(fc->user_ns, kgid))
4676 			goto bad_value;
4677 
4678 		ctx->gid = kgid;
4679 		break;
4680 	case Opt_huge:
4681 		ctx->huge = result.uint_32;
4682 		if (ctx->huge != SHMEM_HUGE_NEVER &&
4683 		    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
4684 		      has_transparent_hugepage()))
4685 			goto unsupported_parameter;
4686 		ctx->seen |= SHMEM_SEEN_HUGE;
4687 		break;
4688 	case Opt_mpol:
4689 		if (IS_ENABLED(CONFIG_NUMA)) {
4690 			mpol_put(ctx->mpol);
4691 			ctx->mpol = NULL;
4692 			if (mpol_parse_str(param->string, &ctx->mpol))
4693 				goto bad_value;
4694 			break;
4695 		}
4696 		goto unsupported_parameter;
4697 	case Opt_inode32:
4698 		ctx->full_inums = false;
4699 		ctx->seen |= SHMEM_SEEN_INUMS;
4700 		break;
4701 	case Opt_inode64:
4702 		if (sizeof(ino_t) < 8) {
4703 			return invalfc(fc,
4704 				       "Cannot use inode64 with <64bit inums in kernel\n");
4705 		}
4706 		ctx->full_inums = true;
4707 		ctx->seen |= SHMEM_SEEN_INUMS;
4708 		break;
4709 	case Opt_noswap:
4710 		if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
4711 			return invalfc(fc,
4712 				       "Turning off swap in unprivileged tmpfs mounts unsupported");
4713 		}
4714 		ctx->noswap = true;
4715 		ctx->seen |= SHMEM_SEEN_NOSWAP;
4716 		break;
4717 	case Opt_quota:
4718 		if (fc->user_ns != &init_user_ns)
4719 			return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4720 		ctx->seen |= SHMEM_SEEN_QUOTA;
4721 		ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
4722 		break;
4723 	case Opt_usrquota:
4724 		if (fc->user_ns != &init_user_ns)
4725 			return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4726 		ctx->seen |= SHMEM_SEEN_QUOTA;
4727 		ctx->quota_types |= QTYPE_MASK_USR;
4728 		break;
4729 	case Opt_grpquota:
4730 		if (fc->user_ns != &init_user_ns)
4731 			return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
4732 		ctx->seen |= SHMEM_SEEN_QUOTA;
4733 		ctx->quota_types |= QTYPE_MASK_GRP;
4734 		break;
4735 	case Opt_usrquota_block_hardlimit:
4736 		size = memparse(param->string, &rest);
4737 		if (*rest || !size)
4738 			goto bad_value;
4739 		if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
4740 			return invalfc(fc,
4741 				       "User quota block hardlimit too large.");
4742 		ctx->qlimits.usrquota_bhardlimit = size;
4743 		break;
4744 	case Opt_grpquota_block_hardlimit:
4745 		size = memparse(param->string, &rest);
4746 		if (*rest || !size)
4747 			goto bad_value;
4748 		if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
4749 			return invalfc(fc,
4750 				       "Group quota block hardlimit too large.");
4751 		ctx->qlimits.grpquota_bhardlimit = size;
4752 		break;
4753 	case Opt_usrquota_inode_hardlimit:
4754 		size = memparse(param->string, &rest);
4755 		if (*rest || !size)
4756 			goto bad_value;
4757 		if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
4758 			return invalfc(fc,
4759 				       "User quota inode hardlimit too large.");
4760 		ctx->qlimits.usrquota_ihardlimit = size;
4761 		break;
4762 	case Opt_grpquota_inode_hardlimit:
4763 		size = memparse(param->string, &rest);
4764 		if (*rest || !size)
4765 			goto bad_value;
4766 		if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
4767 			return invalfc(fc,
4768 				       "Group quota inode hardlimit too large.");
4769 		ctx->qlimits.grpquota_ihardlimit = size;
4770 		break;
4771 	case Opt_casefold_version:
4772 		return shmem_parse_opt_casefold(fc, param, false);
4773 	case Opt_casefold:
4774 		return shmem_parse_opt_casefold(fc, param, true);
4775 	case Opt_strict_encoding:
4776 #if IS_ENABLED(CONFIG_UNICODE)
4777 		ctx->strict_encoding = true;
4778 		break;
4779 #else
4780 		return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
4781 #endif
4782 	}
4783 	return 0;
4784 
4785 unsupported_parameter:
4786 	return invalfc(fc, "Unsupported parameter '%s'", param->key);
4787 bad_value:
4788 	return invalfc(fc, "Bad value for '%s'", param->key);
4789 }
4790 
4791 static char *shmem_next_opt(char **s)
4792 {
4793 	char *sbegin = *s;
4794 	char *p;
4795 
4796 	if (sbegin == NULL)
4797 		return NULL;
4798 
4799 	/*
4800 	 * NUL-terminate this option: unfortunately,
4801 	 * mount options form a comma-separated list,
4802 	 * but mpol's nodelist may also contain commas.
4803 	 */
4804 	for (;;) {
4805 		p = strchr(*s, ',');
4806 		if (p == NULL)
4807 			break;
4808 		*s = p + 1;
4809 		if (!isdigit(*(p+1))) {
4810 			*p = '\0';
4811 			return sbegin;
4812 		}
4813 	}
4814 
4815 	*s = NULL;
4816 	return sbegin;
4817 }
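/*
 * Illustrative example, not part of this file: for the monolithic option
 * string "mpol=bind:0,2,size=1G", the comma after "0" is followed by a
 * digit and so is left inside the mpol nodelist, while the comma before
 * "size" is followed by a letter; the first call therefore returns
 * "mpol=bind:0,2" and the second returns "size=1G".
 */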
4818 
4819 static int shmem_parse_monolithic(struct fs_context *fc, void *data)
4820 {
4821 	return vfs_parse_monolithic_sep(fc, data, shmem_next_opt);
4822 }
4823 
4824 /*
4825  * Reconfigure a shmem filesystem.
4826  */
4827 static int shmem_reconfigure(struct fs_context *fc)
4828 {
4829 	struct shmem_options *ctx = fc->fs_private;
4830 	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
4831 	unsigned long used_isp;
4832 	struct mempolicy *mpol = NULL;
4833 	const char *err;
4834 
4835 	raw_spin_lock(&sbinfo->stat_lock);
4836 	used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;
4837 
4838 	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
4839 		if (!sbinfo->max_blocks) {
4840 			err = "Cannot retroactively limit size";
4841 			goto out;
4842 		}
4843 		if (percpu_counter_compare(&sbinfo->used_blocks,
4844 					   ctx->blocks) > 0) {
4845 			err = "Too small a size for current use";
4846 			goto out;
4847 		}
4848 	}
4849 	if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
4850 		if (!sbinfo->max_inodes) {
4851 			err = "Cannot retroactively limit inodes";
4852 			goto out;
4853 		}
4854 		if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
4855 			err = "Too few inodes for current use";
4856 			goto out;
4857 		}
4858 	}
4859 
4860 	if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
4861 	    sbinfo->next_ino > UINT_MAX) {
4862 		err = "Current inum too high to switch to 32-bit inums";
4863 		goto out;
4864 	}
4865 	if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
4866 		err = "Cannot disable swap on remount";
4867 		goto out;
4868 	}
4869 	if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
4870 		err = "Cannot enable swap on remount if it was disabled on first mount";
4871 		goto out;
4872 	}
4873 
4874 	if (ctx->seen & SHMEM_SEEN_QUOTA &&
4875 	    !sb_any_quota_loaded(fc->root->d_sb)) {
4876 		err = "Cannot enable quota on remount";
4877 		goto out;
4878 	}
4879 
4880 #ifdef CONFIG_TMPFS_QUOTA
4881 #define CHANGED_LIMIT(name)						\
4882 	(ctx->qlimits.name## hardlimit &&				\
4883 	(ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))
4884 
4885 	if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
4886 	    CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
4887 		err = "Cannot change global quota limit on remount";
4888 		goto out;
4889 	}
4890 #endif /* CONFIG_TMPFS_QUOTA */
4891 
4892 	if (ctx->seen & SHMEM_SEEN_HUGE)
4893 		sbinfo->huge = ctx->huge;
4894 	if (ctx->seen & SHMEM_SEEN_INUMS)
4895 		sbinfo->full_inums = ctx->full_inums;
4896 	if (ctx->seen & SHMEM_SEEN_BLOCKS)
4897 		sbinfo->max_blocks  = ctx->blocks;
4898 	if (ctx->seen & SHMEM_SEEN_INODES) {
4899 		sbinfo->max_inodes  = ctx->inodes;
4900 		sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
4901 	}
4902 
4903 	/*
4904 	 * Preserve previous mempolicy unless mpol remount option was specified.
4905 	 */
4906 	if (ctx->mpol) {
4907 		mpol = sbinfo->mpol;
4908 		sbinfo->mpol = ctx->mpol;	/* transfers initial ref */
4909 		ctx->mpol = NULL;
4910 	}
4911 
4912 	if (ctx->noswap)
4913 		sbinfo->noswap = true;
4914 
4915 	raw_spin_unlock(&sbinfo->stat_lock);
4916 	mpol_put(mpol);
4917 	return 0;
4918 out:
4919 	raw_spin_unlock(&sbinfo->stat_lock);
4920 	return invalfc(fc, "%s", err);
4921 }
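/*
 * Illustrative sketch, not part of this file: a resize-only remount such as
 *
 *	mount(NULL, "/mnt/scratch", NULL, MS_REMOUNT, "size=2G");
 *
 * passes the checks above, while shrinking below current usage, imposing
 * a limit on a previously unlimited instance, or changing the noswap
 * setting is refused and reported via invalfc() as EINVAL.
 */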
4922 
4923 static int shmem_show_options(struct seq_file *seq, struct dentry *root)
4924 {
4925 	struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
4926 	struct mempolicy *mpol;
4927 
4928 	if (sbinfo->max_blocks != shmem_default_max_blocks())
4929 		seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
4930 	if (sbinfo->max_inodes != shmem_default_max_inodes())
4931 		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
4932 	if (sbinfo->mode != (0777 | S_ISVTX))
4933 		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
4934 	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
4935 		seq_printf(seq, ",uid=%u",
4936 				from_kuid_munged(&init_user_ns, sbinfo->uid));
4937 	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
4938 		seq_printf(seq, ",gid=%u",
4939 				from_kgid_munged(&init_user_ns, sbinfo->gid));
4940 
4941 	/*
4942 	 * Showing inode{64,32} might be useful even if it's the system default,
4943 	 * since then people don't have to resort to checking both here and
4944 	 * /proc/config.gz to confirm 64-bit inums were successfully applied
4945 	 * (which may not even exist if IKCONFIG_PROC isn't enabled).
4946 	 *
4947 	 * We hide it when inode64 isn't the default and we are using 32-bit
4948 	 * inodes, since that probably just means the feature isn't even under
4949 	 * consideration.
4950 	 *
4951 	 * As such:
4952 	 *
4953 	 *                     +-----------------+-----------------+
4954 	 *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
4955 	 *  +------------------+-----------------+-----------------+
4956 	 *  | full_inums=true  | show            | show            |
4957 	 *  | full_inums=false | show            | hide            |
4958 	 *  +------------------+-----------------+-----------------+
4959 	 *
4960 	 */
4961 	if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
4962 		seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
4963 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4964 	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
4965 	if (sbinfo->huge)
4966 		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
4967 #endif
4968 	mpol = shmem_get_sbmpol(sbinfo);
4969 	shmem_show_mpol(seq, mpol);
4970 	mpol_put(mpol);
4971 	if (sbinfo->noswap)
4972 		seq_printf(seq, ",noswap");
4973 #ifdef CONFIG_TMPFS_QUOTA
4974 	if (sb_has_quota_active(root->d_sb, USRQUOTA))
4975 		seq_printf(seq, ",usrquota");
4976 	if (sb_has_quota_active(root->d_sb, GRPQUOTA))
4977 		seq_printf(seq, ",grpquota");
4978 	if (sbinfo->qlimits.usrquota_bhardlimit)
4979 		seq_printf(seq, ",usrquota_block_hardlimit=%lld",
4980 			   sbinfo->qlimits.usrquota_bhardlimit);
4981 	if (sbinfo->qlimits.grpquota_bhardlimit)
4982 		seq_printf(seq, ",grpquota_block_hardlimit=%lld",
4983 			   sbinfo->qlimits.grpquota_bhardlimit);
4984 	if (sbinfo->qlimits.usrquota_ihardlimit)
4985 		seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
4986 			   sbinfo->qlimits.usrquota_ihardlimit);
4987 	if (sbinfo->qlimits.grpquota_ihardlimit)
4988 		seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
4989 			   sbinfo->qlimits.grpquota_ihardlimit);
4990 #endif
4991 	return 0;
4992 }
4993 
4994 #endif /* CONFIG_TMPFS */
4995 
4996 static void shmem_put_super(struct super_block *sb)
4997 {
4998 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
4999 
5000 #if IS_ENABLED(CONFIG_UNICODE)
5001 	if (sb->s_encoding)
5002 		utf8_unload(sb->s_encoding);
5003 #endif
5004 
5005 #ifdef CONFIG_TMPFS_QUOTA
5006 	shmem_disable_quotas(sb);
5007 #endif
5008 	free_percpu(sbinfo->ino_batch);
5009 	percpu_counter_destroy(&sbinfo->used_blocks);
5010 	mpol_put(sbinfo->mpol);
5011 	kfree(sbinfo);
5012 	sb->s_fs_info = NULL;
5013 }
5014 
5015 #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS)
5016 static const struct dentry_operations shmem_ci_dentry_ops = {
5017 	.d_hash = generic_ci_d_hash,
5018 	.d_compare = generic_ci_d_compare,
5019 };
5020 #endif
5021 
5022 static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
5023 {
5024 	struct shmem_options *ctx = fc->fs_private;
5025 	struct inode *inode;
5026 	struct shmem_sb_info *sbinfo;
5027 	int error = -ENOMEM;
5028 
5029 	/* Round up to L1_CACHE_BYTES to resist false sharing */
5030 	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
5031 				L1_CACHE_BYTES), GFP_KERNEL);
5032 	if (!sbinfo)
5033 		return error;
5034 
5035 	sb->s_fs_info = sbinfo;
5036 
5037 #ifdef CONFIG_TMPFS
5038 	/*
5039 	 * By default we only allow half of the physical RAM per
5040 	 * tmpfs instance, limiting inodes to one per page of lowmem;
5041 	 * but the internal instance is left unlimited.
5042 	 */
5043 	if (!(sb->s_flags & SB_KERNMOUNT)) {
5044 		if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
5045 			ctx->blocks = shmem_default_max_blocks();
5046 		if (!(ctx->seen & SHMEM_SEEN_INODES))
5047 			ctx->inodes = shmem_default_max_inodes();
5048 		if (!(ctx->seen & SHMEM_SEEN_INUMS))
5049 			ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
5050 		sbinfo->noswap = ctx->noswap;
5051 	} else {
5052 		sb->s_flags |= SB_NOUSER;
5053 	}
5054 	sb->s_export_op = &shmem_export_ops;
5055 	sb->s_flags |= SB_NOSEC | SB_I_VERSION;
5056 
5057 #if IS_ENABLED(CONFIG_UNICODE)
5058 	if (!ctx->encoding && ctx->strict_encoding) {
5059 		pr_err("tmpfs: strict_encoding option without encoding is forbidden\n");
5060 		error = -EINVAL;
5061 		goto failed;
5062 	}
5063 
5064 	if (ctx->encoding) {
5065 		sb->s_encoding = ctx->encoding;
5066 		set_default_d_op(sb, &shmem_ci_dentry_ops);
5067 		if (ctx->strict_encoding)
5068 			sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
5069 	}
5070 #endif
5071 
5072 #else
5073 	sb->s_flags |= SB_NOUSER;
5074 #endif /* CONFIG_TMPFS */
5075 	sb->s_d_flags |= DCACHE_DONTCACHE;
5076 	sbinfo->max_blocks = ctx->blocks;
5077 	sbinfo->max_inodes = ctx->inodes;
5078 	sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
5079 	if (sb->s_flags & SB_KERNMOUNT) {
5080 		sbinfo->ino_batch = alloc_percpu(ino_t);
5081 		if (!sbinfo->ino_batch)
5082 			goto failed;
5083 	}
5084 	sbinfo->uid = ctx->uid;
5085 	sbinfo->gid = ctx->gid;
5086 	sbinfo->full_inums = ctx->full_inums;
5087 	sbinfo->mode = ctx->mode;
5088 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5089 	if (ctx->seen & SHMEM_SEEN_HUGE)
5090 		sbinfo->huge = ctx->huge;
5091 	else
5092 		sbinfo->huge = tmpfs_huge;
5093 #endif
5094 	sbinfo->mpol = ctx->mpol;
5095 	ctx->mpol = NULL;
5096 
5097 	raw_spin_lock_init(&sbinfo->stat_lock);
5098 	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
5099 		goto failed;
5100 	spin_lock_init(&sbinfo->shrinklist_lock);
5101 	INIT_LIST_HEAD(&sbinfo->shrinklist);
5102 
5103 	sb->s_maxbytes = MAX_LFS_FILESIZE;
5104 	sb->s_blocksize = PAGE_SIZE;
5105 	sb->s_blocksize_bits = PAGE_SHIFT;
5106 	sb->s_magic = TMPFS_MAGIC;
5107 	sb->s_op = &shmem_ops;
5108 	sb->s_time_gran = 1;
5109 #ifdef CONFIG_TMPFS_XATTR
5110 	sb->s_xattr = shmem_xattr_handlers;
5111 #endif
5112 #ifdef CONFIG_TMPFS_POSIX_ACL
5113 	sb->s_flags |= SB_POSIXACL;
5114 #endif
5115 	uuid_t uuid;
5116 	uuid_gen(&uuid);
5117 	super_set_uuid(sb, uuid.b, sizeof(uuid));
5118 
5119 #ifdef CONFIG_TMPFS_QUOTA
5120 	if (ctx->seen & SHMEM_SEEN_QUOTA) {
5121 		sb->dq_op = &shmem_quota_operations;
5122 		sb->s_qcop = &dquot_quotactl_sysfile_ops;
5123 		sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
5124 
5125 		/* Copy the default limits from ctx into sbinfo */
5126 		memcpy(&sbinfo->qlimits, &ctx->qlimits,
5127 		       sizeof(struct shmem_quota_limits));
5128 
5129 		if (shmem_enable_quotas(sb, ctx->quota_types))
5130 			goto failed;
5131 	}
5132 #endif /* CONFIG_TMPFS_QUOTA */
5133 
5134 	inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
5135 				S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
5136 	if (IS_ERR(inode)) {
5137 		error = PTR_ERR(inode);
5138 		goto failed;
5139 	}
5140 	inode->i_uid = sbinfo->uid;
5141 	inode->i_gid = sbinfo->gid;
5142 	sb->s_root = d_make_root(inode);
5143 	if (!sb->s_root)
5144 		goto failed;
5145 	return 0;
5146 
5147 failed:
5148 	shmem_put_super(sb);
5149 	return error;
5150 }
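
/*
 * Usage sketch (mount point, size and mode are arbitrary): the limits set
 * up above correspond to the usual tmpfs mount options, e.g.
 *
 *	mount -t tmpfs -o size=1G,nr_inodes=100k,mode=1777,huge=within_size tmpfs /mnt/scratch
 *
 * Options that are not given fall back to the defaults chosen here: half
 * of physical RAM for blocks, one inode per page of lowmem, and no limits
 * at all for the kernel-internal (SB_KERNMOUNT) instance.
 */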
5151 
5152 static int shmem_get_tree(struct fs_context *fc)
5153 {
5154 	return get_tree_nodev(fc, shmem_fill_super);
5155 }
5156 
5157 static void shmem_free_fc(struct fs_context *fc)
5158 {
5159 	struct shmem_options *ctx = fc->fs_private;
5160 
5161 	if (ctx) {
5162 		mpol_put(ctx->mpol);
5163 		kfree(ctx);
5164 	}
5165 }
5166 
5167 static const struct fs_context_operations shmem_fs_context_ops = {
5168 	.free			= shmem_free_fc,
5169 	.get_tree		= shmem_get_tree,
5170 #ifdef CONFIG_TMPFS
5171 	.parse_monolithic	= shmem_parse_monolithic,
5172 	.parse_param		= shmem_parse_one,
5173 	.reconfigure		= shmem_reconfigure,
5174 #endif
5175 };
5176 
5177 static struct kmem_cache *shmem_inode_cachep __ro_after_init;
5178 
5179 static struct inode *shmem_alloc_inode(struct super_block *sb)
5180 {
5181 	struct shmem_inode_info *info;
5182 	info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
5183 	if (!info)
5184 		return NULL;
5185 	return &info->vfs_inode;
5186 }
5187 
5188 static void shmem_free_in_core_inode(struct inode *inode)
5189 {
5190 	if (S_ISLNK(inode->i_mode))
5191 		kfree(inode->i_link);
5192 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
5193 }
5194 
5195 static void shmem_destroy_inode(struct inode *inode)
5196 {
5197 	if (S_ISREG(inode->i_mode))
5198 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
5199 	if (S_ISDIR(inode->i_mode))
5200 		simple_offset_destroy(shmem_get_offset_ctx(inode));
5201 }
5202 
5203 static void shmem_init_inode(void *foo)
5204 {
5205 	struct shmem_inode_info *info = foo;
5206 	inode_init_once(&info->vfs_inode);
5207 }
5208 
5209 static void __init shmem_init_inodecache(void)
5210 {
5211 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
5212 				sizeof(struct shmem_inode_info),
5213 				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
5214 }
5215 
5216 static void __init shmem_destroy_inodecache(void)
5217 {
5218 	kmem_cache_destroy(shmem_inode_cachep);
5219 }
5220 
5221 /* Keep the page in page cache instead of truncating it */
5222 static int shmem_error_remove_folio(struct address_space *mapping,
5223 				   struct folio *folio)
5224 {
5225 	return 0;
5226 }
5227 
5228 static const struct address_space_operations shmem_aops = {
5229 	.dirty_folio	= noop_dirty_folio,
5230 #ifdef CONFIG_TMPFS
5231 	.write_begin	= shmem_write_begin,
5232 	.write_end	= shmem_write_end,
5233 #endif
5234 #ifdef CONFIG_MIGRATION
5235 	.migrate_folio	= migrate_folio,
5236 #endif
5237 	.error_remove_folio = shmem_error_remove_folio,
5238 };
5239 
5240 static const struct file_operations shmem_file_operations = {
5241 	.mmap		= shmem_mmap,
5242 	.open		= shmem_file_open,
5243 	.get_unmapped_area = shmem_get_unmapped_area,
5244 #ifdef CONFIG_TMPFS
5245 	.llseek		= shmem_file_llseek,
5246 	.read_iter	= shmem_file_read_iter,
5247 	.write_iter	= shmem_file_write_iter,
5248 	.fsync		= noop_fsync,
5249 	.splice_read	= shmem_file_splice_read,
5250 	.splice_write	= iter_file_splice_write,
5251 	.fallocate	= shmem_fallocate,
5252 #endif
5253 };
5254 
5255 static const struct inode_operations shmem_inode_operations = {
5256 	.getattr	= shmem_getattr,
5257 	.setattr	= shmem_setattr,
5258 #ifdef CONFIG_TMPFS_XATTR
5259 	.listxattr	= shmem_listxattr,
5260 	.set_acl	= simple_set_acl,
5261 	.fileattr_get	= shmem_fileattr_get,
5262 	.fileattr_set	= shmem_fileattr_set,
5263 #endif
5264 };
5265 
5266 static const struct inode_operations shmem_dir_inode_operations = {
5267 #ifdef CONFIG_TMPFS
5268 	.getattr	= shmem_getattr,
5269 	.create		= shmem_create,
5270 	.lookup		= simple_lookup,
5271 	.link		= shmem_link,
5272 	.unlink		= shmem_unlink,
5273 	.symlink	= shmem_symlink,
5274 	.mkdir		= shmem_mkdir,
5275 	.rmdir		= shmem_rmdir,
5276 	.mknod		= shmem_mknod,
5277 	.rename		= shmem_rename2,
5278 	.tmpfile	= shmem_tmpfile,
5279 	.get_offset_ctx	= shmem_get_offset_ctx,
5280 #endif
5281 #ifdef CONFIG_TMPFS_XATTR
5282 	.listxattr	= shmem_listxattr,
5283 	.fileattr_get	= shmem_fileattr_get,
5284 	.fileattr_set	= shmem_fileattr_set,
5285 #endif
5286 #ifdef CONFIG_TMPFS_POSIX_ACL
5287 	.setattr	= shmem_setattr,
5288 	.set_acl	= simple_set_acl,
5289 #endif
5290 };
5291 
5292 static const struct inode_operations shmem_special_inode_operations = {
5293 	.getattr	= shmem_getattr,
5294 #ifdef CONFIG_TMPFS_XATTR
5295 	.listxattr	= shmem_listxattr,
5296 #endif
5297 #ifdef CONFIG_TMPFS_POSIX_ACL
5298 	.setattr	= shmem_setattr,
5299 	.set_acl	= simple_set_acl,
5300 #endif
5301 };
5302 
5303 static const struct super_operations shmem_ops = {
5304 	.alloc_inode	= shmem_alloc_inode,
5305 	.free_inode	= shmem_free_in_core_inode,
5306 	.destroy_inode	= shmem_destroy_inode,
5307 #ifdef CONFIG_TMPFS
5308 	.statfs		= shmem_statfs,
5309 	.show_options	= shmem_show_options,
5310 #endif
5311 #ifdef CONFIG_TMPFS_QUOTA
5312 	.get_dquots	= shmem_get_dquots,
5313 #endif
5314 	.evict_inode	= shmem_evict_inode,
5315 	.drop_inode	= generic_delete_inode,
5316 	.put_super	= shmem_put_super,
5317 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5318 	.nr_cached_objects	= shmem_unused_huge_count,
5319 	.free_cached_objects	= shmem_unused_huge_scan,
5320 #endif
5321 };
5322 
5323 static const struct vm_operations_struct shmem_vm_ops = {
5324 	.fault		= shmem_fault,
5325 	.map_pages	= filemap_map_pages,
5326 #ifdef CONFIG_NUMA
5327 	.set_policy     = shmem_set_policy,
5328 	.get_policy     = shmem_get_policy,
5329 #endif
5330 };
5331 
5332 static const struct vm_operations_struct shmem_anon_vm_ops = {
5333 	.fault		= shmem_fault,
5334 	.map_pages	= filemap_map_pages,
5335 #ifdef CONFIG_NUMA
5336 	.set_policy     = shmem_set_policy,
5337 	.get_policy     = shmem_get_policy,
5338 #endif
5339 };
5340 
5341 int shmem_init_fs_context(struct fs_context *fc)
5342 {
5343 	struct shmem_options *ctx;
5344 
5345 	ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
5346 	if (!ctx)
5347 		return -ENOMEM;
5348 
5349 	ctx->mode = 0777 | S_ISVTX;
5350 	ctx->uid = current_fsuid();
5351 	ctx->gid = current_fsgid();
5352 
5353 #if IS_ENABLED(CONFIG_UNICODE)
5354 	ctx->encoding = NULL;
5355 #endif
5356 
5357 	fc->fs_private = ctx;
5358 	fc->ops = &shmem_fs_context_ops;
5359 	return 0;
5360 }
5361 
5362 static struct file_system_type shmem_fs_type = {
5363 	.owner		= THIS_MODULE,
5364 	.name		= "tmpfs",
5365 	.init_fs_context = shmem_init_fs_context,
5366 #ifdef CONFIG_TMPFS
5367 	.parameters	= shmem_fs_parameters,
5368 #endif
5369 	.kill_sb	= kill_litter_super,
5370 	.fs_flags	= FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
5371 };
5372 
5373 #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
5374 
5375 #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store)			\
5376 {									\
5377 	.attr	= { .name = __stringify(_name), .mode = _mode },	\
5378 	.show	= _show,						\
5379 	.store	= _store,						\
5380 }
5381 
5382 #define TMPFS_ATTR_W(_name, _store)				\
5383 	static struct kobj_attribute tmpfs_attr_##_name =	\
5384 			__INIT_KOBJ_ATTR(_name, 0200, NULL, _store)
5385 
5386 #define TMPFS_ATTR_RW(_name, _show, _store)			\
5387 	static struct kobj_attribute tmpfs_attr_##_name =	\
5388 			__INIT_KOBJ_ATTR(_name, 0644, _show, _store)
5389 
5390 #define TMPFS_ATTR_RO(_name, _show)				\
5391 	static struct kobj_attribute tmpfs_attr_##_name =	\
5392 			__INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
5393 
5394 #if IS_ENABLED(CONFIG_UNICODE)
5395 static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a,
5396 			char *buf)
5397 {
5398 	return sysfs_emit(buf, "supported\n");
5399 }
5400 TMPFS_ATTR_RO(casefold, casefold_show);
5401 #endif
5402 
5403 static struct attribute *tmpfs_attributes[] = {
5404 #if IS_ENABLED(CONFIG_UNICODE)
5405 	&tmpfs_attr_casefold.attr,
5406 #endif
5407 	NULL
5408 };
5409 
5410 static const struct attribute_group tmpfs_attribute_group = {
5411 	.attrs = tmpfs_attributes,
5412 	.name = "features"
5413 };
5414 
5415 static struct kobject *tmpfs_kobj;
5416 
5417 static int __init tmpfs_sysfs_init(void)
5418 {
5419 	int ret;
5420 
5421 	tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj);
5422 	if (!tmpfs_kobj)
5423 		return -ENOMEM;
5424 
5425 	ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group);
5426 	if (ret)
5427 		kobject_put(tmpfs_kobj);
5428 
5429 	return ret;
5430 }
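
/*
 * The attribute group registered above is visible under /sys/fs/tmpfs;
 * with CONFIG_UNICODE enabled, for example:
 *
 *	$ cat /sys/fs/tmpfs/features/casefold
 *	supported
 */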
5431 #endif /* CONFIG_SYSFS && CONFIG_TMPFS */
5432 
5433 void __init shmem_init(void)
5434 {
5435 	int error;
5436 
5437 	shmem_init_inodecache();
5438 
5439 #ifdef CONFIG_TMPFS_QUOTA
5440 	register_quota_format(&shmem_quota_format);
5441 #endif
5442 
5443 	error = register_filesystem(&shmem_fs_type);
5444 	if (error) {
5445 		pr_err("Could not register tmpfs\n");
5446 		goto out2;
5447 	}
5448 
5449 	shm_mnt = kern_mount(&shmem_fs_type);
5450 	if (IS_ERR(shm_mnt)) {
5451 		error = PTR_ERR(shm_mnt);
5452 		pr_err("Could not kern_mount tmpfs\n");
5453 		goto out1;
5454 	}
5455 
5456 #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
5457 	error = tmpfs_sysfs_init();
5458 	if (error) {
5459 		pr_err("Could not init tmpfs sysfs\n");
5460 		goto out1;
5461 	}
5462 #endif
5463 
5464 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5465 	if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
5466 		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
5467 	else
5468 		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
5469 
5470 	/*
5471 	 * Default to setting PMD-sized THP to inherit the global setting and
5472 	 * disable all other multi-size THPs.
5473 	 */
5474 	if (!shmem_orders_configured)
5475 		huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
5476 #endif
5477 	return;
5478 
5479 out1:
5480 	unregister_filesystem(&shmem_fs_type);
5481 out2:
5482 #ifdef CONFIG_TMPFS_QUOTA
5483 	unregister_quota_format(&shmem_quota_format);
5484 #endif
5485 	shmem_destroy_inodecache();
5486 	shm_mnt = ERR_PTR(error);
5487 }
5488 
5489 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
5490 static ssize_t shmem_enabled_show(struct kobject *kobj,
5491 				  struct kobj_attribute *attr, char *buf)
5492 {
5493 	static const int values[] = {
5494 		SHMEM_HUGE_ALWAYS,
5495 		SHMEM_HUGE_WITHIN_SIZE,
5496 		SHMEM_HUGE_ADVISE,
5497 		SHMEM_HUGE_NEVER,
5498 		SHMEM_HUGE_DENY,
5499 		SHMEM_HUGE_FORCE,
5500 	};
5501 	int len = 0;
5502 	int i;
5503 
5504 	for (i = 0; i < ARRAY_SIZE(values); i++) {
5505 		len += sysfs_emit_at(buf, len,
5506 				shmem_huge == values[i] ? "%s[%s]" : "%s%s",
5507 				i ? " " : "", shmem_format_huge(values[i]));
5508 	}
5509 	len += sysfs_emit_at(buf, len, "\n");
5510 
5511 	return len;
5512 }
5513 
5514 static ssize_t shmem_enabled_store(struct kobject *kobj,
5515 		struct kobj_attribute *attr, const char *buf, size_t count)
5516 {
5517 	char tmp[16];
5518 	int huge, err;
5519 
5520 	if (count + 1 > sizeof(tmp))
5521 		return -EINVAL;
5522 	memcpy(tmp, buf, count);
5523 	tmp[count] = '\0';
5524 	if (count && tmp[count - 1] == '\n')
5525 		tmp[count - 1] = '\0';
5526 
5527 	huge = shmem_parse_huge(tmp);
5528 	if (huge == -EINVAL)
5529 		return huge;
5530 
5531 	shmem_huge = huge;
5532 	if (shmem_huge > SHMEM_HUGE_DENY)
5533 		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
5534 
5535 	err = start_stop_khugepaged();
5536 	return err ? err : count;
5537 }
5538 
5539 struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
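
/*
 * shmem_enabled_attr is hooked into the THP sysfs group, so the knob above
 * appears as /sys/kernel/mm/transparent_hugepage/shmem_enabled.  A usage
 * sketch (the value written is just an example):
 *
 *	$ cat /sys/kernel/mm/transparent_hugepage/shmem_enabled
 *	always within_size advise [never] deny force
 *	$ echo within_size > /sys/kernel/mm/transparent_hugepage/shmem_enabled
 */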
5540 static DEFINE_SPINLOCK(huge_shmem_orders_lock);
5541 
5542 static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
5543 					  struct kobj_attribute *attr, char *buf)
5544 {
5545 	int order = to_thpsize(kobj)->order;
5546 	const char *output;
5547 
5548 	if (test_bit(order, &huge_shmem_orders_always))
5549 		output = "[always] inherit within_size advise never";
5550 	else if (test_bit(order, &huge_shmem_orders_inherit))
5551 		output = "always [inherit] within_size advise never";
5552 	else if (test_bit(order, &huge_shmem_orders_within_size))
5553 		output = "always inherit [within_size] advise never";
5554 	else if (test_bit(order, &huge_shmem_orders_madvise))
5555 		output = "always inherit within_size [advise] never";
5556 	else
5557 		output = "always inherit within_size advise [never]";
5558 
5559 	return sysfs_emit(buf, "%s\n", output);
5560 }
5561 
5562 static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
5563 					   struct kobj_attribute *attr,
5564 					   const char *buf, size_t count)
5565 {
5566 	int order = to_thpsize(kobj)->order;
5567 	ssize_t ret = count;
5568 
5569 	if (sysfs_streq(buf, "always")) {
5570 		spin_lock(&huge_shmem_orders_lock);
5571 		clear_bit(order, &huge_shmem_orders_inherit);
5572 		clear_bit(order, &huge_shmem_orders_madvise);
5573 		clear_bit(order, &huge_shmem_orders_within_size);
5574 		set_bit(order, &huge_shmem_orders_always);
5575 		spin_unlock(&huge_shmem_orders_lock);
5576 	} else if (sysfs_streq(buf, "inherit")) {
5577 		/* Do not override huge allocation policy with non-PMD sized mTHP */
5578 		if (shmem_huge == SHMEM_HUGE_FORCE &&
5579 		    order != HPAGE_PMD_ORDER)
5580 			return -EINVAL;
5581 
5582 		spin_lock(&huge_shmem_orders_lock);
5583 		clear_bit(order, &huge_shmem_orders_always);
5584 		clear_bit(order, &huge_shmem_orders_madvise);
5585 		clear_bit(order, &huge_shmem_orders_within_size);
5586 		set_bit(order, &huge_shmem_orders_inherit);
5587 		spin_unlock(&huge_shmem_orders_lock);
5588 	} else if (sysfs_streq(buf, "within_size")) {
5589 		spin_lock(&huge_shmem_orders_lock);
5590 		clear_bit(order, &huge_shmem_orders_always);
5591 		clear_bit(order, &huge_shmem_orders_inherit);
5592 		clear_bit(order, &huge_shmem_orders_madvise);
5593 		set_bit(order, &huge_shmem_orders_within_size);
5594 		spin_unlock(&huge_shmem_orders_lock);
5595 	} else if (sysfs_streq(buf, "advise")) {
5596 		spin_lock(&huge_shmem_orders_lock);
5597 		clear_bit(order, &huge_shmem_orders_always);
5598 		clear_bit(order, &huge_shmem_orders_inherit);
5599 		clear_bit(order, &huge_shmem_orders_within_size);
5600 		set_bit(order, &huge_shmem_orders_madvise);
5601 		spin_unlock(&huge_shmem_orders_lock);
5602 	} else if (sysfs_streq(buf, "never")) {
5603 		spin_lock(&huge_shmem_orders_lock);
5604 		clear_bit(order, &huge_shmem_orders_always);
5605 		clear_bit(order, &huge_shmem_orders_inherit);
5606 		clear_bit(order, &huge_shmem_orders_within_size);
5607 		clear_bit(order, &huge_shmem_orders_madvise);
5608 		spin_unlock(&huge_shmem_orders_lock);
5609 	} else {
5610 		ret = -EINVAL;
5611 	}
5612 
5613 	if (ret > 0) {
5614 		int err = start_stop_khugepaged();
5615 
5616 		if (err)
5617 			ret = err;
5618 	}
5619 	return ret;
5620 }
5621 
5622 struct kobj_attribute thpsize_shmem_enabled_attr =
5623 	__ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
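
/*
 * Each supported mTHP size gets its own copy of this knob, e.g. (the size
 * directories depend on the architecture; the values below are only an
 * illustration):
 *
 *	$ cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled
 *	always [inherit] within_size advise never
 *	$ echo advise > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled
 */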
5624 #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
5625 
5626 #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
5627 
5628 static int __init setup_transparent_hugepage_shmem(char *str)
5629 {
5630 	int huge;
5631 
5632 	huge = shmem_parse_huge(str);
5633 	if (huge == -EINVAL) {
5634 		pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n");
5635 		return huge;
5636 	}
5637 
5638 	shmem_huge = huge;
5639 	return 1;
5640 }
5641 __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
5642 
5643 static int __init setup_transparent_hugepage_tmpfs(char *str)
5644 {
5645 	int huge;
5646 
5647 	huge = shmem_parse_huge(str);
5648 	if (huge < 0) {
5649 		pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
5650 		return huge;
5651 	}
5652 
5653 	tmpfs_huge = huge;
5654 	return 1;
5655 }
5656 __setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);
5657 
5658 static char str_dup[PAGE_SIZE] __initdata;
5659 static int __init setup_thp_shmem(char *str)
5660 {
5661 	char *token, *range, *policy, *subtoken;
5662 	unsigned long always, inherit, madvise, within_size;
5663 	char *start_size, *end_size;
5664 	int start, end, nr;
5665 	char *p;
5666 
5667 	if (!str || strlen(str) + 1 > PAGE_SIZE)
5668 		goto err;
5669 	strscpy(str_dup, str);
5670 
5671 	always = huge_shmem_orders_always;
5672 	inherit = huge_shmem_orders_inherit;
5673 	madvise = huge_shmem_orders_madvise;
5674 	within_size = huge_shmem_orders_within_size;
5675 	p = str_dup;
5676 	while ((token = strsep(&p, ";")) != NULL) {
5677 		range = strsep(&token, ":");
5678 		policy = token;
5679 
5680 		if (!policy)
5681 			goto err;
5682 
5683 		while ((subtoken = strsep(&range, ",")) != NULL) {
5684 			if (strchr(subtoken, '-')) {
5685 				start_size = strsep(&subtoken, "-");
5686 				end_size = subtoken;
5687 
5688 				start = get_order_from_str(start_size,
5689 							   THP_ORDERS_ALL_FILE_DEFAULT);
5690 				end = get_order_from_str(end_size,
5691 							 THP_ORDERS_ALL_FILE_DEFAULT);
5692 			} else {
5693 				start_size = end_size = subtoken;
5694 				start = end = get_order_from_str(subtoken,
5695 								 THP_ORDERS_ALL_FILE_DEFAULT);
5696 			}
5697 
5698 			if (start < 0) {
5699 				pr_err("invalid size %s in thp_shmem boot parameter\n",
5700 				       start_size);
5701 				goto err;
5702 			}
5703 
5704 			if (end < 0) {
5705 				pr_err("invalid size %s in thp_shmem boot parameter\n",
5706 				       end_size);
5707 				goto err;
5708 			}
5709 
5710 			if (start > end)
5711 				goto err;
5712 
5713 			nr = end - start + 1;
5714 			if (!strcmp(policy, "always")) {
5715 				bitmap_set(&always, start, nr);
5716 				bitmap_clear(&inherit, start, nr);
5717 				bitmap_clear(&madvise, start, nr);
5718 				bitmap_clear(&within_size, start, nr);
5719 			} else if (!strcmp(policy, "advise")) {
5720 				bitmap_set(&madvise, start, nr);
5721 				bitmap_clear(&inherit, start, nr);
5722 				bitmap_clear(&always, start, nr);
5723 				bitmap_clear(&within_size, start, nr);
5724 			} else if (!strcmp(policy, "inherit")) {
5725 				bitmap_set(&inherit, start, nr);
5726 				bitmap_clear(&madvise, start, nr);
5727 				bitmap_clear(&always, start, nr);
5728 				bitmap_clear(&within_size, start, nr);
5729 			} else if (!strcmp(policy, "within_size")) {
5730 				bitmap_set(&within_size, start, nr);
5731 				bitmap_clear(&inherit, start, nr);
5732 				bitmap_clear(&madvise, start, nr);
5733 				bitmap_clear(&always, start, nr);
5734 			} else if (!strcmp(policy, "never")) {
5735 				bitmap_clear(&inherit, start, nr);
5736 				bitmap_clear(&madvise, start, nr);
5737 				bitmap_clear(&always, start, nr);
5738 				bitmap_clear(&within_size, start, nr);
5739 			} else {
5740 				pr_err("invalid policy %s in thp_shmem boot parameter\n", policy);
5741 				goto err;
5742 			}
5743 		}
5744 	}
5745 
5746 	huge_shmem_orders_always = always;
5747 	huge_shmem_orders_madvise = madvise;
5748 	huge_shmem_orders_inherit = inherit;
5749 	huge_shmem_orders_within_size = within_size;
5750 	shmem_orders_configured = true;
5751 	return 1;
5752 
5753 err:
5754 	pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
5755 	return 0;
5756 }
5757 __setup("thp_shmem=", setup_thp_shmem);
5758 
5759 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
5760 
5761 #else /* !CONFIG_SHMEM */
5762 
5763 /*
5764  * tiny-shmem: simple shmemfs and tmpfs using ramfs code
5765  *
5766  * This is intended for small systems where the benefits of the full
5767  * shmem code (swap-backed and resource-limited) are outweighed by
5768  * its complexity. On systems without swap this code should be
5769  * effectively equivalent, but much lighter weight.
5770  */
5771 
5772 static struct file_system_type shmem_fs_type = {
5773 	.name		= "tmpfs",
5774 	.init_fs_context = ramfs_init_fs_context,
5775 	.parameters	= ramfs_fs_parameters,
5776 	.kill_sb	= ramfs_kill_sb,
5777 	.fs_flags	= FS_USERNS_MOUNT,
5778 };
5779 
5780 void __init shmem_init(void)
5781 {
5782 	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
5783 
5784 	shm_mnt = kern_mount(&shmem_fs_type);
5785 	BUG_ON(IS_ERR(shm_mnt));
5786 }
5787 
5788 int shmem_unuse(unsigned int type)
5789 {
5790 	return 0;
5791 }
5792 
5793 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
5794 {
5795 	return 0;
5796 }
5797 
5798 void shmem_unlock_mapping(struct address_space *mapping)
5799 {
5800 }
5801 
5802 #ifdef CONFIG_MMU
5803 unsigned long shmem_get_unmapped_area(struct file *file,
5804 				      unsigned long addr, unsigned long len,
5805 				      unsigned long pgoff, unsigned long flags)
5806 {
5807 	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
5808 }
5809 #endif
5810 
5811 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
5812 {
5813 	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
5814 }
5815 EXPORT_SYMBOL_GPL(shmem_truncate_range);
5816 
5817 #define shmem_vm_ops				generic_file_vm_ops
5818 #define shmem_anon_vm_ops			generic_file_vm_ops
5819 #define shmem_file_operations			ramfs_file_operations
5820 #define shmem_acct_size(flags, size)		0
5821 #define shmem_unacct_size(flags, size)		do {} while (0)
5822 
5823 static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
5824 				struct super_block *sb, struct inode *dir,
5825 				umode_t mode, dev_t dev, unsigned long flags)
5826 {
5827 	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
5828 	return inode ? inode : ERR_PTR(-ENOSPC);
5829 }
5830 
5831 #endif /* CONFIG_SHMEM */
5832 
5833 /* common code */
5834 
5835 static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
5836 			loff_t size, unsigned long flags, unsigned int i_flags)
5837 {
5838 	struct inode *inode;
5839 	struct file *res;
5840 
5841 	if (IS_ERR(mnt))
5842 		return ERR_CAST(mnt);
5843 
5844 	if (size < 0 || size > MAX_LFS_FILESIZE)
5845 		return ERR_PTR(-EINVAL);
5846 
5847 	if (is_idmapped_mnt(mnt))
5848 		return ERR_PTR(-EINVAL);
5849 
5850 	if (shmem_acct_size(flags, size))
5851 		return ERR_PTR(-ENOMEM);
5852 
5853 	inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
5854 				S_IFREG | S_IRWXUGO, 0, flags);
5855 	if (IS_ERR(inode)) {
5856 		shmem_unacct_size(flags, size);
5857 		return ERR_CAST(inode);
5858 	}
5859 	inode->i_flags |= i_flags;
5860 	inode->i_size = size;
5861 	clear_nlink(inode);	/* It is unlinked */
5862 	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
5863 	if (!IS_ERR(res))
5864 		res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
5865 				&shmem_file_operations);
5866 	if (IS_ERR(res))
5867 		iput(inode);
5868 	return res;
5869 }
5870 
5871 /**
5872  * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
5873  * 	kernel internal.  There will be NO LSM permission checks against the
5874  * 	underlying inode.  So users of this interface must do LSM checks at a
5875  *	higher layer.  The users are the big_key and shm implementations.  LSM
5876  *	checks are provided at the key or shm level rather than the inode.
5877  * @name: name for dentry (to be seen in /proc/<pid>/maps)
5878  * @size: size to be set for the file
5879  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5880  */
5881 struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
5882 {
5883 	return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
5884 }
5885 EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
5886 
5887 /**
5888  * shmem_file_setup - get an unlinked file living in tmpfs
5889  * @name: name for dentry (to be seen in /proc/<pid>/maps)
5890  * @size: size to be set for the file
5891  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5892  */
5893 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
5894 {
5895 	return __shmem_file_setup(shm_mnt, name, size, flags, 0);
5896 }
5897 EXPORT_SYMBOL_GPL(shmem_file_setup);
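
/*
 * Usage sketch for the setup helpers above (the name, size and flags are
 * arbitrary): a kernel user creates an unlinked tmpfs file and drops it
 * with fput() when done:
 *
 *	struct file *filp;
 *
 *	filp = shmem_file_setup("my-buffer", SZ_4M, VM_NORESERVE);
 *	if (IS_ERR(filp))
 *		return PTR_ERR(filp);
 *	...
 *	fput(filp);
 */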
5898 
5899 /**
5900  * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
5901  * @mnt: the tmpfs mount where the file will be created
5902  * @name: name for dentry (to be seen in /proc/<pid>/maps)
5903  * @size: size to be set for the file
5904  * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
5905  */
5906 struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
5907 				       loff_t size, unsigned long flags)
5908 {
5909 	return __shmem_file_setup(mnt, name, size, flags, 0);
5910 }
5911 EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
5912 
5913 /**
5914  * shmem_zero_setup - setup a shared anonymous mapping
5915  * @vma: the vma to be mmapped is prepared by do_mmap
5916  */
5917 int shmem_zero_setup(struct vm_area_struct *vma)
5918 {
5919 	struct file *file;
5920 	loff_t size = vma->vm_end - vma->vm_start;
5921 
5922 	/*
5923 	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
5924 	 * between XFS directory reading and selinux: since this file is only
5925 	 * accessible to the user through its mapping, use S_PRIVATE flag to
5926 	 * bypass file security, in the same way as shmem_kernel_file_setup().
5927 	 */
5928 	file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
5929 	if (IS_ERR(file))
5930 		return PTR_ERR(file);
5931 
5932 	if (vma->vm_file)
5933 		fput(vma->vm_file);
5934 	vma->vm_file = file;
5935 	vma->vm_ops = &shmem_anon_vm_ops;
5936 
5937 	return 0;
5938 }
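
/*
 * shmem_zero_setup() is reached from the mmap() path for shared anonymous
 * mappings; from userspace the trigger looks roughly like (illustrative
 * only):
 *
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 */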
5939 
5940 /**
5941  * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
5942  * @mapping:	the folio's address_space
5943  * @index:	the folio index
5944  * @gfp:	the page allocator flags to use if allocating
5945  *
5946  * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
5947  * with any new page allocations done using the specified allocation flags.
5948  * But read_cache_page_gfp() uses the ->read_folio() method: which does not
5949  * suit tmpfs, since it may have pages in swapcache, and needs to find those
5950  * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
5951  *
5952  * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
5953  * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
5954  */
5955 struct folio *shmem_read_folio_gfp(struct address_space *mapping,
5956 		pgoff_t index, gfp_t gfp)
5957 {
5958 #ifdef CONFIG_SHMEM
5959 	struct inode *inode = mapping->host;
5960 	struct folio *folio;
5961 	int error;
5962 
5963 	error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
5964 				    gfp, NULL, NULL);
5965 	if (error)
5966 		return ERR_PTR(error);
5967 
5968 	folio_unlock(folio);
5969 	return folio;
5970 #else
5971 	/*
5972 	 * The tiny !SHMEM case uses ramfs without swap
5973 	 */
5974 	return mapping_read_folio_gfp(mapping, index, gfp);
5975 #endif
5976 }
5977 EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
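
/*
 * A driver-side sketch of the call described above ("mapping" and "index"
 * are assumed to refer to a shmem-backed address_space; the gfp handling
 * mirrors the i915 example in the comment):
 *
 *	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
 *	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
 *
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 */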
5978 
5979 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
5980 					 pgoff_t index, gfp_t gfp)
5981 {
5982 	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
5983 	struct page *page;
5984 
5985 	if (IS_ERR(folio))
5986 		return &folio->page;
5987 
5988 	page = folio_file_page(folio, index);
5989 	if (PageHWPoison(page)) {
5990 		folio_put(folio);
5991 		return ERR_PTR(-EIO);
5992 	}
5993 
5994 	return page;
5995 }
5996 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
5997